-
Notifications
You must be signed in to change notification settings - Fork 0
/
drugs_analysis_code.R
80 lines (71 loc) · 4.51 KB
/
drugs_analysis_code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#Q1
Drugs <- read.csv("Drugs.txt", header = T, ) #load data from file "Drugs.txt" in dataframe "Drugs"
Drugs$FLAG_CODES <- NULL #remove the variable "FLAG_CODES"
#Q2
length(unique(Drugs$LOCATION)) #number of countries
t <- table(Drugs$LOCATION) #create a table with countries' acronyms and datapoints (years) per country
t <- t[order(t)] #sort the table in ascending order
#Q3
#the countries in top 25%, belongs to the 75% of the distribution based upon the number of datapoints that can be found in the table "t"
q <- quantile(t, p=0.75)
r <- row.names(as.table(which(t>= q))) #get the names of the countries stored in table "t" and are above the quantile
s <- subset(Drugs, Drugs$LOCATION %in% r) #show the available information of the selected countries
#Q4
x1 <- s$TIME #create a vector with the column "TIME" in "s" table
loc <- unique(s$LOCATION)
c <- colnames(s)
par(mfrow=c(5,1), mar = c(1,4,3,1), las = 1) #combine 5 plots in the same window (the 5th is the legend)
for (i in seq(3, length(c), by=1)) { #in every loop a plot that referred to a metric is created
plot(x1, s[,c[i]], xlab = "Years", ylab = c[i], main = paste("Drug spending in", c[i], sep = " "))
axis(1, at=c(min(x1), max(x1)), lab=c(min(x1), max(x1))) #define the x-axis limits
for (j in 1:length(loc)) { #in every loop each country takes a different color
lines(x1[s$LOCATION==loc[j]], s[,c[i]][s$LOCATION==loc[j]], col = j, type = "b", pch = 16)
}
}
#create the legend (as a plot)
par(mai=c(0,0,0,0))
plot.new()
legend("center", legend = loc[1:length(loc)], box.lty = 1, lty = 1, col = c(1:length(loc)), bg = "transparent", horiz = TRUE, cex = 0.75)
#Q5
bel <- Drugs[(Drugs$LOCATION == "BEL"),] #data frame "bel" contains all the data concerning the Belgium
now <- max(bel$TIME) + 1 #the year we are now
yprobs <- list() #initializes the list "yprobs"
for (i in seq(3, length(colnames(bel)), by=1)) { #loop to calculate probability starting the iterator from the column "PC_HEALTHXP"
d <- length(which(diff(bel[,colnames(bel)[i]]) > 0)) #calculate how many years each metric is higher than the year before
yprobs <- append(yprobs, (d/length(bel$LOCATION))) #append to the "yprobs" list the probability of drug expenditure increase for each metric of the wanted country
}
yprobs <- as.numeric(yprobs[1:4]) #convert the list into a numeric vector
names(yprobs) <- colnames(bel)[3:6] #name the variables on the vector "yprobs" as the columns of Belgium's data frame
fiveprobs <- dbinom(4, 5, yprobs[1:4]) #calculate the probability of increase in the next 5 years with the Binomial distribution [we use this distribution because the solution of the problem is based upon two outcomes (increase or decrease of drug expenditure)]
#create the requested list
belist <- list(bel, c(min(bel$TIME), max(bel$TIME)), length(unique(bel$TIME)), yprobs, fiveprobs)
names(belist) <- c("Data", "Years", "Data.points", "YearlyProbs", "FiveYeProbs")
#Q6
#create the requested function which is called "calc"
calc <- function(DATA = NULL, METRIC = "pc.gdp", nofY = 5){
if("FLAG_CODES" %in% colnames(DATA) | length(unique(DATA$LOCATION)) > 1 | is.null(DATA)){ #we take as constraint and the possibility that the data frame will be null, because in that case we can't move on
return()
}
if(METRIC == "pc.gdp") {
metric <- colnames(DATA)[3]
} else if (METRIC == "pc.tot"){
metric <- colnames(DATA)[4]
} else if (METRIC == "per.ca"){
metric <- colnames(DATA)[5]
} else if(METRIC == "total"){
metric <- colnames(DATA)[6]
} else {
return()
}
if (nofY != 5){
print(paste("The probability of increasing drug expenditure in at least", nofY-1, "out of the following", nofY, "years.", sep = " "))
}
#calculate the probability as we did in the previous question
p <- length(which(diff(DATA[,metric]) > 0))/length(DATA[,metric])
prob <- dbinom(nofY-1, nofY, p)
if (length(unique(DATA$TIME)) < 10){
return("Unable to calculate probability (n< 10)")
} else{
return(paste("Based on", length(unique(DATA$TIME)), "datapoints from years", min(DATA$TIME), "to", max(DATA$TIME), "the probability that", DATA$LOCATION[1], "will increase its drug expenditure, in terms of", metric, "in at least", nofY-1, "years in the period", max(DATA$TIME), "to", max(DATA$TIME)+1+nofY, "is", prob, ".", sep = " "))
}
}