/
exploration.R
86 lines (65 loc) · 2.57 KB
/
exploration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This script explores the DrivenData Blood Donation competition data set
# The data is available at http://www.drivendata.org/competitions/2/data/
# Loading the data
library('readr')
library('corrplot')
# The data files are located relative to the directory the script is run from
# (e.g. the script's own folder or an RStudio project root). There is
# deliberately no setwd() call: a hard-coded working directory aborts the
# script on every machine but the author's.
data_dir <- file.path("..", "data")
train_path <- file.path(data_dir, "ddblood_train.csv")
test_path <- file.path(data_dir, "ddblood_test.csv")

# Fail early, with the offending path(s) and the current working directory,
# instead of letting read_csv() report a bare "file does not exist".
data_paths <- c(train_path, test_path)
missing_paths <- data_paths[!file.exists(data_paths)]
if (length(missing_paths) > 0) {
  stop(
    "Data file(s) not found: ", paste(missing_paths, collapse = ", "),
    " (working directory: ", getwd(), ")",
    call. = FALSE
  )
}

train <- read_csv(train_path)
test <- read_csv(test_path)
# A first look at the train set.
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() only appends a stray "NULL".
str(train)

# Renaming columns
# The headers are replaced by position: the train file must have six columns
# (the last one becomes the target C) and the test file the first five.
raw_names <- c("Id", "Recency", "Frequency", "Monetary", "Time")
stopifnot(
  ncol(train) == length(raw_names) + 1L,
  ncol(test) == length(raw_names)
)
colnames(train) <- c(raw_names, "C")
colnames(test) <- raw_names

# Splitting the data
# y holds the target column C; train and test keep the same three predictors.
# Id and Monetary are not carried forward.
features <- c("Recency", "Frequency", "Time")
y <- train$C
train <- train[, features, drop = FALSE]
test <- test[, features, drop = FALSE]
str(train)
# Train / Test comparison
# Figure 1 shows side-by-side boxplots of each feature for the train set
# (light blue, shifted left) and the test set (bisque, shifted right).
# We can see that the train and test sets have very similar distributions.
# Some outliers (Recency > 30 or Frequency > 20) are clearly visible.
feature_labels <- c("Recency", "Frequency", "Time")
slots <- seq_along(feature_labels)

par(bty = "n")
# Both axes are suppressed on the first call; the x axis is added by axis()
# below (the y axis is presumably left to the second boxplot() call, which
# does not pass yaxt = "n").
boxplot(train,
        boxwex = 0.28, at = slots - 0.2,
        col = "light blue",
        main = "Blood Donation",
        xlab = "", ylab = "", cex.axis = 0.85, bty = "n",
        xaxt = "n", yaxt = "n")
boxplot(test, add = TRUE,
        boxwex = 0.28, at = slots + 0.2,
        col = "bisque", cex.axis = 0.85, xaxt = "n")
legend(1.5, 80, c("Train", "Test"),
       fill = c("light blue", "bisque"))
# One label per feature, centred between its two boxes, without tick marks.
# `line = 1` is spelled out on purpose: passing the side twice
# (axis(1, side = 1, ...)) makes R bind the unnamed 1 to `line`, so naming it
# keeps the labels where they have always been drawn while removing the
# duplicate argument.
axis(side = 1, at = slots, labels = feature_labels, tick = FALSE, line = 1)
# Check the outliers called out in the Figure 1 commentary:
# Recency > 30 and Frequency > 20. Each feature gets its own named cut-off so
# the two filters cannot silently share a threshold (the Frequency filter used
# the Recency value, 30, which disagreed with the documented 20).
recency_cutoff <- 30
frequency_cutoff <- 20

# Outliers for train$Recency: the rows, then their target values
index <- train$Recency > recency_cutoff
print(train[index, ])
print(y[index])

# Outliers for train$Frequency: the rows, then their target values
index <- train$Frequency > frequency_cutoff
print(train[index, ])
print(y[index])
# Donors vs non-donors comparison
# Same layout as the train/test figure, but the train set is split by the
# target: rows with y == 1 (light blue, shifted left) against all other rows
# (bisque, shifted right).
donors <- y == 1
feature_labels <- c("Recency", "Frequency", "Time")
slots <- seq_along(feature_labels)

par(bty = "n")
# NOTE(review): the two groups use different box widths (0.18 vs 0.38);
# confirm this is intentional.
boxplot(train[donors, ],
        boxwex = 0.18, at = slots - 0.2,
        col = "light blue",
        main = "Blood Donation - March Donation vs March No Donation",
        xlab = "", ylab = "", cex.axis = 0.85, bty = "n",
        xaxt = "n", yaxt = "n")
boxplot(train[!donors, ], add = TRUE,
        boxwex = 0.38, at = slots + 0.2,
        col = "bisque", cex.axis = 0.85, xaxt = "n")
legend(1, 95, c("Donation in March", "No Donation in March"),
       fill = c("light blue", "bisque"))
# One label per feature, without tick marks. `line = 1` is spelled out on
# purpose: passing the side twice (axis(1, side = 1, ...)) makes R bind the
# unnamed 1 to `line`, so naming it keeps the labels in the same place while
# removing the duplicate argument.
axis(side = 1, at = slots, labels = feature_labels, tick = FALSE, line = 1)
# Correlation
# Pairwise correlations between the feature columns kept in train, drawn with
# corrplot: ellipses below the diagonal, numeric coefficients above it.
feature_cor <- cor(train)
corrplot.mixed(feature_cor, lower = "ellipse", upper = "number")
# Multiple donations in the same month?
# Rows where Recency equals Time, counted by how many donations they record.
# The trailing comments are the dimensions noted when the script was written.
same_month <- train$Recency == train$Time

# more than once
index <- same_month & train$Frequency > 1
print(dim(train[index, ])) # 26, 3

# more than twice
index <- same_month & train$Frequency > 2
print(dim(train[index, ])) # 3, 3