### Full code for the "Don't get lost in a forest" post on rDisorder.eu

# Install needed packages
trees_packages <- c("FFTrees",
                    "party",
                    "randomForest",
                    "intubate",
                    "dplyr",
                    "gbm")
install.packages(trees_packages)
# Get the data (folder "data" in this repo) and load it.
# I always avoid converting strings to factors on import
titanic <- read.csv("https://raw.githubusercontent.com/alanmarazzi/trees-forest/master/data/train.csv",
                    stringsAsFactors = FALSE,
                    na.strings = "")
# Take a look at the structure of the dataset
str(titanic)
# The first thing I like to do is convert all column names to lowercase
names(titanic) <- tolower(names(titanic))
# sex and embarked are actually factors, so let's convert them
titanic$sex <- as.factor(titanic$sex)
titanic$embarked <- as.factor(titanic$embarked)
# The age variable has missing values: about 19.9% of it is NA
mean(is.na(titanic$age))
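
# Optional check (not in the original post): share of NAs in every column,
# to confirm age is the main offender
sapply(titanic, function(x) mean(is.na(x)))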
# Impute the NAs in the age variable with predictions from a linear
# regression (note: survived is a predictor here, so this imputation can
# only be used on the training set)
age_prediction <- lm(age ~ survived + pclass + fare, data = titanic)
summary(age_prediction)

# Now substitute the NAs in the dataset with the model predictions
titanic$age[is.na(titanic$age)] <- predict(age_prediction,
                                           newdata = titanic[is.na(titanic$age), ])
# Check NAs in age
sum(is.na(titanic$age))
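
# Optional sanity check (not in the original post): a linear model can
# produce implausible imputed ages (even negative), so eyeball the range
summary(titanic$age)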
### Remove variables that clearly have nothing to do with our prediction
### setting and run a logistic regression as a benchmark
library(dplyr)
library(intubate)

# Instead of creating a temp variable, pipe the dataset into the model
logi <- titanic %>%
    select(survived, pclass, sex, age, sibsp) %>%
    ntbt_glm(survived ~ ., family = binomial)
# Always check statistics
summary(logi)
# Predict on the training set first (the test set comes right after)
logi_pred <- predict(logi, type = "response")
survivors_logi <- rep(0, nrow(titanic))
survivors_logi[logi_pred > .5] <- 1
# This is going to be our training benchmark
table(model = survivors_logi, real = titanic$survived)
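
# Optional (not in the original post): training accuracy of the benchmark,
# computed directly from the predictions above
mean(survivors_logi == titanic$survived)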
# Now on the test set for submission on Kaggle
test <- read.csv("https://raw.githubusercontent.com/alanmarazzi/trees-forest/master/data/test.csv",
                 stringsAsFactors = FALSE,
                 na.strings = "")

# Remember to apply the same transformations as for the training set
names(test) <- tolower(names(test))
test$sex <- as.factor(test$sex)

# Prediction of the test set and write results to csv for submission.
# The test set has missing ages, so predict() returns NA for those rows:
# index with which() to skip the NAs, leaving those passengers at the
# default 0 (not survived)
test_logi_pred <- predict(logi, test, type = "response")
surv_test_logi <- data.frame(PassengerId = test$passengerid,
                             Survived = rep(0, nrow(test)))
surv_test_logi$Survived[which(test_logi_pred > .5)] <- 1
table(surv_test_logi$Survived)
write.csv(surv_test_logi, "results/logi.csv", row.names = FALSE)
### Result is 0.77512
### Fast and Frugal Trees
# The FFTrees package ships its own titanic dataset, so keep a temporary
# copy of ours to avoid any masking confusion when loading the package
titanicc <- titanic
library(FFTrees)
titanic <- titanicc
rm(titanicc)

# Fast and Frugal Trees model building
fftitanic <- titanic %>%
    select(age, pclass, sex, sibsp, fare, survived) %>%
    ntbt(FFTrees, survived ~ .)
# Plotting of the best tree
plot(fftitanic,
     main = "Titanic",
     decision.names = c("Not Survived", "Survived"))
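
# Optional (not in the original post): printing the FFTrees object shows
# its accuracy statistics, and inwords() describes the best tree in plain
# words (available in recent FFTrees versions)
print(fftitanic)
inwords(fftitanic)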
# Build a simple classifier out of the best tree
ffpred <- ifelse(test$sex != "male", 1,
                 ifelse(test$pclass > 2, 0,
                        ifelse(test$fare < 26.96, 0,
                               ifelse(test$age >= 21.36, 0, 1))))

# The hand-rolled tree can't deal with NAs (missing age or fare in the
# test set yields NA from the nested ifelse), so assign 0 to them
ffpred[is.na(ffpred)] <- 0
ffpred <- data.frame(PassengerId = test$passengerid, Survived = ffpred)
write.csv(ffpred, "results/fftree.csv", row.names = FALSE)
### Result is 0.76555
### Inferential trees
library(party)

# ctree model building
partyTitanic <- titanic %>%
    select(age, pclass, sex, sibsp, fare, survived) %>%
    ntbt(ctree, as.factor(survived) ~ .)

# Plot the resulting tree
plot(partyTitanic, main = "Titanic prediction", type = "simple",
     inner_panel = node_inner(partyTitanic,
                              pval = FALSE),
     terminal_panel = node_terminal(partyTitanic,
                                    abbreviate = TRUE,
                                    digits = 1,
                                    fill = "white"))
# Prediction of training data
train_party <- Predict(partyTitanic)
table(tree = train_party, real = titanic$survived)
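
# Optional (not in the original post): training accuracy of the ctree,
# comparing the factor predictions with the numeric response as characters
mean(as.character(train_party) == as.character(titanic$survived))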
# Prediction of the test set
party_pred <- Predict(partyTitanic, newdata = test)
party_pred <- as.numeric(party_pred) - 1
party_pred <- data.frame(PassengerId = test$passengerid,
                         Survived = party_pred)
write.csv(party_pred, "results/party.csv", row.names = FALSE)
### Result is 0.73684
### Bagging
library(randomForest)

# If you want the same result remember to set the same seed
set.seed(123)

# Bagging is a random forest where every tree sees all predictors:
# with 6 predictors, mtry = 6 means no random feature selection
titanic_bag <- titanic %>%
    select(survived, age, pclass, sex, sibsp, fare, parch) %>%
    ntbt_randomForest(as.factor(survived) ~ ., mtry = 6)
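
# Optional (not in the original post): printing the fit reports the
# out-of-bag error estimate and a confusion matrix
print(titanic_bag)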
# Bagging and Random Forest don't deal with NAs at prediction time:
# impute the missing test ages with the median
test$age[is.na(test$age)] <- median(test$age, na.rm = TRUE)

# The usual test set prediction
bag_pred <- predict(titanic_bag, test)

# fare still has a missing value in the test set, so check for NAs in
# the prediction and substitute them
sum(is.na(bag_pred))
bag_pred[is.na(bag_pred)] <- 1
bag_pred <- data.frame(PassengerId = test$passengerid,
                       Survived = bag_pred,
                       row.names = 1:length(bag_pred))
write.csv(bag_pred, "results/bagging.csv", row.names = FALSE)
### Result is 0.66507
### RandomForest
set.seed(456)

# Random Forest model building: mtry = 3 samples 3 of the 6 predictors
# at each split
titanic_rf <- titanic %>%
    select(survived, age, pclass, sex, sibsp, fare, parch) %>%
    ntbt_randomForest(as.factor(survived) ~ ., mtry = 3, ntree = 5000)
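
# Optional (not in the original post): inspect variable importance,
# both as a table and as a plot
importance(titanic_rf)
varImpPlot(titanic_rf)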
# Prediction of the test set, again replacing the NA prediction
# (caused by the missing fare) with 1
rf_pred <- predict(titanic_rf, test)
rf_pred[is.na(rf_pred)] <- 1
rf_pred <- data.frame(PassengerId = test$passengerid, Survived = rf_pred)
write.csv(rf_pred, "results/rf.csv", row.names = FALSE)
### Result is 0.74641
### RandomForest with inferential trees
set.seed(415)

# Use the cforest function from the party package
titanic_rf_party <- titanic %>%
    select(survived, age, pclass, sex, sibsp, fare, parch) %>%
    ntbt(cforest, as.factor(survived) ~ .,
         controls = cforest_unbiased(ntree = 5000, mtry = 3))
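
# Optional (not in the original post): party's varimp() gives variable
# importance for cforest; left commented out because it can be very slow
# with 5000 trees
# varimp(titanic_rf_party)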
# Prediction of the test set
rf_party_pred <- predict(titanic_rf_party,
                         test,
                         OOB = TRUE,
                         type = "response")
rf_party_pred <- data.frame(PassengerId = test$passengerid,
                            Survived = rf_party_pred)
write.csv(rf_party_pred, "results/rf_party.csv", row.names = FALSE)
### Result is 0.77033
### Boosting
library(gbm)

# Set the seed for reproducibility
set.seed(999)

# Boosting model building: gbm's bernoulli distribution wants a numeric
# 0/1 response, so survived is left as-is (not converted to a factor)
titanic_boost <- titanic %>%
    select(survived, age, pclass, sex, sibsp, fare, parch) %>%
    ntbt(gbm, survived ~ .,
         distribution = "bernoulli",
         n.trees = 5000,
         interaction.depth = 3)
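
# Optional (not in the original post): relative influence of each
# predictor, and the OOB estimate of the optimal number of trees
summary(titanic_boost, n.trees = 5000)
gbm.perf(titanic_boost, method = "OOB")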
# Boosting prediction
boost_pred <- predict(titanic_boost, test, n.trees = 5000, type = "response")
test_boost <- rep(0, nrow(test))
test_boost[boost_pred >= .5] <- 1
test_boost <- data.frame(PassengerId = test$passengerid,
                         Survived = test_boost)
write.csv(test_boost, "results/test_boost.csv", row.names = FALSE)
### Result is 0.76077
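
### Recap of the public leaderboard scores reported above:
### logistic benchmark 0.77512 > cforest 0.77033 > FFTrees 0.76555 >
### boosting 0.76077 > random forest 0.74641 > ctree 0.73684 >
### bagging 0.66507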