Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
223 lines (177 sloc) 7.06 KB
### Full code for the "Don't get lost in a forest" post on rDisorder.eu
# Install needed packages
trees_packages <- c("FFTrees",
"party",
"randomForest",
"intubate",
"dplyr",
"gbm")
install.packages(trees_packages)
# Get the data (folder "data" in this repo) and load it.
# I always avoid to convert all strings to factors
titanic <- read.csv("https://raw.githubusercontent.com/alanmarazzi/trees-forest/master/data/train.csv",
stringsAsFactors = FALSE,
na.strings = "")
# Take a look at the structure of the dataset
str(titanic)
# The first thing I like to do is to convert all columns names to lowercase
names(titanic) <- tolower(names(titanic))
# sex and embarked are actually factors, so let's convert them
titanic$sex <- as.factor(titanic$sex)
titanic$embarked <- as.factor(titanic$embarked)
# The age variable has some missing values, 19.86%
mean(is.na(titanic$age))
# Deal with NAs in age variable by substituting them with a linear regression
age_prediction <- lm(age ~ survived + pclass + fare, data = titanic)
summary(age_prediction)
# Now substitute NAs in the dataset
titanic$age[is.na(titanic$age)] <- predict(age_prediction,
newdata = titanic[is.na(titanic$age),])
# Check NAs in age
sum(is.na(titanic$age))
### Remove variables that clearly have nothing to do with our prediction setting
### and run a logistic regression as a benchmark
library(dplyr)
library(intubate)
# Instead of creating a temp variable pipe the dataset to the model
logi <- titanic %>%
select(survived, pclass, sex, age, sibsp) %>%
ntbt_glm(survived ~ ., family = binomial)
# Always check statistics
summary(logi)
# Predict on training and test set
logi_pred <- predict(logi, type = "response")
survivors_logi <- rep(0, nrow(titanic))
survivors_logi[logi_pred > .5] <- 1
# This is going to be our training benchmark
table(model = survivors_logi, real = titanic$survived)
# Now on the test set for submission on Kaggle
test <- read.csv("https://raw.githubusercontent.com/alanmarazzi/trees-forest/master/data/test.csv",
stringsAsFactors = FALSE,
na.strings = "")
# Remember to apply the same transformations as for the training set
names(test) <- tolower(names(test))
test$sex <- as.factor(test$sex)
# Prediction of the test set and write results to csv for submission
test_logi_pred <- predict(logi, test, type = "response")
surv_test_logi <- data.frame(PassengerId = test$passengerid,
Survived = rep(0, nrow(test)))
surv_test_logi$Survived[test_logi_pred > .5] <- 1
table(surv_test_logi$Survived)
write.csv(surv_test_logi, "results/logi.csv", row.names = FALSE)
### Result is 0.77512
### Fast and Frugal Trees
# Temporary copy to avoid masking from FFTrees package
titanicc <- titanic
library(FFTrees)
titanic <- titanicc
rm(titanicc)
# Fast and Frugal Trees model building
fftitanic <- titanic %>%
select(age, pclass, sex, sibsp, fare, survived) %>%
ntbt(FFTrees, survived ~ .)
# Plotting of the best tree
plot(fftitanic,
main = "Titanic",
decision.names = c("Not Survived", "Survived"))
# Build a simple classifier out of the best tree
ffpred <- ifelse(test$sex != "male", 1,
ifelse(test$pclass > 2, 0,
ifelse(test$fare < 26.96, 0,
ifelse(test$age >= 21.36, 0, 1))))
# FFTree doesn't deal with NAs, I assign a 0 to them
ffpred[is.na(ffpred)] <- 0
ffpred <- data.frame(PassengerId = test$passengerid, Survived = ffpred)
write.csv(ffpred, "results/fftree.csv", row.names = FALSE)
### Result is 0.76555
### Inferential trees
library(party)
# ctree model building
partyTitanic <- titanic %>%
select(age, pclass, sex, sibsp, fare, survived) %>%
ntbt(ctree, as.factor(survived) ~ .)
# Plot the resulting tree
plot(partyTitanic, main = "Titanic prediction", type = "simple",
inner_panel = node_inner(partyTitanic,
pval = FALSE),
terminal_panel = node_terminal(partyTitanic,
abbreviate = TRUE,
digits = 1,
fill = "white"))
# Prediction of training data
train_party <- Predict(partyTitanic)
table(tree = train_party, real = titanic$survived)
# Prediction of the test set
party_pred <- Predict(partyTitanic, newdata = test)
party_pred <- as.numeric(party_pred) - 1
party_pred <- data.frame(PassengerId = test$passengerid,
Survived = party_pred)
write.csv(party_pred, "results/party.csv", row.names = FALSE)
### Result is 0.73684
### Bagging
library(randomForest)
# If you want the same result remember to set the same seed
set.seed(123)
# Bagging model building
titanic_bag <- titanic %>%
select(survived, age, pclass, sex, sibsp, fare, parch) %>%
ntbt_randomForest(as.factor(survived) ~ ., mtry = 6)
# Bagging and Random Forest don't deal with NAs
test$age[is.na(test$age)] <- median(test$age, na.rm = TRUE)
# The usual test set prediction
bag_pred <- predict(titanic_bag, test)
# Check if there are NAs in prediction and substitute them
sum(is.na(bag_pred))
bag_pred[is.na(bag_pred)] <- 1
bag_pred <- data.frame(PassengerId = test$passengerid,
Survived = bag_pred,
row.names = 1:length(bag_pred))
write.csv(bag_pred, "results/bagging.csv", row.names = FALSE)
### Result is 0.66507
### RandomForest
set.seed(456)
# Random Forest model building
titanic_rf <- titanic %>%
select(survived, age, pclass, sex, sibsp, fare, parch) %>%
ntbt_randomForest(as.factor(survived) ~ ., mtry = 3, ntree = 5000)
# Prediction
rf_pred <- predict(titanic_rf, test)
rf_pred[is.na(rf_pred)] <- 1
rf_pred <- data.frame(PassengerId = test$passengerid, Survived = rf_pred)
write.csv(rf_pred, "results/rf.csv", row.names = FALSE)
### Result is 0.74641
### RandomForest with inferential trees
set.seed(415)
# Use the cforest function from party package
titanic_rf_party <- titanic %>%
select(survived, age, pclass, sex, sibsp, fare, parch) %>%
ntbt(cforest, as.factor(survived) ~ .,
controls = cforest_unbiased(ntree = 5000, mtry = 3))
# Prediction of the test set
rf_party_pred <- predict(titanic_rf_party,
test,
OOB = TRUE,
type = "response")
rf_party_pred <- data.frame(PassengerId = test$passengerid,
Survived = rf_party_pred)
write.csv(rf_party_pred, "results/rf_party.csv", row.names = FALSE)
### Result is 0.77033
### Boosting
library(gbm)
# Set the seed for reproducibility
set.seed(999)
# Boosting model building
titanic_boost <- titanic %>%
select(survived, age, pclass, sex, sibsp, fare, parch) %>%
ntbt(gbm, survived ~ .,
distribution = "bernoulli",
n.trees = 5000,
interaction.depth = 3)
# Boosting prediction
boost_pred <- predict(titanic_boost, test, n.trees = 5000, type = "response")
test_boost <- rep(0, nrow(test))
test_boost[boost_pred >= .5] <- 1
test_boost <- data.frame(PassengerId = test$passengerid,
Survived = test_boost)
write.csv(test_boost, "results/test_boost.csv", row.names = FALSE)
### Result is 0.76077