In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# Seed NumPy's global random generator so anything drawing from np.random
# gives the same results on every run of this notebook.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import pandas as pd
from sklearn import model_selection,  ensemble

In [3]:
# Load the Titanic training and test splits from the local titanic/ folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)

In [4]:
def clean(data):
    """Prepare a Titanic passenger frame for modelling, modifying it in place.

    Steps:
      * drop the "Embarked" column (it did not seem to help the model much);
      * fill missing "Age" and "Fare" entries with that column's median,
        computed from ``data`` itself;
      * encode "Sex" numerically: "female" -> 1, "male" -> 0.

    The function can be called again on an already-cleaned frame, so the cell
    that invokes it can be re-run without restarting the kernel.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with "Age", "Fare" and "Sex" columns and, optionally, "Embarked".

    Returns
    -------
    None
        ``data`` is mutated in place.

    Raises
    ------
    ValueError
        If "Sex" holds a label other than "female"/"male" that cannot be
        interpreted as a number.
    """
    # errors="ignore": a repeated call finds "Embarked" already gone and must not raise.
    data.drop("Embarked", axis=1, inplace=True, errors="ignore")

    # Series.median() skips NaN on its own, so no explicit dropna() is needed.
    for column in ("Age", "Fare"):
        data[column] = data[column].fillna(data[column].median())

    # Translate only the two string labels; values that are already numeric
    # (left by an earlier call) pass through unchanged.
    sex_codes = {"female": 1, "male": 0}
    encoded = data["Sex"].map(lambda label: sex_codes.get(label, label))
    # to_numeric gives the column a numeric dtype instead of 0/1 stored as objects.
    data["Sex"] = pd.to_numeric(encoded)

In [5]:
# Apply the same in-place cleaning to the training split, then the test split.
for frame in (df_train, df_test):
    clean(frame)

In [6]:
# Produces the submission csv in the layout kaggle expects (PassengerId, Survived).
def write_prediction(prediction, name, passenger_ids=None):
    """Write predictions to ``name`` as a two-column CSV: PassengerId, Survived.

    Parameters
    ----------
    prediction : array-like
        One predicted value per passenger, in the same order as the ids.
    name : str
        Destination path of the CSV file. A missing parent directory is created.
    passenger_ids : array-like, optional
        Ids written to the "PassengerId" column. When omitted, the
        "PassengerId" column of the notebook-level ``df_test`` frame is used.

    Returns
    -------
    None
    """
    if passenger_ids is None:
        passenger_ids = df_test["PassengerId"]
    ids = np.array(passenger_ids).astype(int)

    # to_csv does not create directories; make sure the target folder exists
    # so a fresh checkout without e.g. results/ does not fail here.
    out_dir = os.path.dirname(name)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    solution = pd.DataFrame(prediction, index=ids, columns=["Survived"])
    solution.to_csv(name, index_label=["PassengerId"])

In [7]:
# Target vector: the "Survived" column of the training split.
goal = df_train.loc[:, "Survived"].values
# Feature matrix: the six input columns, in this order.
variables = df_train.loc[
    :, ["Sex", "Age", "Pclass", "Fare", "SibSp", "Parch"]
].values

In [8]:
# Random forest classifier configuration.
rand_forest = ensemble.RandomForestClassifier(
    n_estimators=1000,    # number of trees in the forest
    max_depth=6,          # upper bound on the depth of each tree
    min_samples_split=4,  # minimum samples a node needs before it may be split
    random_state=42,      # fixed seed for the forest's own randomness
    n_jobs=-1,            # use all processors for fitting and predicting
)

In [9]:
# Fit on the training features, predict the test split, and export the
# predictions to a .csv file.
rand_forest = rand_forest.fit(variables, goal)
variable_test = df_test.loc[
    :, ["Sex", "Age", "Pclass", "Fare", "SibSp", "Parch"]
].values
prediction_forest = rand_forest.predict(variable_test)
write_prediction(name="results/results.csv", prediction=prediction_forest)

In [10]:
# Importance of each input column according to the fitted forest. Values follow the
# column order used to build `variables`: Sex, Age, Pclass, Fare, SibSp, Parch.
print(rand_forest.feature_importances_)

[0.44709541 0.13409704 0.1399686  0.1879372  0.0544267  0.03647505]


In [12]:
# Score on the same data the forest was fitted on, so this is an optimistic
# figure; the cross-validation further down gives an out-of-sample estimate.
print(rand_forest.score(variables, goal))

0.8664421997755332


In [23]:
# 8-fold cross-validation of the forest on the training data, scored by accuracy;
# `score` holds one value per fold.
score = model_selection.cross_val_score(rand_forest, variables, goal, scoring='accuracy', cv=8)

In [24]:
# Per-fold accuracies from the cross-validation.
print(score)

[0.78571429 0.77678571 0.83928571 0.875      0.8125     0.79279279
 0.8        0.87272727]


In [18]:
# Mean cross-validated accuracy.
# NOTE(review): this cell's execution count (In [18]) predates the cell that computes
# `score` (In [23]), and the recorded value is not the mean of the eight fold scores
# printed above (about 0.819) - re-run the notebook top to bottom to refresh it.
print(score.mean())

0.8272494609011464
