Import the required libraries

In [14]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os


# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import pandas as pd
from sklearn import model_selection,  ensemble

In [15]:
# Load the Titanic training and test sets; the paths are relative to the
# directory the notebook is run from.
df_train = pd.read_csv('titanic/train.csv')
df_test = pd.read_csv('titanic/test.csv')

In [16]:
def clean(data):
    """Prepare a Titanic passenger DataFrame for modelling, modifying it in place.

    Steps:
      * drop the "Embarked" column (skipped when it is already gone),
      * fill missing "Age" and "Fare" values with the column median,
      * encode "Sex" as integers: "female" -> 1, "male" -> 0.

    The function can be run more than once on the same frame, so re-executing
    the notebook cell that calls it does not raise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least "Age", "Fare" and "Sex" columns.
        It is mutated in place.

    Returns
    -------
    None
    """
    # dropped entirely because it does not seem to help much;
    # errors="ignore" keeps a repeated call from raising KeyError
    data.drop("Embarked", axis=1, inplace=True, errors="ignore")

    # fill missing entries with the column median (median() skips NaN itself)
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())

    # convert to ints; any value other than the two labels (including an
    # already-encoded 0/1) passes through unchanged
    sex_codes = {"female": 1, "male": 0}
    data["Sex"] = data["Sex"].map(lambda label: sex_codes.get(label, label))

The Embarked column is dropped because it has missing values and the feature carries little weight. Missing values in the Age and Fare columns are filled with the median of the respective column. Sex is encoded numerically (male → 0, female → 1) so that the learning algorithms can compute with it.

In [17]:
# Apply the same preprocessing to both frames; clean() modifies each
# DataFrame in place, so nothing is assigned here.
clean(df_train)
clean(df_test)

In [18]:
# necessary to generate a compatible csv file for kaggle
def write_prediction(prediction, name, passenger_ids=None):
    """Write predictions to a CSV with "PassengerId" and "Survived" columns.

    Parameters
    ----------
    prediction : array-like
        One predicted value per passenger, in the same order as the ids.
    name : str
        Destination path of the CSV file. A missing parent directory is
        created before writing.
    passenger_ids : array-like, optional
        Ids written as the "PassengerId" column. When omitted, the
        "PassengerId" column of the notebook-level ``df_test`` frame is used.

    Returns
    -------
    None
    """
    if passenger_ids is None:
        passenger_ids = df_test["PassengerId"]
    passenger_ids = np.array(passenger_ids).astype(int)

    # Create the target directory up front so a fresh checkout without
    # e.g. "results/" does not fail inside to_csv.
    try:
        out_dir = os.path.dirname(name)
    except (TypeError, AttributeError):
        out_dir = ""  # `name` is not a path string; leave it to to_csv
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    solution = pd.DataFrame(prediction, index=passenger_ids, columns=["Survived"])
    solution.to_csv(name, index_label=["PassengerId"])

In [19]:
# Target vector: the "Survived" label of every training passenger.
goal = df_train.loc[:, "Survived"].values
# Feature matrix: the model inputs, one column per listed field, in this order.
variables = df_train.loc[:, ["Sex", "Age", "Pclass", "Fare", "SibSp", "Parch"]].values

We base our training on these variables. Ranked by the feature importances printed further down: Sex > Fare > Age > Pclass > SibSp > Parch

In [20]:
rand_forest = ensemble.RandomForestClassifier(
    n_estimators=1000,      # number of trees in the forest
    criterion='entropy',    # split criterion; worked better than gini here (credit: Raj)
    max_depth=6,            # cap on how deep each tree may grow
    min_samples_split=4,    # a node needs at least this many samples to be split
    random_state=42,        # fixed seed so repeated runs give the same forest
    n_jobs=-1,              # use all processors for fitting and predicting
)

A random forest is a collection of decision trees. It is better at guarding against overfitting than a single decision tree.

In [21]:
# Fit the forest on the training features and labels.
rand_forest = rand_forest.fit(variables, goal)

# Build the test feature matrix from the same columns, in the same order,
# as the training matrix, then predict on it.
variable_test = df_test.loc[:, ["Sex", "Age", "Pclass", "Fare", "SibSp", "Parch"]].values
prediction_forest = rand_forest.predict(variable_test)

# Save the predictions as a Kaggle-compatible .csv file.
write_prediction(prediction_forest, name="results/results.csv")

In [22]:
print(rand_forest.feature_importances_) # importance of each input column in the fitted forest (to find the most valuable variables)
# The printed values follow the column order used for training:
#Sex,         Age,      Pclass,    Fare,      SibSp,      Parch

[0.41312112 0.1475431  0.13820872 0.2048374  0.05821531 0.03807434]


In [23]:
print(rand_forest.score(variables, goal)) # score on the same data the forest was fitted on, so expect it to be optimistic

0.856341189674523


In [24]:
# Cross-validate the forest on the training data with 8 folds, scoring each fold by accuracy.
score = model_selection.cross_val_score(rand_forest, variables, goal, scoring='accuracy', cv=8)
# `score` holds the cross-validation scores, one per fold

In [25]:
print(score) # accuracy of each of the 8 cross-validation folds

[0.79464286 0.77678571 0.83928571 0.875      0.8125     0.79279279
 0.80909091 0.86363636]


In [26]:
print(score.mean()) # mean accuracy across the cross-validation folds

0.8204667939042939
