In [2]:
# import general purpose libraries


import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
# from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


%matplotlib inline

In [3]:
# read in the data
train_data = pd.read_csv("../../datasets/titanic/train.csv")
test_data = pd.read_csv("../../datasets/titanic/test.csv")

In [4]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


So immediately, we can notice quite a few things here:
* We are predicting the variable `Survived`, which is a binary 1/0 classification
* We have a few different classes on the boat
* The names are kinda irregular, we probably want to do something here
* Tickets have irregular formatting
* Cabins have missing values

In short, it looks like we will have some feature engineering to do in order to model this.

Let's take a look at the distribution of survival

In [6]:
train_data.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


So we have 549 that are class 0 (presumably, not survived) and 342 class 1 (presumably, survived). We are probably okay in saying that we have enough of each class that we don't have to do anything special to have enough of each class to make accurate predictions. Let's do a little bit more exploratory analysis to figure out a solid baseline, to figure out if a model presents any real value.

In [7]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
train_data.groupby(['Survived', 'Sex']).count()['PassengerId']

Survived  Sex   
0         female     81
          male      468
1         female    233
          male      109
Name: PassengerId, dtype: int64

In [9]:
train_data.groupby(['Survived', 'Pclass']).count()['PassengerId']

Survived  Pclass
0         1          80
          2          97
          3         372
1         1         136
          2          87
          3         119
Name: PassengerId, dtype: int64

In [10]:
train_data.groupby(['Survived', 'Embarked']).count()['PassengerId']

Survived  Embarked
0         C            75
          Q            47
          S           427
1         C            93
          Q            30
          S           217
Name: PassengerId, dtype: int64

It looks like a pretty clear division for survival is the sex of the passenger, as females look like they fared much better. I also may have seen a few notebooks on this dataset and know that this tends to be a pretty standard division...

Let's write a function for our baseline of predicting a 1 for female, and 0 for male. Also, we can assess the accuracy and/or any other metric 

In [11]:
# note that you need to turn the pandas dataframe into a numpy array
train_data_array = train_data.values
train_data_array.shape
predictions = np.array([train_data_array[x][4] == "female" for x in range(train_data_array.shape[0])]).astype("int")
baseline_accuracy = accuracy_score(y_pred=predictions, y_true=train_data.Survived)
print("Baseline results:")
print(baseline_accuracy)
print(confusion_matrix(y_pred=predictions, y_true=train_data.Survived))

Baseline results:
0.7867564534231201
[[468  81]
 [109 233]]


Okay, so with our current train/test split, we see that we are 78.7% accurate if we just guess based on gender. So if anything isn't better than this, our model is pretty much useless. We can note here that we have more false positives than false negatives.

With a baseline under our belt, we can look into doing some feature engineering for machine learning.

The first thing we will do is take a look into the tickets, and see if there appears to be any sort of patterns here.

In [12]:
train_data.Ticket

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
5                330877
6                 17463
7                349909
8                347742
9                237736
10              PP 9549
11               113783
12            A/5. 2151
13               347082
14               350406
15               248706
16               382652
17               244373
18               345763
19                 2649
20               239865
21               248698
22               330923
23               113788
24               349909
25               347077
26                 2631
27                19950
28               330959
29               349216
             ...       
861               28134
862               17466
863            CA. 2343
864              233866
865              236852
866       SC/PARIS 2149
867            PC 17590
868              345777
869              347742
870              349248
871             

It looks all of these have one numeric term, and then one optional character string. We can separate these out into two vectors. The first one will be binary for the presence of the character string, and the second will just have the number of digits of the numeric part. I'm operating under the assumption that each individual ticket name shouldn't be treated as a numeric and they all don't mean anything.

In [13]:
ticket_array = train_data.Ticket 
ticket_array = ticket_array.astype("str")

def return_splits(single_string):

    if len(single_string.split(" ")) == 1:
        ticket_prefix, ticket_num_digits = 0, len(single_string.split(" ")[0])
    else: 
        ticket_prefix, ticket_num_digits = 1, len(single_string.split(" ")[1])
        
    return(ticket_prefix, ticket_num_digits)

# probably a more efficient way to do this than a loop but that's what we'll do for now
# confusing with python multiple assignments on how to get this to have 'apply'-like functionality
def create_new_ticket_cols(ticket_array):
    ticket_prefix, ticket_num_digits = np.zeros(ticket_array.shape[0]), np.zeros(ticket_array.shape[0])
    for i in range(ticket_array.shape[0]):
        ticket_prefix[i], ticket_num_digits[i] = return_splits(ticket_array[i])

    # numpy is particular about dimensions
    ticket_prefix = ticket_prefix.reshape(-1, 1)
    ticket_num_digits = ticket_num_digits.reshape(-1, 1)
    return(ticket_prefix, ticket_num_digits)

def ticket_preprocessing_pipeline(data, col = 'Ticket'):
    ticket_array = data[col]
    ticket_array = ticket_array.astype("str")
    ticket_prefix, ticket_num_digits = create_new_ticket_cols(ticket_array)
    data['ticket_prefix'] = ticket_prefix
    data['ticket_num_digits'] = ticket_num_digits
    
    # need to keep in mind that we still have the original ticket column
    # need to get rid of this afterwards
    return(data)

train_data_new, test_data_new = ticket_preprocessing_pipeline(train_data), ticket_preprocessing_pipeline(test_data)


In [14]:
train_data_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_prefix,ticket_num_digits
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,5.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,5.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1.0,7.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,6.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0,6.0


In [15]:
train_data.Name

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Wil

So from here, it looks like all of our names have different lengths. It's not immediately clear without more digging, but right now it ~looks~ like all the "titles" have a period afterwards. So let's create a feature column with the title from the name.

In [16]:

# single_string = name_array[10]
def return_name_splits(single_string):
    words = single_string.split(' ')
    boolean_words = ['.' in word for word in words]
    # some logic, to catch the cases where we might not have a title or might have multiple
    if any(boolean_words):
        indexes = [i for i, x in enumerate(boolean_words) if x]
        if len(indexes) > 1:
            val_back = "Multi"
        val_back = words[indexes[0]]
    else:
        val_back = "None"
    return(val_back)

# probably a more efficient way to do this than a loop but that's what we'll do for now
# confusing with python multiple assignments on how to get this to have 'apply'-like functionality
def create_new_title_cols(name_array):
    title_array = np.zeros(name_array.shape[0]).astype("str")
    for i in range(name_array.shape[0]):
        title_array[i] = return_name_splits(name_array[i])

    # numpy is particular about dimensions
    title_array = title_array.reshape(-1, 1)
    return(title_array)

# we have a pd dataframe here so it doesn't really work
# need to rethink this pipeline bc numpy, pandas differences
def title_preprocessing_pipeline(data, col = 'Name'):
    name_array = data[col]
    name_array = name_array.astype("str")
    titles = create_new_title_cols(name_array)
    data['title'] = titles
    
    # need to keep in mind that we still have the originalname column
    # need to get rid of this afterwards
    return(data)

train_data_new, test_data_new = title_preprocessing_pipeline(train_data_new), title_preprocessing_pipeline(test_data_new)


In [17]:
train_data_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_prefix,ticket_num_digits,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1.0,5.0,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1.0,5.0,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1.0,7.0,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.0,6.0,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.0,6.0,Mr.


In [18]:
test_data_new.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_prefix,ticket_num_digits,title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0,6.0,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0.0,6.0,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0,6.0,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0,6.0,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.0,7.0,Mrs.


We need to have some sort of treatment of all of these categorical variables if we want to do a logistic regression. A lot of machine algorithms can't handle multi-leveled categorical inputs. We can use one-hot-encoding (lots of columns of 1s and 0s) in order to keep the same fundamental information though.

In [19]:
from sklearn.preprocessing import OneHotEncoder

def array_from_pd(item):
    return(item.values.astype("str").reshape(-1,1))
    
title_encoder = OneHotEncoder().fit(array_from_pd(train_data_new.title))
sex_encoder = OneHotEncoder().fit(array_from_pd(train_data_new.Sex))
Pclass_encoder = OneHotEncoder().fit(array_from_pd(train_data_new.Pclass))
Cabin_encoder = OneHotEncoder().fit(array_from_pd(train_data_new.Cabin))
Embarked_encoder = OneHotEncoder().fit(array_from_pd(train_data_new.Embarked))

def create_onehotencoded_features(data):
    # get an appropriate array for all the encoding we have to do
    titles_coded = title_encoder.transform(array_from_pd(data.title)).toarray()
    sex_coded = sex_encoder.transform(array_from_pd(data.Sex)).toarray()
    pclass_coded = Pclass_encoder.transform(array_from_pd(data.Pclass)).toarray()
    cabin_coded = Cabin_encoder.transform(array_from_pd(data.Cabin)).toarray()
    embarked_coded = Embarked_encoder.transform(array_from_pd(data.Embarked)).toarray()
    
    # put all the arrays together
    all_coded = np.concatenate((titles_coded, sex_coded, pclass_coded, cabin_coded, embarked_coded), axis = 1)
    return(all_coded)

arr_return = create_onehotencoded_features(train_data_new)
arr_return.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(891, 174)

In [20]:
arr_return

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

At this point now, we have a super sparse matrix denoting all of our possible categorical variable values. Let's do the same thing for our test data to have consistent preprocessing.

In [21]:
arr_return_test = create_onehotencoded_features(test_data_new)

ValueError: Found unknown categories ['Dona.'] in column 0 during transform

Thought this might be an issue, and turns out it is. We have some values for categorical variables that are only seen in the test data, not in the train set, so our encoder doesn't know how to deal with those. Luckily, according to documentation (https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features) we can set the `handle_unknown = 'ignore` and we will ignore anything not seen in the training data. We could alternatively take all the entries from our training and test data and create the encoding from that, but I think using the `handle_unknown` better replicates a production environment, so we will do it this way.

In [22]:
title_encoder = OneHotEncoder(handle_unknown = 'ignore').fit(array_from_pd(train_data_new.title))
sex_encoder = OneHotEncoder(handle_unknown = 'ignore').fit(array_from_pd(train_data_new.Sex))
Pclass_encoder = OneHotEncoder(handle_unknown = 'ignore').fit(array_from_pd(train_data_new.Pclass))
Cabin_encoder = OneHotEncoder(handle_unknown = 'ignore').fit(array_from_pd(train_data_new.Cabin))
Embarked_encoder = OneHotEncoder(handle_unknown = 'ignore').fit(array_from_pd(train_data_new.Embarked))

arr_return_test = create_onehotencoded_features(test_data_new)
arr_return_train = create_onehotencoded_features(train_data_new)

At this point, we have done all the preprocessing that I care to do, so let's get rid of the columns we have expanded in the training and test data and tack on the one-hot-encoded features.

In [23]:
# find out which column we should be dropping
print(train_data_new.columns)
print(test_data_new.columns)

# only keep the Age, SibSp, Parch, Fare, ticket_prefix, ticket_num_digits cols

train_data_final = train_data_new[['Age', 'SibSp', 'Parch', 'Fare', 'ticket_prefix', 'ticket_num_digits']]
test_data_final = test_data_new[['Age', 'SibSp', 'Parch', 'Fare', 'ticket_prefix', 'ticket_num_digits']]

train_data_final_ar = np.concatenate((train_data_final.values, arr_return_train), axis = 1)
test_data_final_ar = np.concatenate((test_data_final.values, arr_return_test), axis = 1)

print(train_data_final_ar.shape)
print(test_data_final_ar.shape)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'ticket_prefix',
       'ticket_num_digits', 'title'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'ticket_prefix',
       'ticket_num_digits', 'title'],
      dtype='object')
(891, 180)
(418, 180)


Now, finally, we can do some machine learning. We're going to try some logistic regression, decision trees, and randomForest.

In [24]:
from sklearn.linear_model import LogisticRegression
# no need for scaling
y_train = train_data.Survived

log_fit = LogisticRegression().fit(X = train_data_final_ar, y = y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [25]:
np.sum(np.isnan(train_data_final_ar))

177

Turns out we have some NA's in our data, which make it really inconvenient to do further work. We need to deal with these somehow.

In [26]:
train_data.isnull().sum(axis = 0)

PassengerId            0
Survived               0
Pclass                 0
Name                   0
Sex                    0
Age                  177
SibSp                  0
Parch                  0
Ticket                 0
Fare                   0
Cabin                687
Embarked               2
ticket_prefix          0
ticket_num_digits      0
title                  0
dtype: int64

In [27]:
test_data.isnull().sum(axis=0)

PassengerId            0
Pclass                 0
Name                   0
Sex                    0
Age                   86
SibSp                  0
Parch                  0
Ticket                 0
Fare                   1
Cabin                327
Embarked               0
ticket_prefix          0
ticket_num_digits      0
title                  0
dtype: int64

To get around the issue of missing values, we will set them equal to zero because I'm getting tired of preprocessing at this point. A more robust approach might use the medians for each.

In [28]:
train_data_final_ar = np.nan_to_num(train_data_final_ar)
test_data_final_ar = np.nan_to_num(test_data_final_ar)

In [29]:
log_fit = LogisticRegression().fit(X = train_data_final_ar, y = y_train)

log_preds = log_fit.predict(train_data_final_ar)
log_accuracy = accuracy_score(y_pred=log_preds, y_true=y_train)
print("Logistic Regression results:")
print(log_accuracy)
print(confusion_matrix(y_pred=log_preds, y_true=y_train))

Logistic Regression results:
0.8574635241301908
[[492  57]
 [ 70 272]]




Awesome. We created some value over just predicting based on gender, with an accuracy of about 86%. We are doing better with both false positives and false negatives, and more or less equally so for both. With logistic regression, there aren't really too many hyperparameters to speak of, so we will leave this alone for now.

One thing we forgot to do- this model is trained on our entire training set, with no sort of validation set (test set is left external and only really for kaggle use). We will do cross-validation here, because we want to maximize all the data being used for training. 

In [30]:
from sklearn.model_selection import cross_val_score

logistic_reg = LogisticRegression()
scores = cross_val_score(logistic_reg, train_data_final_ar, y_train, cv = 10)
print("Mean accuracy")
print(np.mean(scores))
print("stdev accuracy")
print(np.std(scores))

Mean accuracy
0.8261011803427534
stdev accuracy
0.03420452860078454




Okay, we we can still be relatively certain that our model is performing better than baseline, but perhaps not to the level that we had above.

Let's see if a decision tree can do any better.

In [190]:
from sklearn.tree import DecisionTreeClassifier
# use default hyperparameters
clf = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(clf, train_data_final_ar, y_train, cv = 10)
print("Mean accuracy")
print(np.mean(scores))
print("stdev accuracy")
print(np.std(scores))


Mean accuracy
0.7901191692202929
stdev accuracy
0.0522247658769595


So with a decision tree, we aren't getting as good of results as a logistic regression and it's a little unclear whether these are better than our baseline of about 78%. We should note that the sklearn implementation of a decision tree goes all the way until you have one sample in each leaf node- we shouldn't really expect this to generalize well. With a little hyperparameter tuning, we can change that and see if we get any better performance.

In [195]:
from sklearn.model_selection import GridSearchCV
depths = {'max_depth' : [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50, 75, 100]}
clf = DecisionTreeClassifier(random_state=0)
cv_fit = GridSearchCV(clf, depths, scoring = 'accuracy', cv = 10, n_jobs = -1)
results = cv_fit.fit(train_data_final_ar, y_train)
pd.DataFrame(results.cv_results_)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.005197,0.0004,0.782267,0.782267,1,{'max_depth': 1},14,0.766667,0.78402,0.788889,...,0.764045,0.784289,0.820225,0.778055,0.75,0.785803,0.001834,0.0012,0.033163,0.003681
1,0.006,0.0004,0.789001,0.789002,2,{'max_depth': 2},13,0.744444,0.794007,0.744444,...,0.775281,0.790524,0.797753,0.78803,0.795455,0.788294,0.002002,0.0012,0.037831,0.004209
2,0.0072,0.0008,0.820426,0.830778,3,{'max_depth': 3},1,0.811111,0.838951,0.811111,...,0.808989,0.831671,0.876404,0.831671,0.818182,0.821918,0.001599,0.0016,0.029145,0.00659
3,0.0064,0.0008,0.809203,0.847738,4,{'max_depth': 4},3,0.8,0.853933,0.755556,...,0.797753,0.841646,0.876404,0.845387,0.829545,0.849315,0.001959,0.0016,0.04447,0.006695
4,0.007601,0.001599,0.809203,0.866567,5,{'max_depth': 5},3,0.755556,0.871411,0.788889,...,0.775281,0.86409,0.898876,0.862843,0.795455,0.871731,0.0012,0.001959,0.047582,0.007717
5,0.0116,0.0004,0.810325,0.950367,10,{'max_depth': 10},2,0.766667,0.956305,0.788889,...,0.764045,0.94389,0.865169,0.947631,0.784091,0.96264,0.002154,0.0012,0.045656,0.006805
6,0.010801,0.0,0.79798,0.98703,15,{'max_depth': 15},5,0.755556,0.985019,0.822222,...,0.730337,0.985037,0.853933,0.982544,0.772727,0.990037,0.00358,0.0,0.050715,0.003311
7,0.009811,0.0008,0.791246,0.992518,20,{'max_depth': 20},6,0.766667,0.992509,0.8,...,0.741573,0.993766,0.831461,0.991272,0.772727,0.993773,0.003814,0.0016,0.052471,0.00079
8,0.010799,0.0008,0.790123,0.992518,25,{'max_depth': 25},7,0.766667,0.992509,0.8,...,0.741573,0.993766,0.831461,0.991272,0.772727,0.993773,0.002562,0.0016,0.052199,0.00079
9,0.009813,0.0,0.790123,0.992518,30,{'max_depth': 30},7,0.766667,0.992509,0.8,...,0.741573,0.993766,0.831461,0.991272,0.772727,0.993773,0.004758,0.0,0.052199,0.00079


Alright so with a maximum depth of 3 on our decision tree, we can see that we have a mean test score of about 82%, which is an improvement over the baseline, but not over the logistic regression.

While we could do some visualization and more interpretation here, I'm just going to go straight onto random forests. We will go straight into hyperparameter tuning. The `n_estimators` is how many trees we generate, `min_samples_leaf` is a way that we can control tree depth by specifying the minimum number of samples per leaf, and `max_features` specifies how many variables we test at each node. The idea here is that we want a lot of trees that come to the same decisions independently, and these hyperparameters will ensure that our trees are pretty different.

We should also note that this should be decently computationally expensive...

In [198]:
from sklearn.ensemble import RandomForestClassifier

tstart = time.time()
params = {'n_estimators' : [50, 100, 150, 250, 300, 500],\
         'min_samples_leaf' : [1, 3, 5, 7],\
         'max_features' : [5, 10, 15, 25, 50, 75, 100, 125]}

clf = RandomForestClassifier(random_state = 0)
cv_fit = GridSearchCV(clf, params, scoring = 'accuracy', cv = 10, n_jobs = -1)
results = cv_fit.fit(train_data_final_ar, y_train)

tdiff = time.time() - tstart
print("Took " + str(tdiff) + " seconds")
pd.DataFrame(results.cv_results_).sort_values(['rank_test_score'])


Took 224.49790501594543 seconds




Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_features,param_min_samples_leaf,param_n_estimators,params,rank_test_score,split0_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
112,1.105567,0.065243,0.839506,0.870684,50,5,300,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",1,0.811111,...,0.786517,0.870324,0.876404,0.866584,0.863636,0.864259,0.270026,0.045878,0.044160,0.006539
111,1.118000,0.026000,0.838384,0.870558,50,5,250,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",2,0.822222,...,0.786517,0.870324,0.865169,0.866584,0.863636,0.869240,0.301087,0.004472,0.042516,0.005798
110,0.319600,0.018400,0.838384,0.871181,50,5,150,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",2,0.800000,...,0.786517,0.872818,0.876404,0.869077,0.875000,0.869240,0.012706,0.008617,0.045199,0.006028
113,1.775158,0.073948,0.836139,0.869936,50,5,500,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",4,0.811111,...,0.786517,0.867830,0.865169,0.864090,0.863636,0.865504,0.042311,0.042141,0.043643,0.006023
108,0.134001,0.007999,0.836139,0.868937,50,5,50,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",4,0.788889,...,0.786517,0.871571,0.876404,0.866584,0.852273,0.865504,0.037535,0.005933,0.044947,0.005245
163,0.585534,0.031180,0.836139,0.874549,100,7,100,"{'max_features': 100, 'min_samples_leaf': 7, '...",4,0.800000,...,0.797753,0.871571,0.876404,0.865337,0.829545,0.872976,0.235594,0.017446,0.050083,0.005910
143,1.748309,0.083600,0.836139,0.869686,75,7,500,"{'max_features': 75, 'min_samples_leaf': 7, 'n...",4,0.811111,...,0.786517,0.871571,0.865169,0.865337,0.863636,0.865504,0.133755,0.069384,0.047516,0.005668
109,0.608399,0.028401,0.835017,0.867690,50,5,100,"{'max_features': 50, 'min_samples_leaf': 5, 'n...",8,0.800000,...,0.786517,0.869077,0.876404,0.860349,0.863636,0.866750,0.190793,0.018195,0.043451,0.007560
106,1.186821,0.065290,0.835017,0.893379,50,3,300,"{'max_features': 50, 'min_samples_leaf': 3, 'n...",8,0.822222,...,0.786517,0.892768,0.876404,0.887781,0.863636,0.890411,0.291438,0.044983,0.049566,0.004447
162,0.154419,0.005432,0.835017,0.872929,100,7,50,"{'max_features': 100, 'min_samples_leaf': 7, '...",8,0.788889,...,0.797753,0.871571,0.876404,0.869077,0.863636,0.874222,0.060014,0.004504,0.045657,0.006688


So running this actually killed about 6% of my laptop's battery and took quite a bit, so we know that this was some hefty calculations happening on the back end. But with our best settings we can now have an accuracy of about 87%, which is an improvement over our logistic regression model. 

A few notes- 

* I'm intentionally being lazy and not doing too much visualization at the moment... If I were to try to explain why each of these models is doing what it's doing, I haven't really presented a convincing case here. 
* We have done cross-validation in order to evaluate our data, but the real use case is with the test set. So we would really want to predict on the test set, submit to kaggle, and then see what our results are in order to really see how good we have done. Ideally, cross validation should shot pretty similar results to the evaluation on the test set, but you can't know that till you do it.

In [31]:
# for kaggle submission
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state = 0)
classifier_fit = clf.fit(train_data_final_ar, y_train)
test_predictions = clf.predict(test_data_final_ar)
test_predictions



array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [32]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,ticket_prefix,ticket_num_digits,title
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0,6.0,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0.0,6.0,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0,6.0,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0,6.0,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0.0,7.0,Mrs.
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S,0.0,4.0,Mr.
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q,0.0,6.0,Miss.
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S,0.0,6.0,Mr.
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C,0.0,4.0,Mrs.
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S,1.0,5.0,Mr.


In [33]:
pass_ids = test_data.PassengerId

In [35]:
submission_df = pd.DataFrame({'PassengerId' : pass_ids, 'Survived' : test_predictions})
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [37]:
submission_df.to_csv("../../datasets/titanic/submission1.csv", index = False)