In [91]:
# import general purpose libraries


import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
# from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


%matplotlib inline

In [92]:
# read in the data
train_data = pd.read_csv("../../datasets/titanic/train.csv")
test_data = pd.read_csv("../../datasets/titanic/test.csv")

In [93]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [94]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


So immediately, we can notice quite a few things here:
* We are predicting the variable `Survived`, which is a binary 1/0 classification
* We have a few different classes on the boat
* The names are kinda irregular, we probably want to do something here
* Tickets have irregular formatting
* Cabins have missing values

In short, it looks like we will have some feature engineering to do in order to model this.

Let's take a look at the distribution of survival

In [95]:
train_data.groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,549,549,549,549,424,549,549,549,549,68,549
1,342,342,342,342,290,342,342,342,342,136,340


So we have 549 that are class 0 (presumably, not survived) and 342 class 1 (presumably, survived). We are probably okay in saying that we have enough of each class that we don't have to do anything special to have enough of each class to make accurate predictions. Let's do a little bit more exploratory analysis to figure out a solid baseline, to figure out if a model presents any real value.

In [96]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [97]:
train_data.groupby(['Survived', 'Sex']).count()['PassengerId']

Survived  Sex   
0         female     81
          male      468
1         female    233
          male      109
Name: PassengerId, dtype: int64

In [98]:
train_data.groupby(['Survived', 'Pclass']).count()['PassengerId']

Survived  Pclass
0         1          80
          2          97
          3         372
1         1         136
          2          87
          3         119
Name: PassengerId, dtype: int64

In [99]:
train_data.groupby(['Survived', 'Embarked']).count()['PassengerId']

Survived  Embarked
0         C            75
          Q            47
          S           427
1         C            93
          Q            30
          S           217
Name: PassengerId, dtype: int64

It looks like a pretty clear division for survival is the sex of the passenger, as females look like they fared much better. I also may have seen a few notebooks on this dataset and know that this tends to be a pretty standard division...

Let's write a function for our baseline of predicting a 1 for female, and 0 for male. Also, we can assess the accuracy and/or any other metric 

In [100]:
# note that you need to turn the pandas dataframe into a numpy array
train_data_array = train_data.values
train_data_array.shape
predictions = np.array([train_data_array[x][4] == "female" for x in range(train_data_array.shape[0])]).astype("int")
baseline_accuracy = accuracy_score(y_pred=predictions, y_true=train_data.Survived)
print("Baseline results:")
print(baseline_accuracy)
print(confusion_matrix(y_pred=predictions, y_true=train_data.Survived))

Baseline results:
0.7867564534231201
[[468  81]
 [109 233]]


Okay, so with our current train/test split, we see that we are 78.7% accurate if we just guess based on gender. So if anything isn't better than this, our model is pretty much useless. We can note here that we have more false positives than false negatives.

With a baseline under our belt, we can look into doing some feature engineering for machine learning.

The first thing we will do is take a look into the tickets, and see if there appears to be any sort of patterns here.

In [101]:
train_data.Ticket

0             A/5 21171
1              PC 17599
2      STON/O2. 3101282
3                113803
4                373450
5                330877
6                 17463
7                349909
8                347742
9                237736
10              PP 9549
11               113783
12            A/5. 2151
13               347082
14               350406
15               248706
16               382652
17               244373
18               345763
19                 2649
20               239865
21               248698
22               330923
23               113788
24               349909
25               347077
26                 2631
27                19950
28               330959
29               349216
             ...       
861               28134
862               17466
863            CA. 2343
864              233866
865              236852
866       SC/PARIS 2149
867            PC 17590
868              345777
869              347742
870              349248
871             

It looks all of these have one numeric term, and then one optional character string. We can separate these out into two vectors. The first one will be binary for the presence of the character string, and the second will just have the number of digits of the numeric part. I'm operating under the assumption that each individual ticket name shouldn't be treated as a numeric and they all don't mean anything.

In [102]:
ticket_array = train_data.Ticket 
ticket_array = ticket_array.astype("str")

def return_splits(single_string):

    if len(single_string.split(" ")) == 1:
        ticket_prefix, ticket_num_digits = 0, len(single_string.split(" ")[0])
    else: 
        ticket_prefix, ticket_num_digits = 1, len(single_string.split(" ")[1])
        
    return(ticket_prefix, ticket_num_digits)

# probably a more efficient way to do this than a loop but that's what we'll do for now
# confusing with python multiple assignments on how to get this to have 'apply'-like functionality
def create_new_ticket_cols(ticket_array):
    ticket_prefix, ticket_num_digits = np.zeros(ticket_array.shape[0]), np.zeros(ticket_array.shape[0])
    for i in range(ticket_array.shape[0]):
        ticket_prefix[i], ticket_num_digits[i] = return_splits(ticket_array[i])

    # numpy is particular about dimensions
    ticket_prefix = ticket_prefix.reshape(-1, 1)
    ticket_num_digits = ticket_num_digits.reshape(-1, 1)
    return(ticket_prefix, ticket_num_digits)

def ticket_preprocessing_pipeline(data, col = 'Ticket'):
    ticket_array = data[col]
    ticket_array = ticket_array.astype("str")
    ticket_prefix, ticket_num_digits = create_new_ticket_cols(ticket_array)
    newdata = np.append(data, ticket_prefix, axis = 1)
    newdata = np.append(newdata, ticket_num_digits, axis = 1)
    
    # need to keep in mind that we still have the original ticket column
    # need to get rid of this afterwards
    return(newdata)

train_data_new, test_data_new = ticket_preprocessing_pipeline(train_data), ticket_preprocessing_pipeline(test_data)


In [103]:
train_data.Name

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Wil

So from here, it looks like all of our names have different lengths. It's not immediately clear without more digging, but right now it ~looks~ like all the "titles" have a period afterwards. So let's create a feature column with the title from the name.

In [124]:

# single_string = name_array[10]
def return_name_splits(single_string):
    words = single_string.split(' ')
    boolean_words = ['.' in word for word in words]
    # some logic, to catch the cases where we might not have a title or might have multiple
    if any(boolean_words):
        indexes = [i for i, x in enumerate(boolean_words) if x]
        if len(indexes) > 1:
            val_back = "Multi"
        val_back = words[indexes[0]]
    else:
        val_back = "None"
    return(val_back)

# probably a more efficient way to do this than a loop but that's what we'll do for now
# confusing with python multiple assignments on how to get this to have 'apply'-like functionality
def create_new_title_cols(name_array):
    title_array = np.zeros(name_array.shape[0]).astype("str")
    for i in range(name_array.shape[0]):
        title_array[i] = return_name_splits(name_array[i])

    # numpy is particular about dimensions
    title_array = title_array.reshape(-1, 1)
    return(title_array)

name_array = train_data.Name
name_array = name_array.astype("str")

# we have a pd dataframe here so it doesn't really work
# need to rethink this pipeline bc numpy, pandas differences
def title_preprocessing_pipeline(data, col = 'Name'):
    name_array = data[col]
    name_array = name_array.astype("str")
    titles = create_new_title_cols(name_array)
    newdata = np.append(data, titles, axis = 1)
    
    # need to keep in mind that we still have the originalname column
    # need to get rid of this afterwards
    return(newdata)


train_data_new, test_data_new = title_preprocessing_pipeline(train_data_new), title_preprocessing_pipeline(test_data_new)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [121]:
# find out which column we should be dropping
print(train_data.columns)
print(test_data.columns)

train_data_new = np.delete(train_data_new, [8], axis=1)
test_data_new = np.delete(test_data_new, [7], axis=1)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
