Titanic Machine Learning


Step 1: Retrieve the Installations

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

Step 2: Load the data

In [19]:
# Importing the dataset
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [21]:
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

In [22]:
# profile.to_notebook_iframe()

Step 2: Preprocessing

In [23]:
def preprocessing(df):
    #Fill in null values
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('U', inplace=True)
    df['Embarked'].fillna('S', inplace=True)
    df['Fare'].fillna(df['Fare'].mean(), inplace=True)
    #Normalize Fare
    df['Fare'] = np.log1p(df['Fare'])

df_0 = df.copy()
preprocessing(df_0)



Step 3: Feature Enginnering

In [24]:

def feature_engineering(df):
    #Create Deck 
    df['Deck'] = df['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')
    df['Sex'] = df['Sex'].apply(lambda s: 1 if s == 'male' else 0)
    #Create Age*Class
    df['Age*Class'] = df['Age']**2 * df['Pclass']/6400
    #drop 
    df.drop(['PassengerId', 'Ticket', 'Name', 'Cabin'], axis=1, inplace=True)

feature_engineering(df_0)


In [25]:
df_0.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck,Age*Class
0,0,3,1,22.0,1,0,2.110213,S,U,0.226875
1,1,1,0,38.0,1,0,4.280593,C,C,0.225625
2,1,3,0,26.0,0,0,2.188856,S,U,0.316875
3,1,1,0,35.0,1,0,3.990834,S,C,0.191406
4,0,3,1,35.0,0,0,2.202765,S,U,0.574219


 Step 4: Model the Data


In [26]:
#one hot encoding for categorical variables - title, Deck, Embarked
df_0 = pd.get_dummies(df_0, columns=['Deck', 'Embarked'])
df_0.drop(['Deck_T'], axis = 1, inplace=True)

In [27]:
df_0.info

<bound method DataFrame.info of      Survived  Pclass  Sex        Age  SibSp  Parch      Fare  Age*Class  \
0           0       3    1  22.000000      1      0  2.110213   0.226875   
1           1       1    0  38.000000      1      0  4.280593   0.225625   
2           1       3    0  26.000000      0      0  2.188856   0.316875   
3           1       1    0  35.000000      1      0  3.990834   0.191406   
4           0       3    1  35.000000      0      0  2.202765   0.574219   
..        ...     ...  ...        ...    ...    ...       ...        ...   
886         0       2    1  27.000000      0      0  2.639057   0.227813   
887         1       1    0  19.000000      0      0  3.433987   0.056406   
888         0       3    0  29.699118      1      2  3.196630   0.413455   
889         1       1    1  26.000000      0      0  3.433987   0.105625   
890         0       3    1  32.000000      0      0  2.169054   0.480000   

     Deck_A  Deck_B  Deck_C  Deck_D  Deck_E  Deck_F  De

In [28]:
profile = ProfileReport(df_0, title='Pandas Profiling Report', explorative=True)

In [29]:
# profile.to_notebook_iframe()

Step 5: Setup Pipeline

    - Feature and Target Values - X, y
    - One hot encode categorical features
    - Train, holdout split
    - Train on multiple algorithms

In [30]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [31]:
X = df_0.drop('Survived', axis=1)
y = df_0['Survived']

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

In [33]:
#create models pipeline
pipeline = {
    'lg': make_pipeline(StandardScaler(), LogisticRegression()),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier()),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier()),
}

In [34]:
LogisticRegression().get_params()
# RandomForestClassifier().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [35]:
grid = {
    'lg':  { 
        'logisticregression__C': [0.1, 1.0, 10, 100, 1000],
    },
    'rf': { 
        'randomforestclassifier__n_estimators': [1, 10, 100, 200, 300, 400, 500, 1000] 
    },
    'gb':  { 
        'gradientboostingclassifier__n_estimators': [1, 10, 100, 200, 300, 400, 500, 1000] 
    }

}


In [36]:
pipeline.items()

dict_items([('lg', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])), ('rf', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier', RandomForestClassifier())])), ('gb', Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gradientboostingclassifier', GradientBoostingClassifier())]))])

In [37]:
#fit the models
models = {}
for algo, pipe in pipeline.items():
    print(f'Fitting {algo}...')
    #create new grid search for each model
    model = GridSearchCV(pipe, grid[algo], cv=5, n_jobs=-1)
    #fit the model
    model.fit(X_train, y_train)
    models[algo] = model
    print(f'Done fitting {algo}.')
    


Fitting lg...
Done fitting lg.
Fitting rf...
Done fitting rf.
Fitting gb...
Done fitting gb.


Step 6: Evaluating Performance

In [38]:
#Evaluate the models
for algo, model in models.items():
    yhat = model.predict(X_test)
    #Evaluate the model
    print(f'Metrics for {algo} \n Accuracy: {accuracy_score(y_test, yhat)} \n Precision: {precision_score(y_test, yhat)} \n Recall: {recall_score(y_test, yhat)} \n F1: {f1_score(y_test, yhat)}')
    #Create a confusion matrix
    print(f'Confusion Matrix for {algo}: \n {confusion_matrix(y_test, yhat)} \n')
    #Create a classification report
    # print(f'Classification Report for {algo}: \n {classification_report(y_test, yhat)} \n')
    

Metrics for lg 
 Accuracy: 0.7761194029850746 
 Precision: 0.7477477477477478 
 Recall: 0.7217391304347827 
 F1: 0.7345132743362831
Confusion Matrix for lg: 
 [[125  28]
 [ 32  83]] 

Metrics for rf 
 Accuracy: 0.7798507462686567 
 Precision: 0.7978723404255319 
 Recall: 0.6521739130434783 
 F1: 0.7177033492822966
Confusion Matrix for rf: 
 [[134  19]
 [ 40  75]] 

Metrics for gb 
 Accuracy: 0.7798507462686567 
 Precision: 0.868421052631579 
 Recall: 0.5739130434782609 
 F1: 0.6910994764397906
Confusion Matrix for gb: 
 [[143  10]
 [ 49  66]] 



Save the Model

In [39]:
import pickle

In [40]:
with open('gradientboosted.pkl', 'wb') as f: 
  pickle.dump(models['gb'], f)

with open('gradientboosted.pkl', 'rb') as f: 
  reloaded_model = pickle.load(f)
     

Predicting on Test Class

In [41]:
# Deep copy testing data
df_t = test.copy()


In [42]:
# Run through the preocessing pipeline
preprocessing(df_t)
feature_engineering(df_t)
# One hot encode categorical variables
df_t = pd.get_dummies(df_t, columns=['Deck', 'Embarked'])

df_t.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Age*Class', 'Deck_A',
       'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_U',
       'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [43]:
df_t.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Age*Class     0
Deck_A        0
Deck_B        0
Deck_C        0
Deck_D        0
Deck_E        0
Deck_F        0
Deck_G        0
Deck_U        0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [44]:
len(df_t.columns) == len(X.columns)

True

In [45]:
yhat_test = models['gb'].predict(df_t)