# Titanic Machine Learning Project

The goal of this project is to build a machine learning model that predicts which passengers survived the Titanic with the greatest possible accuracy.

## 1. Imports and Data Cleaning

In this section, I import the data and clean it to prepare it for modeling.

In [1]:
import matplotlib.pyplot as plt

import pandas as pd

import seaborn as sns

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, RocCurveDisplay, log_loss
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

In [2]:
train_df = pd.read_csv('Data/train.csv')

In [3]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
test_df = pd.read_csv('Data/test.csv')

In [5]:
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [6]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Only 38% of passengers survived.

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
train_df.drop('Cabin',inplace=True,axis=1)

In [9]:
test_df.drop('Cabin',inplace=True,axis=1)

### 1a. Cleaning Age Null Values

First, I review some of the records that contain null values for age to see what comes to light.

In [10]:
train_df[train_df['Age'].isna()].head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,S
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,C
32,33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,Q
36,37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,C
42,43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,C


The titles of the passengers stand out. Females have either a Miss or Mrs title and males have either a Mr or Master title. These offer general indications of passenger age, so I use them to approximate the ages of the passengers that have null values for age.

In [11]:
master_df = train_df[train_df['Name'].str.contains('master', case=False)]

In [12]:
master_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,40.0,40.0,40.0,36.0,40.0,40.0,40.0
mean,414.975,0.575,2.625,4.574167,2.3,1.375,34.703125
std,301.717518,0.500641,0.627878,3.619872,1.910833,0.540062,28.051752
min,8.0,0.0,1.0,0.42,0.0,0.0,8.5167
25%,165.75,0.0,2.0,1.0,1.0,1.0,18.75
50%,345.0,1.0,3.0,3.5,1.0,1.0,29.0625
75%,764.0,1.0,3.0,8.0,4.0,2.0,39.171875
max,870.0,1.0,3.0,12.0,8.0,2.0,151.55


The median age for passengers with the title 'master' is 3.5. I find the median age for the remaining three titles and use those values to fill the nulls for those categories.

In [13]:
miss_df = train_df[train_df['Name'].str.contains('Miss', case=False)]

In [14]:
miss_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,182.0,182.0,182.0,146.0,182.0,182.0,182.0
mean,408.884615,0.697802,2.307692,21.773973,0.714286,0.549451,43.797873
std,246.775812,0.460477,0.849989,12.990292,1.431961,0.804184,66.027199
min,3.0,0.0,1.0,0.75,0.0,0.0,6.75
25%,213.0,0.0,1.25,14.125,0.0,0.0,7.95105
50%,381.5,1.0,3.0,21.0,0.0,0.0,15.62085
75%,612.25,1.0,3.0,30.0,1.0,1.0,41.0344
max,889.0,1.0,3.0,63.0,8.0,2.0,512.3292


In [15]:
mrs_df = train_df[train_df['Name'].str.contains('Mrs', case=False)]

In [16]:
mrs_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,129.0,129.0,129.0,112.0,129.0,129.0,129.0
mean,453.806202,0.790698,1.984496,35.642857,0.682171,0.806202,44.731944
std,268.570873,0.408397,0.819532,11.506251,0.599277,1.262918,45.127354
min,2.0,0.0,1.0,14.0,0.0,0.0,7.225
25%,255.0,1.0,1.0,27.0,0.0,0.0,15.85
50%,438.0,1.0,2.0,35.0,1.0,0.0,26.0
75%,679.0,1.0,3.0,44.0,1.0,1.0,55.9
max,886.0,1.0,3.0,63.0,3.0,6.0,247.5208


In [17]:
mr_df = train_df[train_df['Name'].str.contains('Mr', case=False)]

In [18]:
mr_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,647.0,647.0,647.0,511.0,647.0,647.0,647.0
mean,454.585781,0.284389,2.323029,33.118395,0.367852,0.282844,28.537305
std,256.402949,0.451472,0.830426,12.525001,0.797103,0.782193,45.207127
min,1.0,0.0,1.0,11.0,0.0,0.0,0.0
25%,230.0,0.0,2.0,24.0,0.0,0.0,7.8958
50%,464.0,0.0,3.0,31.0,0.0,0.0,13.0
75%,674.5,1.0,3.0,40.0,1.0,0.0,27.75
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [19]:
train_df.loc[train_df['Name'].str.contains('Master', case=False), 'Age'] = train_df.loc[train_df['Name'].str.contains('Master', case=False), 'Age'].fillna(3.5)
train_df.loc[train_df['Name'].str.contains('Miss', case=False), 'Age'] = train_df.loc[train_df['Name'].str.contains('Miss', case=False), 'Age'].fillna(21)
train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'female'), 'Age'] = train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'female'), 'Age'].fillna(35)
train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'male'), 'Age'] = train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'male'), 'Age'].fillna(31)

In [20]:
train_df['Title'] = train_df['Name'].str.extract(r',\s*([^,]+)\.')

In [21]:
train_df['Title'].value_counts()

Mr                          517
Miss                        182
Mrs                         124
Master                       40
Dr                            7
Rev                           6
Mlle                          2
Col                           2
Major                         2
Lady                          1
Sir                           1
Mrs. Martin (Elizabeth L      1
the Countess                  1
Jonkheer                      1
Don                           1
Capt                          1
Ms                            1
Mme                           1
Name: Title, dtype: int64

In [22]:
adsf = asdkfh

NameError: name 'asdkfh' is not defined

In [None]:
train_df.loc[train_df['Name'].str.contains('Master', case=False), 'Title'] = 'Master'
train_df.loc[train_df['Name'].str.contains('Miss', case=False), 'Title'] = 'Miss'
train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'female'), 'Title'] = 'Mrs'
train_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (train_df['Sex'] == 'male'), 'Title'] = 'Mr'

In [None]:
train_df[train_df['Title'].isna()].head(25)

In [None]:
test_df.loc[test_df['Name'].str.contains('Master', case=False), 'Age'] = test_df.loc[test_df['Name'].str.contains('Master', case=False), 'Age'].fillna(3.5)
test_df.loc[test_df['Name'].str.contains('Miss', case=False), 'Age'] = test_df.loc[test_df['Name'].str.contains('Miss', case=False), 'Age'].fillna(21)
test_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'female'), 'Age'] = test_df.loc[(test_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'female'), 'Age'].fillna(35)
test_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'male'), 'Age'] = test_df.loc[(test_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'male'), 'Age'].fillna(31)

In [None]:
test_df.loc[test_df['Name'].str.contains('Master', case=False), 'Title'] = 'Master'
test_df.loc[test_df['Name'].str.contains('Miss', case=False), 'Title'] = 'Miss'
test_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'female'), 'Title'] = 'Mrs'
test_df.loc[(train_df['Name'].str.contains('Mr', case=False)) & (test_df['Sex'] == 'male'), 'Title'] = 'Mr'

In [None]:
train_df.info()

There is still one null value.

In [None]:
train_df[train_df['Age'].isna()]

I use the same age as the one I did for Mr. for this final passenger.

In [None]:
train_df['Age'].fillna(31,inplace=True)

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
test_df[test_df['Age'].isna()]

In [None]:
test_df['Age'].fillna(21,inplace=True)

The final null values come from the Embarked column for the train data and the fare data for the test data.

### 1c. Cleaning null values in the Embarked and Fare columns

The Embarked column shows the port of embarkment for each passenger. I add these passengers to the most common port.

In [None]:
train_df['Embarked'].value_counts()

In [None]:
train_df[train_df['Embarked'].isna()]

In [None]:
train_df['Embarked'].fillna('S',inplace=True)

In [None]:
train_df.info()

In [None]:
test_df[test_df['Fare'].isna()]

In [None]:
train_df[train_df['Pclass'] == 3].describe()

In [None]:
test_df['Fare'].fillna(8.05,inplace=True)

In [None]:
test_df.info()

Hooray! There are no more null values. The next step is preparing for modeling.

## 2. Data Preparation and EDA

First, I drop columns with no predictive value.

In [None]:
train_df_filtered = train_df.drop(['PassengerId','Name','Ticket'], axis = 1)
test_df_filtered = test_df.drop(['PassengerId','Name','Ticket'], axis = 1)

In [None]:
sns.barplot(x='Pclass', y='Survived', data=train_df_filtered)
plt.xlabel('Pclass')
plt.ylabel('Survived')
plt.title('Bar Plot of Passenger Class against Survival');

In [None]:
sns.barplot(x='Sex', y='Survived', data=train_df_filtered)
plt.xlabel('Sex')
plt.ylabel('Survived')
plt.title('Bar Plot of Sex against Survival');

In [None]:
train_df_filtered['Age_Bins'] = pd.cut(train_df_filtered['Age'],bins=10)

In [None]:
plt.figure(figsize=(16,8))
sns.barplot(x='Age_Bins', y='Survived', data=train_df_filtered)

plt.xlabel('Age')
plt.ylabel('Survived')
plt.title('Scatter Plot of Age against Survival');

In [None]:
plt.figure(figsize=(16,8))
sns.histplot(x='Age',bins=10,data=train_df_filtered)
plt.title('Age Histogram');

Now I define the X and Y .

In [None]:
X_train = train_df_filtered.drop(['Survived'], axis=1)
y_train = train_df_filtered['Survived']
X_test = test_df_filtered

Next, I define which columns need one hot encoding and which are already numerical.

In [None]:
ohecols = ['Sex','Embarked']

In [None]:
numcols = ['Pclass','Age','SibSp','Parch','Fare']

In [None]:
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
ohe_sub = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', onehot)
])

In [None]:
num_sub = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent'))
])

In [None]:
CT = ColumnTransformer(transformers=[
    ('onehot', ohe_sub, ohecols),
    ('numerical', num_sub, numcols)
])

## 3. Modeling

In [None]:
dum_pipe = Pipeline(steps=[
    ('ct', CT),
    ('dummy', DummyClassifier(strategy='most_frequent'))
])

In [None]:
dum_pipe.fit(X_train, y_train)

In [None]:
print(classification_report(y_train, dum_pipe.predict(X_train)))

### 3b. Logistic Regression

In [None]:
lr_pipe = Pipeline(steps=[
    ('ct', CT),
    ('lr', LogisticRegression(max_iter=1000))
])

In [None]:
lr_pipe.fit(X_train, y_train)

In [None]:
print(classification_report(y_train, lr_pipe.predict(X_train)))

In [None]:
ConfusionMatrixDisplay.from_estimator(lr_pipe, X_train, y_train);

In [None]:
params = {
    'fsm__max_iter' : [10, 100, 1000, 10000],
    'fsm__C' : [0.0001, 0.001, 0.01, 0.1, 1],
    'fsm__tol' : [0.0001,0.001,0.01,0.1],
    'fsm__penalty' : ['l2','none'],
    'fsm__class_weight' : [None,'balanced']
}

In [None]:
gs = GridSearchCV(
    estimator = lr_pipe,
    param_grid = params,
    cv = 5,
    verbose = 1
)

In [None]:
#gs.fit(X,y)

In [None]:
#gs.best_params_

In [None]:
lr_gs_pipe = Pipeline(steps=[
    ('ct', CT),
    ('lr', LogisticRegression(C = .1,
                               class_weight = None,
                               max_iter=100,
                               penalty = 'l2',
                               tol = .0001))
])

In [None]:
lr_gs_pipe.fit(X_train,y_train)

In [None]:
print(classification_report(y_train, lr_gs_pipe.predict(X_train)))

In [None]:
lr_coefs = lr_gs_pipe.named_steps['lr'].coef_

In [None]:
lr_coefs

In [None]:
X_transformed = CT.fit_transform(X_train)

In [None]:
X_transformed

In [None]:
feature_names = CT.named_transformers_['onehot'].get_feature_names_out(input_features=ohecols)

In [None]:
postohecols = feature_names.tolist()

In [None]:
allcols = postohecols + numcols

In [None]:
lr_feature_importance_df = pd.DataFrame(lr_coefs, columns = allcols)

In [None]:
lr_feature_importance_df

In [None]:
second_row = (lr_feature_importance_df.iloc[0] - 1) * 100

In [None]:
lr_feature_importance_df = lr_feature_importance_df.append(second_row,ignore_index = True)

In [None]:
lr_feature_importance_df

# Ideas for next time: make features with gender and class; make feature with age and fare; look into the sibling and parent stuff more; build a stack of models