In [21]:
import pandas as pd
import numpy as np
from scipy import stats
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import re

In [4]:
# Import Titanic Dataset
data_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(data_url)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


From the info we can get the shape of the data (891, 12) and the datatypes which we can categorize as categorical and numerical. Note some of these could be of different types but a NaN will result in a different datatype.

In [7]:
# Create list of numerical features for visualization
numerical = ['Pclass','SibSp','Parch','Fare', 'Age', 'Survived']

In [8]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

We can see we have quite a few missing values, especially in comparasion to the size of our dataset. Before properly dealing with this, I'm going to impute the missing values with the median. This is so we have a nice copy of the dataframe for visualization. I will then run the original dataset through a pipeline where a more robust imputer will be applied. 

In [9]:
df_visual = df.copy()
df_visual[numerical] = df_visual[numerical].fillna(df[numerical].median())
df_visual.isna().sum()

df_visual

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [10]:
fig = make_subplots(rows=2, cols=3, subplot_titles=numerical)

for i, col in enumerate(numerical):
    fig.add_trace(
        go.Histogram(x=df_visual[col], name=col), 
        row=(i//3)+1, col=(i%3)+1
    )

fig.show()

We can see the count breaksdowns of our numerical features. Some notable features are most people landing in 'Pclass: 3', The ages falling mostly in the 20-40 category. To visualize our categorical features we need to do some feature extraction and other preprocessing.

In [11]:
# Use label encoder on Cabin, Embarked and Sex

label_encoder = LabelEncoder()

df_visual['Embarked']=label_encoder.fit_transform(df_visual['Embarked'])
df_visual['Cabin']=label_encoder.fit_transform(df_visual['Cabin'])
df_visual['Sex']= label_encoder.fit_transform(df_visual['Sex'])

df_visual[['Cabin', 'Embarked', 'Sex']]

Unnamed: 0,Cabin,Embarked,Sex
0,147,2,1
1,81,0,0
2,147,2,0
3,55,2,0
4,147,2,1
...,...,...,...
886,147,2,1
887,30,2,0
888,147,2,0
889,60,0,1


In [12]:
# Extract Titles from Name then use label encoder
df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=True)
df_visual['Title'] = label_encoder.fit_transform(df['Title'])

# print(df[['Name', 'Title']])

We performed title extraction for the names, however deriving ticket features would be difficult so we are going to drop the column all together. 

In [13]:
df_visual.drop(['Name', 'Ticket'], axis=1, inplace=True)

In [14]:
df_visual

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,1,0,3,1,22.0,1,0,7.2500,147,2,12
1,2,1,1,0,38.0,1,0,71.2833,81,0,13
2,3,1,3,0,26.0,0,0,7.9250,147,2,9
3,4,1,1,0,35.0,1,0,53.1000,55,2,13
4,5,0,3,1,35.0,0,0,8.0500,147,2,12
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.0,0,0,13.0000,147,2,15
887,888,1,1,0,19.0,0,0,30.0000,30,2,9
888,889,0,3,0,28.0,1,2,23.4500,147,2,9
889,890,1,1,1,26.0,0,0,30.0000,60,0,12


In [19]:
numerical.append('PassengerId')
remaining_cols = [col for col in df_visual.columns if col not in numerical]

fig = make_subplots(rows=2, cols=2, subplot_titles=remaining_cols)

for i, col in enumerate(remaining_cols):
    fig.add_trace(
        go.Histogram(x=df_visual[col], name=col), 
        row=(i//2)+1, col=(i%2)+1
    )

fig.show()

For Cabin we can assume the most frequent is the NaN value.

In [22]:
correlation = df_visual.corr()

fig = px.imshow(correlation, x=correlation.columns, y=correlation.columns, color_continuous_scale='RdBu')
fig.show()


We can see that Sex, Pclass, and Survived have some correlation with Survived. They are not highly correlated which means we don't need to filter them out for multicolinearity.

In [23]:
df_visual.drop(['PassengerId'], axis=1, inplace=True)

In [26]:
z_scores = stats.zscore(df_visual)

outliers = (z_scores > 3).any(axis=1)

print(f"Number of outliers: {outliers.sum()}")

Number of outliers: 71


This outlier size is large compared to our dataset. I beleive the best move would to leave the outliers in as the nature of them isn't mistakes but rather natural.

In [16]:
desired_columns = ['Sex', 'Embarked', 'Cabin', 'Title']
remaining_columns = [col for col in df.columns if col not in desired_columns]
remaining_columns.remove('Survived')
remaining_columns.remove('Name')
remaining_columns.remove('Ticket')

remaining_columns


['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

In [17]:
# Y, X = df['Survived'], df.drop(columns=['Survived', 'Name', 'Ticket'])
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# X_train

# tuning_params = {
#     'pca_transformer__pca_transform__pca__n_components': [5, 10, 15],
#     'gb__learning_rate': [0.001, 0.01, 0.1, 0.2],  
#     'gb__max_depth': [3, 4, 5, 6],  
# }


# encoded_pca_pipeline = Pipeline(steps=[
#     ('encoder', OneHotEncoder(handle_unknown='ignore')),
#     ('pca', TruncatedSVD()),
#     ('imputer', IterativeImputer())
# ])

# pca_transformer = ColumnTransformer(
#     transformers=[
#         ('pca_transform', encoded_pca_pipeline, desired_columns),
#         ('imputer', IterativeImputer(), remaining_columns),
#         ('scaler', MinMaxScaler(), remaining_columns),
#     ],
#     remainder="passthrough"
# )

# pipeline = Pipeline(steps=[
#     ('pca_transformer', pca_transformer),
#     ('gb', GradientBoostingClassifier())
# ])

# clf = GridSearchCV(pipeline, tuning_params, cv=5, verbose=0)

# best_model = clf.fit(X_train, Y_train)

# Y_pred = best_model.predict(X_test)

# Y_proba = best_model.predict_proba(X_test)[:, 1]

# accuracy = accuracy_score(Y_test, Y_pred)
# print(f"Accuracy: {accuracy}")

# precision = precision_score(Y_test, Y_pred)
# print(f"Precision: {precision}")

# recall = recall_score(Y_test, Y_pred)
# print(f"Recall: {recall}")

# f1 = f1_score(Y_test, Y_pred)
# print(f"F1 Score: {f1}")

# roc_auc = roc_auc_score(Y_test, Y_proba)
# print(f"ROC-AUC Score: {roc_auc}")


From the best model (Ran on HPC) we can transform the original dataset.

In [18]:
# preprocessing_pipeline = best_model.best_estimator_.named_steps['pca_transformer']

# X_new_transformed = pd.DataFrame(preprocessing_pipeline.transform(df))

# X_new_transformed.to_csv('titanic_processed.csv', index=False)