In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

import shap
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training file. It is the only data used to train and
# evaluate the models below (a hold-out part is split off from it later).
titanic = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
# (the competition test file is loaded in the next cell)

In [None]:
# Load the unlabelled competition test file under its own name.
# A later cell assigns the hold-out split to `titanic_test`, which would
# otherwise silently overwrite this frame.
titanic_kaggle_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
titanic_kaggle_test.shape

In [None]:
# Load the sample submission file and show its (rows, columns) shape
sub = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')
sub.shape

In [None]:
# Preview the first rows of the raw training data
titanic.head()

In [None]:
# Remove the identifier / free-text columns that are not used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError on columns that
# were already removed.
titanic = titanic.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')
titanic.head()

In [None]:
# Summary statistics of the remaining columns
titanic.describe()

In [None]:
# Histograms (50 bins each) to inspect the distribution of every plotted column
titanic.hist(bins=50, figsize=(15,10))
plt.show()

In [None]:
# Per column: True if it contains at least one missing value
titanic.isna().any()

In [None]:
# Bar chart of the number of rows per value of the target 'Survived'
ax = sns.countplot(x="Survived", data=titanic)

Problems:

- We have some missing data in the columns 'Age', 'Fare' and 'Embarked'
- The values are not normally distributed 
- The classes are slightly imbalanced 

In [None]:
# Split off a 20% hold-out set with StratifiedShuffleSplit so that the train
# and test parts keep the same class ratio of 'Survived'.
titanic_labels = pd.DataFrame(titanic['Survived']).copy()
titanic.drop(['Survived'], axis=1, inplace=True)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(titanic, titanic_labels))

# split() returns positional indices, so rows are selected with .iloc.
# .loc would interpret the numbers as index labels, which only gives the same
# rows while the frame still has its default 0..n-1 index.
titanic_train, titanic_test = titanic.iloc[train_index], titanic.iloc[test_index]
titanic_labels_train, titanic_labels_test = titanic_labels.iloc[train_index], titanic_labels.iloc[test_index]

# Sanity check: the class proportions should be (almost) identical in both parts.
print('Training set:\n', titanic_labels_train['Survived'].value_counts()/len(titanic_labels_train['Survived']))
print('Test set:\n', titanic_labels_test['Survived'].value_counts()/len(titanic_labels_test['Survived']))

In [None]:
# Add Jack and Rose to the hold-out test set (not the training set), so the
# model is never trained on them and their predictions can be explained later.
# Note: this cell appends to titanic_test / titanic_labels_test, so running it
# twice appends the two rows twice.
jack = {
    'Survived': 0,
    'Pclass': 3,
    'Sex': 'male',
    'Age': 20,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 7.8,
    'Embarked': 'S'
       }
jack_pd = pd.DataFrame(data=jack, index=[0])

rose = {
    'Survived': 1,
    'Pclass': 1,
    'Sex': 'female',
    'Age': 17,
    'SibSp': 1, # fiance
    'Parch': 1, # mother
    'Fare': 280,
    'Embarked': 'S'
       }
rose_pd = pd.DataFrame(data=rose, index=[0])

# Jack first, Rose second: later cells rely on them being the last two rows.
jack_and_rose = pd.concat([jack_pd, rose_pd], ignore_index=True)
jack_and_rose_labels = pd.DataFrame(jack_and_rose['Survived']).copy()

# 'Survived' was already removed from the feature frame, so drop it here too.
# Otherwise the concat adds a label column (NaN for every other passenger) to
# the test features.
jack_and_rose_features = jack_and_rose.drop(columns=['Survived'])

titanic_test = pd.concat([titanic_test, jack_and_rose_features], ignore_index=True)
titanic_labels_test = pd.concat([titanic_labels_test, jack_and_rose_labels], ignore_index=True)

In [None]:
# The last two rows should now be Jack and Rose
titanic_test.tail()

In [None]:
# Matching labels: the last two rows are the labels given to Jack (0) and Rose (1)
titanic_labels_test.tail()

In [None]:
# Heatmap of absolute pairwise correlations (the sign is ignored on purpose).
# Only numeric columns are correlated: 'Sex' and 'Embarked' hold strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
ax = sns.heatmap(
    titanic_train.select_dtypes(include='number').corr().abs(),
    vmin=0, vmax=1, annot=True, linewidths=.5, cmap="coolwarm",
)

In [None]:
# Pairwise scatter plots of the training features, coloured by the 'Survived' label
# (the labels are joined back column-wise only for plotting)
ax = sns.pairplot(data=pd.concat([titanic_train, titanic_labels_train], axis=1), hue='Survived', corner=True, plot_kws={"s": 100})

In [None]:
# Create separate preprocessing pipelines for numerical and categorical
# features and combine them in a single ColumnTransformer.

num_cols = ['Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Pclass', 'Sex', 'Embarked']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), # impute missing values
    ('std_scaler', StandardScaler()) # standardize (zero mean, unit variance)
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), # impute missing values
    ('one_hot', OneHotEncoder(drop='if_binary')) # categorical -> numerical
])

# Columns not listed in num_cols / cat_cols are dropped by the transformer.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Fit the preprocessing on the training data only. The test set is transformed
# with the statistics learned from training (medians, modes, scaling, one-hot
# categories). Refitting on the test set would leak test information and put
# the two matrices on different scales.
titanic_train_prep = full_pipeline.fit_transform(titanic_train)
titanic_test_prep = full_pipeline.transform(titanic_test)

titanic_labels_train_prep = titanic_labels_train['Survived'].to_numpy(dtype=int)
titanic_labels_test_prep = titanic_labels_test['Survived'].to_numpy(dtype=int)

In [None]:
# Column names of the transformed matrix: the numeric columns come first and
# keep their names, followed by the names generated by the fitted one-hot encoder.
one_hot_names = (
    full_pipeline.named_transformers_['cat']
    .named_steps['one_hot']
    .get_feature_names_out(cat_cols)
)
feature_names = [*num_cols, *one_hot_names]

### Kaggle leaderboard top accuracy stands at 0.81328 - Let's beat it!

#### Training a single decision tree

In [None]:
%%time
tree_clf = DecisionTreeClassifier(max_leaf_nodes=5, random_state=42)
tree_clf.fit(titanic_train_prep, titanic_labels_train_prep)
tree_preds = tree_clf.predict(titanic_test_prep)

In [None]:
# Accuracy is reported only to stay consistent with the Kaggle competition metric;
# it is not necessarily the best metric for classification problems.
tree_acr = accuracy_score(y_true=titanic_labels_test_prep, y_pred=tree_preds)
print('Tree accuracy:', tree_acr)

#### Training XGBoost

In [None]:
# Wrap the preprocessed features and labels in XGBoost's DMatrix containers.
dtrain = xgb.DMatrix(data=titanic_train_prep, label=titanic_labels_train_prep)
dtest = xgb.DMatrix(data=titanic_test_prep, label=titanic_labels_test_prep)

In [None]:
%%time
# binary classification binary:hinge and "eval_metric": "error"
param = {"max_depth": 6, "eta": 1, "objective": "binary:hinge", "eval_metric": "error"}

# train
bst = xgb.train(param, dtrain)

# run prediction
xgb_preds = bst.predict(dtest)

In [None]:
# Same metric as for the decision tree, so the two models can be compared directly.
xgb_acr = accuracy_score(y_true=titanic_labels_test_prep, y_pred=xgb_preds)
print('XGBoost accuracy:', xgb_acr)

#### TODO
- Install MLflow
- Train a few parameters to improve performance, while explaining what they are
- Show comparison of all models trained
- What else could we do?

In [None]:
# Put the preprocessed training matrix back into a DataFrame with readable
# column names for SHAP. feature_names is passed directly: wrapping it in
# another list creates MultiIndex columns, so every feature label becomes a
# 1-tuple instead of a plain string.
features_pd = pd.DataFrame(titanic_train_prep, columns=feature_names)
features_pd.head()

In [None]:
# Build a SHAP tree explainer for the trained booster and compute the SHAP
# values for every row of the preprocessed training features
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(features_pd)

In [None]:
# Bar-type summary plot of the training SHAP values, one bar per feature
shap.summary_plot(shap_values, features_pd, plot_type='bar')

In [None]:
# Default summary plot of the same SHAP values (interpreted in the text below)
shap.summary_plot(shap_values, features_pd)

**What does the plot show?**
- A positive SHAP value means a positive impact on the prediction, pushing the model towards predicting 1 (e.g. the passenger survived the Titanic). A negative SHAP value means a negative impact, pushing the model towards predicting 0 (e.g. the passenger didn’t survive the Titanic).
- In this chart, the x-axis shows the SHAP value and the y-axis lists all the features. Each point on the chart is one SHAP value for one prediction and one feature. Red means a higher value of the feature; blue means a lower value. We can get a general sense of the direction of each feature’s impact from the distribution of the red and blue dots.

**What can we conclude?**
- A higher value of "Sex_male" (being male) leads to a lower chance of survival
- A higher value of "Pclass_3" (travelling in 3rd class) leads to a lower chance of survival as well
- A higher value of "Parch" (having more parents/children aboard) leads to mixed results, but for some passengers it means a lower chance of survival
- A lower value of "Embarked_S" (did not embark in Southampton) leads to a higher chance of survival
- A lower value of "Fare" leads to a lower chance of survival
- etc...

In [None]:
# Show the last rows of the preprocessed training features
features_pd.tail()

In [None]:
# Initialise SHAP's JavaScript visualisation support for this notebook
shap.initjs()
# Force plot explaining the prediction for the first training row (position 0).
# NOTE(review): the booster was trained with the "binary:hinge" objective, so
# it is unclear whether link="logit" gives a meaningful scale here - confirm.
shap.force_plot(explainer.expected_value, shap_values[0,:], features_pd.iloc[0,:], link="logit")

- sex = male has a negative (blue) impact on survival
- parch has a positive (red) impact on survival

In [None]:
# Force plot explaining the prediction for the training row at position 1000
shap.force_plot(explainer.expected_value, shap_values[1000,:], features_pd.iloc[1000,:], link="logit")

- sex = female has a positive (red) impact on survival
- not being in third class has a positive impact on survival
- not having embarked in Cherbourg has a negative impact on survival

In [None]:
# Same as for the training features: wrap the preprocessed test matrix in a
# DataFrame with plain string column names. Passing [feature_names] would
# create MultiIndex columns whose labels are 1-tuples. Then compute the SHAP
# values for every test row.
features_test_pd = pd.DataFrame(titanic_test_prep, columns=feature_names)
shap_values_test = explainer.shap_values(features_test_pd)

In [None]:
# jack
# Jack and Rose were appended (in that order) at the end of the test set,
# so Jack is the second-to-last row
jack_id = len(shap_values_test)-2
features_test_pd.iloc[jack_id,:]

In [None]:
# Force plot explaining the model's prediction for Jack
shap.force_plot(explainer.expected_value, shap_values_test[jack_id,:], features_test_pd.iloc[jack_id,:], link="logit")

- Being male: -
- Low shap for parch: + ?

In [None]:
# rose
# Rose was appended after Jack, so she is the last row of the test set
rose_id = len(shap_values_test)-1
features_test_pd.iloc[rose_id,:]

In [None]:
# Force plot explaining the model's prediction for Rose
shap.force_plot(explainer.expected_value, shap_values_test[rose_id,:], features_test_pd.iloc[rose_id,:], link="logit")

- Being female: +
- High fare: +
- Not on third class: +