In [None]:
import gc
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

### 1. Data preparation

In [None]:
# Load the labelled training set from the local data directory.
df = pd.read_csv(
    './data/train.csv',
    encoding='utf-8',
)

In [None]:
# df.head()
# df.isnull().sum()

In [None]:
# Share of missing values per column, expressed in percent.
df.isna().mean().mul(100)

In [None]:
# Feature matrix: remove the target and the columns treated as insignificant.
X = df.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'])
# Target to predict, kept as a one-column DataFrame.
y = df.loc[:, ['Survived']]

In [None]:
# Replace missing ages with the median age over all rows of X.
X = X.fillna(value={'Age': X['Age'].median()})

# male_age_median = X[X['Sex'] == 'male'].Age.median()
# female_age_median = X[X['Sex'] == 'female'].Age.median()

# X = pd.concat([
#     X.loc[X['Sex'] == 'male'].fillna({'Age': male_age_median}),
#     X.loc[X['Sex'] == 'female'].fillna({'Age': female_age_median})
# ])

In [None]:
# X.shape
# X.isna().mean() * 100

In [None]:
# One-hot encode the categorical columns of the feature matrix.
X = pd.get_dummies(data=X)

# Label encoding (only if we do not expect any new values for any categorical variable)
#

In [None]:
# # cleanup
# del male_age_median
# del female_age_median
# gc.collect()

### 2.1 Decision Tree Classifier (best depth based on `cross_val_score`)

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Candidate tree depths for the sweep below: 1 through 99 inclusive.
max_depth_values = range(1, 100)

In [None]:
# Placeholder for the per-depth score table populated by the sweep below.
scores_data = pd.DataFrame()

In [None]:
# Sweep the candidate depths: for each one fit an entropy-based tree on the
# training split and record train accuracy, hold-out accuracy and the mean
# 5-fold cross-validation accuracy on the training split.
score_records = []
for max_depth in max_depth_values:
    # Fixed seed: the tree permutes features at every split, so unseeded runs
    # can break ties differently and give different scores on re-execution.
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth,
                                      random_state=42)
    clf.fit(X_train, y_train)

    score_records.append({
        'max_depth': max_depth,
        'train_score': clf.score(X_train, y_train),
        'test_score': clf.score(X_test, y_test),
        # cross_val_score works on clones of clf, so the fit above is not reused.
        'cross_val_score': cross_val_score(clf, X_train, y_train, cv=5).mean(),
    })

# Build the frame once: DataFrame.append was removed in pandas 2.0, and growing
# a frame row by row copies it on every iteration. The result has one row per
# depth with a regular 0..n-1 index.
scores_data = pd.DataFrame(score_records)

In [None]:
# scores_data_melt = pd.melt(scores_data, id_vars=['max_depth'],
#                            value_vars=['train_score', 'test_score', 'cross_val_score'],
#                            var_name='set_type', value_name='score')

# # sns.set(rc={'figure.figsize':(11.7,8.27)})
# # fig, ax = plt.subplots(1)
# fig = plt.figure(figsize=(12,4))
# ax = fig.add_subplot()
# sns.lineplot(x='max_depth', y='score', hue='set_type', data=scores_data_melt, ax=ax)

In [None]:
# Train, hold-out and cross-validation scores as a function of tree depth.
fig, ax = plt.subplots(figsize=(12, 4))
scores_data.plot(
    x='max_depth',
    y=['train_score', 'test_score', 'cross_val_score'],
    ax=ax,
    legend=True,
)

In [None]:
# Five depths with the highest mean cross-validation score.
scores_data.sort_values('cross_val_score', ascending=False).iloc[:5]
# scores_data_melt.query("set_type == 'cross_val_score'").sort_values(by="score", ascending=False).head()

In [None]:
# Tree with the depth chosen for this section.
# NOTE(review): max_depth=6 is hard-coded — presumably read off the ranking
# above; re-check it whenever the sweep is re-run.
# random_state pins the per-split feature permutation so the fit is repeatable.
best_clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6,
                                       random_state=42)

In [None]:
# Fit the depth-6 entropy tree on the training split.
best_clf.fit(X_train, y_train)

In [None]:
# Report the hold-out accuracy of the fitted tree together with a 5-fold
# cross-validation estimate for the same model configuration.
# Fix: cross-validate `best_clf`; the previous code passed `clf`, i.e. the last
# tree left over from the depth sweep rather than the selected model.
# NOTE(review): the folds are drawn from the test split — confirm that
# X_test/y_test (rather than the training split) is intended here.
holdout_score = best_clf.score(X_test, y_test)
cv_score = cross_val_score(best_clf, X_test, y_test, cv=5).mean()
print(
    'Score:            ' + str(holdout_score) + "\n" +
    'Cross-val. score: ' + str(cv_score)
)

### 2.2 Decision Tree Classifier (cross-validated grid-search over a parameter grid)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Base estimator for the grid search. The fixed seed pins the per-split feature
# permutation, so repeated runs of the search select the same parameters.
clf = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Search grid: both split criteria crossed with depths 1 through 29.
params = dict(criterion=['gini', 'entropy'], max_depth=range(1, 30))

In [None]:
# 5-fold cross-validated search over every combination in `params`.
gsc_clf = GridSearchCV(estimator=clf, param_grid=params, cv=5)
# grid_search_cv_cls.get_params()

In [None]:
# Run the grid search on the training split.
gsc_clf.fit(X_train, y_train)

In [None]:
# Parameter combination (criterion, max_depth) selected by the grid search.
gsc_clf.best_params_

In [None]:
# Use the estimator selected by the grid search for the evaluation below.
best_clf = gsc_clf.best_estimator_

In [None]:
# Accuracy, precision and recall of the selected tree on the hold-out split.
y_pred = best_clf.predict(X_test)

# The trailing '\n' reproduces the blank line printed after the last metric.
print('\n'.join([
    'accuracy  : ' + str(best_clf.score(X_test, y_test)),
    'precision : ' + str(precision_score(y_test, y_pred)),
    'recall    : ' + str(recall_score(y_test, y_pred)),
]) + '\n')

Trade-off between recall and precision

In [None]:
# Predicted class probabilities on the hold-out split; column 1 is treated as
# the survival probability.
y_pred_prob = best_clf.predict_proba(X_test)

# Histogram of the predicted probabilities in column 1.
pd.Series(y_pred_prob[:,1]).hist()

# pd.DataFrame(
#     {'Class_0': y_pred_prob[:,0], 'Class_1': y_pred_prob[:,1]}
# ).head()

In [None]:
# Distinct predicted probabilities in column 1, in ascending order.
np.unique(y_pred_prob[:, 1])

In [None]:
# Heuristic: count a passenger as survived only when the predicted
# probability in column 1 is at least 0.8.
y_pred_heu = (y_pred_prob[:, 1] >= 0.8).astype(int)

In [None]:
# Change in precision and recall when the 0.8 cut-off replaces the
# classifier's own predictions.
for label, metric in (('Precision delta : ', precision_score),
                      ('Recall delta    : ', recall_score)):
    print(label + str(metric(y_test, y_pred_heu) - metric(y_test, y_pred)))

### 2.3 ROC, AUC

In [None]:
from sklearn.metrics import roc_curve, auc
# ROC curve and its area, computed from the predicted probabilities in column 1.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob[:, 1])
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange',
            label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver operating characteristic example')
roc_ax.legend(loc="lower right")
plt.show()

### 2.4 Visualize decision tree

In [None]:
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
# from IPython.display import HTML
# style = "<style>svg{width:70% !important;height:70%; !important;}</style>"
# HTML(style)
# Export the selected tree to DOT text, then render it inline as SVG.
dot_source = tree.export_graphviz(
    best_clf,
    out_file=None,
    feature_names=X.columns.tolist(),
    class_names=['Negative', 'Positive'],
    filled=True,
)
graph = Source(dot_source)
display(SVG(graph.pipe(format='svg')))

In [None]:
# tree.plot_tree(best_clf, feature_names=list(X),
#                class_names=['Negative', 'Positive'],
#                filled=True, max_depth=6, fontsize=10);

### 3. Random Forest Classifier

#### 3.1 Simple tree

In [None]:
# Deliberately small tree: depth at most 3, at least 100 samples to split a
# node and at least 10 samples per leaf.
# random_state pins the per-split feature permutation so the fitted (and
# plotted) tree is the same on every run.
clf = tree.DecisionTreeClassifier(criterion='entropy',
                                  max_depth=3,
                                  min_samples_split=100,
                                  min_samples_leaf=10,
                                  random_state=42)

In [None]:
# Fit the small tree on the training split.
clf.fit(X_train, y_train)

In [None]:
# graph = Source(tree.export_graphviz(clf, out_file=None,
#                                    feature_names=list(X),
#                                    class_names=['Died', 'Survived'],
#                                    filled = True))
# display(SVG(graph.pipe(format='svg')))

# Draw the small tree on a large canvas so the node labels stay readable.
plt.figure(figsize=(40, 20), dpi=80)
p = tree.plot_tree(
    clf,
    feature_names=X.columns.tolist(),
    class_names=['Died', 'Survived'],
    filled=True,
    fontsize=30,
)

#### 3.2 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base forest for the grid search. The fixed seed makes the forest's random
# sampling, and therefore the search result, repeatable across runs.
clf_rf = RandomForestClassifier(random_state=42)
# Grid: number of trees crossed with maximum tree depth (12 combinations).
parametrs = {'n_estimators': [10, 20, 30], 'max_depth': [2, 5, 7, 10]}

In [None]:
# 5-fold cross-validated search over the forest parameter grid.
grid_search_cv_clf = GridSearchCV(estimator=clf_rf, param_grid=parametrs, cv=5)

In [None]:
# Run the search; the target is passed as the 1-D 'Survived' column because
# y_train is a one-column DataFrame.
grid_search_cv_clf.fit(X_train, y_train['Survived'])

In [None]:
# Parameter combination (n_estimators, max_depth) selected by the search.
grid_search_cv_clf.best_params_

In [None]:
# From here on `best_clf` refers to the forest selected by the search.
best_clf = grid_search_cv_clf.best_estimator_

In [None]:
# Score of the selected forest on the hold-out split.
best_clf.score(X_test, y_test)

In [None]:
# Raw importance values, one per feature column.
best_clf.feature_importances_

In [None]:
# Pair each training column with its importance and list them from most to
# least important.
feature_importances = best_clf.feature_importances_

feature_importances_df = pd.DataFrame({
    'features': X_train.columns.tolist(),
    'feature_importances': feature_importances,
})

feature_importances_df.sort_values(by='feature_importances', ascending=False)

In [None]:
# Pie chart of the feature importances; every wedge is offset by 0.1.
feature_importances_df.plot.pie(
    y='feature_importances',
    labels=feature_importances_df['features'],
    explode=[0.1] * X_train.shape[1],
    autopct='%1.1f%%',
    shadow=True,
    legend=False,
    figsize=(8, 8),
);