# Test for titanic dataset

## Import lib

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import sklearn

import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas_profiling

In [None]:
%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Read the train and test CSV files in one pass; order matters for unpacking.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/titanic/train.csv', '../data/titanic/test.csv')
)

In [None]:
# Build a pandas-profiling report from the training DataFrame and write it
# to titanic_pandas_profile.html (path is relative to the working directory).
profile = pandas_profiling.ProfileReport(df_train)
profile.to_file(output_file="titanic_pandas_profile.html")

In [None]:
# Bare expression as the cell's last line -- presumably so the notebook
# renders the report inline (TODO confirm against the notebook front-end).
profile

## Select columns

In [None]:
# Columns left out of the feature set, and the target column.
id_cols = ['PassengerId', 'Name', 'Cabin', 'Ticket']
y_cols = ['Survived']

# Everything else becomes a feature, keeping the DataFrame's column order.
excluded_cols = set(id_cols) | set(y_cols)
X_cols = [col for col in df_train.columns if col not in excluded_cols]

In [None]:
# Split the feature columns by dtype: 'object' columns are treated as
# categorical, every other feature column as numerical.
type_list = df_train[X_cols].dtypes
cat_cols = [col for col, dtype in type_list.items() if dtype == 'object']
num_cols = [col for col in X_cols if col not in cat_cols]

# Print both groups with the same banner layout.
for title, cols in (('Categorical', cat_cols), ('Numerical', num_cols)):
    print(50 * '-')
    print(title)
    print(cols)
    print('')

## Transform object data

In [None]:
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

### X encoding

In [None]:
# One LabelEncoder per key, created lazily the first time a key is looked up
# (keys are column names in the encoding cell that uses this mapping).
label_encoders = defaultdict(LabelEncoder)

In [None]:
# Fit one encoder per categorical column on the training data.
# Missing values are replaced by the literal string 'NAN' so they get their
# own integer code.
for col in cat_cols:
    df_train[col] = label_encoders[col].fit_transform(df_train[col].fillna('NAN'))

# Re-use the fitted encoders on the test set (transform only, no refit).
for col in cat_cols:
    df_test[col] = label_encoders[col].transform(df_test[col].fillna('NAN'))

### y label encoding

In [None]:
# Separate encoder registry for the target column(s), keyed by column name.
le_y = defaultdict(LabelEncoder)

# Fit and apply the encoder on each target column of the training set.
for target in y_cols:
    df_train[target] = le_y[target].fit_transform(df_train[target])

### Fill missing values

#### Get missing values

In [None]:
# Median of each numerical column, computed on the training set only.
fill_values = {col: df_train[col].median() for col in num_cols}

#### Replace missing values

In [None]:
# Apply the same per-column fill values (from the training set) to both frames.
df_train, df_test = (
    frame.fillna(value=fill_values) for frame in (df_train, df_test)
)

### Get numpy data

In [None]:
# Feature matrices: same column selection and order for train and test.
X, X_test = (frame[X_cols].values for frame in (df_train, df_test))

# Target array; selecting with the list y_cols keeps it two-dimensional.
y = df_train[y_cols].values

## Test sklearn

### Stratified shuffle split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Single stratified hold-out split: 20% of the rows go to validation,
# stratified on y, with a fixed seed for reproducibility.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# n_splits=1, so the splitter yields exactly one (train, valid) index pair.
train_index, valid_index = next(sss.split(X, y))
X_train, X_valid = X[train_index], X[valid_index]
y_train, y_valid = y[train_index], y[valid_index]

### CatBoost hyperoptimization

In [None]:
from catboost import CatBoostClassifier, Pool

from sklearn.metrics import roc_auc_score

In [None]:
# Candidate values for CatBoost's n_estimators, swept in increasing order.
nb_estimators = [2, 5, 10, 20, 50, 100, 200, 500, 1000]  #, 2000]

# Validation labels and the training pool are shared by every fit.
y_true = np.copy(y_valid)
train_data = Pool(data=X_train.tolist(), label=y_train.ravel().tolist())


def _fit_and_score(nb_esti):
    """Fit a CatBoostClassifier with n_estimators=nb_esti and score it.

    Returns a dict with the estimator count, the ROC AUC on the validation
    split and the fitted classifier.
    """
    clf = CatBoostClassifier(n_estimators=nb_esti, random_seed=42)
    clf.fit(train_data, plot=True, logging_level="Silent")

    # Column 1 of predict_proba is used as the score for roc_auc_score.
    y_scores = clf.predict_proba(X_valid)[:, 1]
    return {'nb_estimators': nb_esti, 'auc': roc_auc_score(y_true, y_scores), 'clf': clf}


records = []
for nb_esti in nb_estimators:
    model_record = _fit_and_score(nb_esti)
    records.append(model_record)
    auc = model_record['auc']

    print(25*"-")
    print(f"Nb_ estimator: {nb_esti}")
    print(f"AUC {auc}")
    print(25*"=")
    print("")

### Collect results

In [None]:
# One row per fitted model: columns come from the record dict keys
# ('nb_estimators', 'auc', 'clf').
df_records = pd.DataFrame(records)
# Bare expression so the notebook displays the table.
df_records

### Compare results

In [None]:
# Validation AUC against the number of estimators, on a log-scaled x axis.
fig, ax = plt.subplots()
ax.plot(df_records['nb_estimators'], df_records['auc'])
ax.set_xscale('log')
ax.set_xlabel('nb_estimators')
ax.set_ylabel('AUC')
ax.set_title('Hyperoptimization')
plt.show()

### Select best model

In [None]:
# Pick the record with the highest validation AUC (first one on ties).
max_auc = df_records['auc'].max()
# Use the full 'records' orient name: the short alias 'r' was deprecated and
# then removed in pandas 2.0, where it raises ValueError.
model_max = df_records[df_records['auc'] == max_auc].to_dict('records')[0]
clf_max = model_max['clf']
print(model_max)

### Apply best model

In [None]:
# Column index of predict_proba used as the score throughout the notebook.
positive_class = 1

# Scores on the validation split (reused for the threshold search).
y_proba_valid = clf_max.predict_proba(X_valid)[:, positive_class]

# Scores and hard predictions on the test set.
y_proba_test = clf_max.predict_proba(X_test)[:, positive_class]
y_predict_test = clf_max.predict(X_test)

#### Numpy

In [None]:
# Take the first encoder stored in le_y and map the integer predictions
# back to the original target labels.
target_encoder = list(le_y.values())[0]
inverse_y_test = target_encoder.inverse_transform(y_predict_test.astype(int))

#### Pandas

In [None]:
# Wrap the test predictions in a DataFrame labelled with the target column(s).
df_preds = pd.DataFrame(y_predict_test)
df_preds.columns = y_cols

# Decode each column with the encoder fitted for the column of the same name.
df_preds = pd.DataFrame({
    target: le_y[target].inverse_transform(df_preds[target].astype(int))
    for target in y_cols
})

### Compute F1-score for each threshold

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Work on a copy so the validation scores in y_proba_valid stay untouched.
y_proba = np.copy(y_proba_valid)

In [None]:
# Candidate cut-offs: 0, 0.001, ..., up to (but excluding) 1.
step = 1.e-3
thresholds = np.arange(0, 1, step)

preds = []
for threshold in thresholds:
    # Strictly-greater comparison: a score equal to the cut-off is labelled 0.
    y_pred = np.where(y_proba > threshold, 1.0, 0.0)
    f1 = f1_score(y_valid, y_pred)
    preds.append({'threshold': threshold, 'f1_score': f1})

In [None]:
# One row per threshold with its F1 score.
# NOTE: this rebinds df_preds, replacing the decoded test predictions that
# were stored under the same name earlier in the notebook.
df_preds = pd.DataFrame(preds)

### Compare thresholds

In [None]:
# F1 score on the validation split as a function of the decision threshold.
fig, ax = plt.subplots()
ax.plot(df_preds['threshold'], df_preds['f1_score'])
ax.set_xlabel('threshold')
ax.set_ylabel('f1_score')
ax.set_title('Threshold selection')
plt.show()

### Find optimal threshold for f1_score

In [None]:
# Pick the threshold with the highest F1 score (lowest threshold on ties,
# since rows are in increasing threshold order).
max_f1 = df_preds['f1_score'].max()
# Use the full 'records' orient name: the short alias 'r' was deprecated and
# then removed in pandas 2.0, where it raises ValueError.
cut_max = df_preds[df_preds['f1_score'] == max_f1].to_dict('records')[0]
print(cut_max)

In [None]:
# Same F1 curve, with the selected optimum highlighted by a large red dot.
fig, ax = plt.subplots()
ax.plot(cut_max['threshold'], cut_max['f1_score'], 'ro', ms=14)
ax.plot(df_preds['threshold'], df_preds['f1_score'])
ax.set_xlabel('threshold')
ax.set_ylabel('f1_score')
ax.set_title('Threshold selection')
plt.show()

# End of script