# Imports

In [None]:
# The FutureWarning filter is installed ahead of the remaining imports below,
# presumably so that warnings emitted while those packages are imported are
# hidden as well -- keep this pair of lines first in the cell.
from warnings import simplefilter # function that registers a simple warnings-filter rule
simplefilter(action='ignore', category=FutureWarning) # silence every FutureWarning from here on

from cdtools.util.pandas_dataframe_operations import compare_dataframes, impute_dataframe
from cdtools.CD_tools import CDML, compare_binary_columns, df_2_xlsx, df_balance, high_correlation_filter, PCA_analyse, voting_classifier, zero_variance_columns
from cdtools.dataprocessing.feature_engineering import get_feature_lists
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Configuration

In [None]:
# Show figures inside the notebook
%matplotlib inline

# Show all columns inside the dataframe
pd.options.display.max_columns = None

# Set default figure size of figures in the notebook
plt.rcParams['figure.figsize'] = [16, 8]

# Set fonts
plt.rcParams['font.family'] = 'DeJavu Serif'
plt.rcParams['font.serif'] = ['Times New Roman']

In [None]:
# Input and output folders for this dataset, relative to the notebook location.
# Both keep their trailing slash because file names are appended to them directly.
dataset_name = "titanic"
input_data_directory = f"../data/input/{dataset_name}/"
output_data_directory = f"../data/output/{dataset_name}/"

# Read data

In [None]:
# Schema of the columns used in this notebook: column name -> dtype passed to
# pandas.read_csv. Name, Ticket and Cabin are commented out and therefore not
# part of the mapping.
column_data_types = {
    "PassengerId": str,  # Passenger identifier.
    #"Name": str, # Passenger name.
    "Sex": str,  # Gender.
    "Age": float,  # Age in years.
    "SibSp": int,  # Number of siblings / spouses aboard the Titanic.
    "Parch": int,  # Number of parents / children aboard the Titanic.
    "Embarked": str,  # Port of embarkation.
    #"Ticket": str, # Ticket number.
    "Pclass": str,  # Ticket class.
    #"Cabin": str, # Cabin number.
    "Fare": float,  # Passenger fare.
    "Survived": bool,  # Survival indicator.
}

# Identifier and target columns; the feature lists are derived from the schema
# by get_feature_lists, given the key and label columns.
keys = ["PassengerId"]
labels = ["Survived"]
(
    features,
    features_categorical,
    features_numeric,
    features_boolean,
) = get_feature_lists(column_data_types, keys, labels)

# Handed to CDML further down.
# NOTE(review): these look like one class each of Sex, Embarked and Pclass,
# presumably the reference categories left out when encoding -- confirm.
features_categorical_classes_2_drop = ["male", "S", "3"]

# Training set: keys, features and the label column.
train_columns = keys + features + labels
df_train_data = pd.read_csv(
    f"{input_data_directory}train.csv",
    dtype=column_data_types,
)[train_columns]

# Test set: same schema, but without the label column.
test_columns = keys + features
df_test_data = pd.read_csv(
    f"{input_data_directory}test.csv",
    dtype=column_data_types,
)[test_columns]

# Data preprocessing

## Impute missing values

In [None]:
# Training set: show the per-column count of missing values, impute, show it again.
train_missing_before = df_train_data.isna().sum()
print("Number of missing values per column before imputation.")
display(train_missing_before)

df_train_data = impute_dataframe(df_train_data)

train_missing_after = df_train_data.isna().sum()
print("Number of missing values per column after imputation.")
display(train_missing_after)

In [None]:
# Test set: show the per-column count of missing values, impute, show it again.
test_missing_before = df_test_data.isna().sum()
print("Number of missing values per column before imputation.")
display(test_missing_before)

df_test_data = impute_dataframe(df_test_data)

test_missing_after = df_test_data.isna().sum()
print("Number of missing values per column after imputation.")
display(test_missing_after)

# Build a classification model with default settings

In [None]:
# Wrap the imputed training data in a CDML object and split it.
rc = CDML(
    df_train_data,
    column_data_types,
    keys,
    labels,
    features_categorical_classes_2_drop=features_categorical_classes_2_drop,
)
rc.split_data(test_size=0.1, random_state=0, sampling=None)

# Baseline random forest; the estimator is handed over as a string, not as an object.
rc.classification_model_data(
    "RandomForestClassifier(random_state=0,n_estimators=100)",
    threshold=0.5,
)

# Diagnostics: learning curve, ROC / precision-recall curves, prediction distributions.
rc.show_learning_curve(
    ylim=(0, 1.1),
    cv=StratifiedKFold(12),
    n_jobs=4,
    train_sizes=np.linspace(0.3, 1.0, 10),
    scoring="roc_auc",
)
rc.classification_show_ROC_precision_recall_curves(
    show_labels="Y",
    label_interval=6,
    label_offsets_ROC=[15, -20],
    label_offsets_PR=[-30, -30],
)
rc.classification_show_prediction_distributions(nrbins=51)

# Feature importance and interpretation tables (regular and LIME variants).
rc.save_feature_importance()
rc.classification_show_interpretation_table(sort_columns="Y", top=5)
rc.classification_transpose_interpretation_table()
rc.classification_show_interpretation_table_LIME(top=5, num_samples=5000)
rc.classification_transpose_interpretation_table_LIME()

# Optimize the classification model using hyperparameter tuning

In [None]:
# Hyperparameter search space for the random forest.
# Entries with several candidates are varied by the search; entries with a
# single candidate are held fixed at that value.
random_grid = {
    "n_estimators": [10, 50, 100, 150, 200, 250, 500],
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 10, 50, 100, 500],
    "min_samples_leaf": [1, 10, 50, 100, 500],
    "min_weight_fraction_leaf": [0.0],  # fixed
    "max_features": ["sqrt", "log2", None],
    "max_leaf_nodes": [None],  # fixed
    "min_impurity_decrease": [0.0],  # fixed
    "bootstrap": [True],  # fixed
    "oob_score": [False],  # fixed
    "n_jobs": [-1],  # fixed
    "random_state": [0],  # fixed
    "verbose": [0],  # fixed
    "warm_start": [False],  # fixed
    "class_weight": [None],  # fixed
    "ccp_alpha": [0.0],  # fixed
    "max_samples": [None],  # fixed
}

# Start from a fresh CDML object with the same split as before and fit a
# random forest with default settings.
rc = CDML(
    df_train_data,
    column_data_types,
    keys,
    labels,
    features_categorical_classes_2_drop=features_categorical_classes_2_drop,
)
rc.split_data(test_size=0.1, random_state=0, sampling=None)
rc.classification_model_data("RandomForestClassifier(random_state=0)", threshold=0.5)

# Randomized search over random_grid; keep the best estimator it found.
rc.RandomizedSearchCV(random_grid, n_iter=100)
random_search = rc.model.randomized_search_CV.random_search
best_estimator = random_search.best_estimator_
print("Best estimator:")
print(best_estimator)
print()

# Refit with the best hyperparameters; the estimator is passed on as its
# string representation, matching the string-based call used above.
rc.classification_model_data(str(best_estimator), threshold=0.5)

# Diagnostics: learning curve, ROC / precision-recall curves, prediction distributions.
rc.show_learning_curve(
    ylim=(0, 1.1),
    cv=StratifiedKFold(12),
    n_jobs=4,
    train_sizes=np.linspace(0.3, 1.0, 10),
    scoring="roc_auc",
)
rc.classification_show_ROC_precision_recall_curves(
    show_labels="Y",
    label_interval=6,
    label_offsets_ROC=[15, -20],
    label_offsets_PR=[-30, -30],
)
rc.classification_show_prediction_distributions(nrbins=51)

# Feature importance and interpretation tables (regular and LIME variants).
rc.save_feature_importance()
rc.classification_show_interpretation_table(sort_columns="Y", top=5)
rc.classification_transpose_interpretation_table()
rc.classification_show_interpretation_table_LIME(top=5, num_samples=5000)
rc.classification_transpose_interpretation_table_LIME()