In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the Titanic CSV from the local datasets folder into a DataFrame.
titanic_csv_path = './datasets/classification/titanic.csv'
data = pd.read_csv(titanic_csv_path)

# Show the first ten rows as plain text.
preview_rows = data.head(10)
print(preview_rows)

In [None]:
# Summary statistics produced by DataFrame.describe(), printed as plain text.
summary_stats = data.describe()
print(summary_stats)

In [None]:
# ---- Feature typing and preprocessing definition ----
# target dataframe: data
target = "Survived"
features = list(data.columns.drop(target))

# Frame scored by the best model at the end of the notebook.
# NOTE: this is the same object as `data` (no copy), so it still contains the target column.
prediction_df = data

# Cast boolean features to integers so they are treated as numeric features.
bool_cols = data[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    data[bool_cols] = data[bool_cols].astype(int)

# Take the feature view AFTER the cast. Reading dtypes from a copy made before the cast
# would leave the former boolean columns out of every column list, and the
# ColumnTransformer would then drop them.
feature_df = data[features]

# get numerical, categorical and text columns from their dtypes
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()

# Cardinality is measured on at most 10000 rows; a column counts as "high cardinality"
# when it has more than min(100, sample_size / 10) distinct values.
sample_size = min(10000, data.shape[0])
unique_theshold = min(100, sample_size / 10)

# Distinct-value count per candidate column, computed once on a seeded sample so the
# categorical/text assignment is reproducible between runs.
sampled_cardinality = {
    col: data[col].sample(sample_size, random_state=42).nunique()
    for col in categorical_cols + text_cols
}

# Decide the moves first and rebuild the lists afterwards. Removing items from a list
# while iterating over it skips the element that follows each removed one.
high_cardinality_cols = [col for col in categorical_cols if sampled_cardinality[col] > unique_theshold]
low_cardinality_cols = [col for col in text_cols if sampled_cardinality[col] < unique_theshold]

# high-cardinality categorical columns become text columns, low-cardinality text columns
# become categorical columns
categorical_cols = [col for col in categorical_cols if col not in high_cardinality_cols] + low_cardinality_cols
text_cols = [col for col in text_cols if col not in low_cardinality_cols] + high_cardinality_cols

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps: median imputation, then scaling to [0, 1]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

# define categorical transformer steps: mode imputation, then one-hot encoding
# (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)


def _blank_missing_text(column):
    """Return `column` as a Series of strings with missing values replaced by ''.

    TfidfVectorizer raises on NaN documents, so missing text has to be filled first.
    """
    return pd.Series(column).fillna("").astype(str)


# define text transformer steps: fill missing documents, then TF-IDF
text_transformer = Pipeline(
    steps=[
        ("fill_missing", FunctionTransformer(_blank_missing_text)),
        ("text", TfidfVectorizer()),
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data.
# Each text column gets its own entry with a scalar column name, so the vectorizer
# receives a 1-D column of documents.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols],
    ]
)

In [8]:
# train test split
# Separate the feature matrix from the target column, then hold out 20% of the rows
# with a fixed seed.
X, y = data[features], data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Collects one metrics DataFrame per trained model for the comparison at the end.
model_comparison_list = list()

In [None]:

##### Model Pipeline for Decision Tree Classifier #####
# Hyperparameter grid; keys are prefixed with the pipeline step name "dt_classifier".
# 4 * 4 * 2 * 2 * 2 * 2 = 256 candidates, each evaluated with 5-fold CV (1280 fits).
dt_classifier_param_grid = {
    "dt_classifier__max_depth": np.arange(2, 10, 2),                       # 2, 4, 6, 8
    "dt_classifier__min_samples_split": np.arange(2, 10, 2),               # 2, 4, 6, 8
    "dt_classifier__min_samples_leaf": np.arange(1, 10, 5),                # 1, 6
    "dt_classifier__min_weight_fraction_leaf": np.arange(0.0, 0.5, 0.25),  # 0.0, 0.25
    # max_leaf_nodes must be None or >= 2. Starting the range at 1 made every
    # candidate with max_leaf_nodes=1 fail to fit (half of the whole grid).
    "dt_classifier__max_leaf_nodes": np.arange(2, 10, 5),                  # 2, 7
    "dt_classifier__min_impurity_decrease": np.arange(0.0, 0.5, 0.25),     # 0.0, 0.25
}


# Create the pipeline: shared preprocessing followed by the tree.
# The tree is seeded so repeated runs select the same model.
dt_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dt_classifier', DecisionTreeClassifier(random_state=42))
])

# Create the grid search, scored on accuracy.
# verbose=1 reports overall progress instead of one line per fit.
dt_classifier_grid_search = GridSearchCV(
    estimator=dt_classifier_pipe,
    param_grid=dt_classifier_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=1,
)
dt_classifier_grid_search.fit(X_train, y_train)


In [None]:
# Keep the pipeline that the grid search selected as best.
dt_classifier_best_estimator = dt_classifier_grid_search.best_estimator_

# Tabulate the cross-validation results (one row per hyperparameter combination).
dt_classifier_search_results = pd.DataFrame(data=dt_classifier_grid_search.cv_results_)

print(dt_classifier_search_results)

In [None]:
# Model metrics

# Hard class predictions on the held-out split, kept as a one-column DataFrame.
dt_classifier_predictions = pd.DataFrame(dt_classifier_best_estimator.predict(X_test))

# Class probabilities, one column per class label (first two classes of the search).
dt_classifier_predictions_prob = dt_classifier_best_estimator.predict_proba(X_test)
first_class = dt_classifier_grid_search.classes_[0]
second_class = dt_classifier_grid_search.classes_[1]
dt_classifier_predictions_prob_df = pd.DataFrame({
    first_class: dt_classifier_predictions_prob[:, 0],
    second_class: dt_classifier_predictions_prob[:, 1],
})

# Label-based metrics use the predicted classes; ROC metrics use the probability
# of the second class.
predicted_labels = dt_classifier_predictions.iloc[:, 0]
second_class_scores = dt_classifier_predictions_prob_df[second_class]

dt_classifier_accuracy = accuracy_score(y_test, predicted_labels)
dt_classifier_f1_score = f1_score(y_test, predicted_labels)
dt_classifier_precision = precision_score(y_test, predicted_labels)
dt_classifier_recall = recall_score(y_test, predicted_labels)
dt_classifier_roc_auc_score = roc_auc_score(y_test, second_class_scores)

# Long-format table: one row per (model, metric).
metric_rows = [
    ['dt_classifier', 'accuracy', dt_classifier_accuracy],
    ['dt_classifier', 'f1_score', dt_classifier_f1_score],
    ['dt_classifier', 'precision', dt_classifier_precision],
    ['dt_classifier', 'recall', dt_classifier_recall],
    ['dt_classifier', 'roc_auc_score', dt_classifier_roc_auc_score],
]
dt_classifier_performance_metrics = pd.DataFrame(metric_rows, columns=['model', 'metric', 'value'])

# Points of the ROC curve and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, second_class_scores)
roc_auc = auc(fpr, tpr)

# ROC Curve plot, with the diagonal as the random-guess reference.
dt_classifier_roc_auc_plot, dt_classifier_roc_auc_plot_ax = plt.subplots()
dt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
dt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
dt_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='dt_classifier ROC Curve',
)
dt_classifier_roc_auc_plot_ax.legend()


# Report only the ROC AUC row of the metrics table.
is_roc_auc_row = dt_classifier_performance_metrics['metric'] == 'roc_auc_score'
print(dt_classifier_performance_metrics[is_roc_auc_row])


In [None]:
# Lift Chart
# Pair each true label with the probability in the second probability column,
# ordered from the highest probability to the lowest.
aux_df = pd.DataFrame({
    'y_real': y_test,
    'y_proba': dt_classifier_predictions_prob_df.iloc[:, 1].values,
})
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Share of positives (label == 1) over all rows: the baseline for the lift.
total_positive_ratio = (aux_df['y_real'] == 1).sum() / len(aux_df)

# For every row, use its probability as a cutoff: lift is the share of positives among
# rows scored at or above the cutoff, divided by the baseline share.
lift_values = []
for cutoff in aux_df['y_proba']:
    at_or_above = aux_df.loc[aux_df['y_proba'] >= cutoff, 'y_real']
    positive_share = (at_or_above == 1).sum() / len(at_or_above)
    lift_values.append(positive_share / total_positive_ratio)

# Plot the lift curve against evenly spaced positions in [0, 1].
dt_classifier_lift_plot, dt_classifier_lift_plot_ax = plt.subplots()
dt_classifier_lift_plot_ax.set(
    xlabel='Proportion',
    ylabel='Lift',
    title='dt_classifier Lift Curve',
)

x_vals = np.linspace(0, 1, num=len(lift_values))
dt_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# Dashed reference line at lift = 1.
dt_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the comparison below.
model_comparison_list.append(dt_classifier_performance_metrics)
##### End of Model Pipeline for Decision Tree Classifier #####

##### Model Comparison #####

# Stack the registered metric tables. Re-running this cell appends the same model again,
# so keep only the most recent row per (model, metric).
table = pd.concat(model_comparison_list)
table = table.drop_duplicates(subset=['model', 'metric'], keep='last')

# Rank the models on ROC AUC only, best first.
table = table[table['metric'] == 'roc_auc_score']
table = table.sort_values(by=['value'], ascending=False)
print(table)

best_model_name = table['model'].iloc[0]
print(f"The best model is {best_model_name} with {table['value'].iloc[0]} as {table['metric'].iloc[0]}")


# Predict with the best model on prediction_df (the full `data` frame, not only the
# held-out split). The estimator is looked up by its notebook variable name
# "<model>_best_estimator" instead of evaluating a string as code.
best_estimator = globals()[f"{best_model_name}_best_estimator"]
test_predictions = best_estimator.predict(prediction_df)
print('Predictions from best model are stored in test_predictions')
