Clone the repository

In [None]:
# Fetch the pypelines source from GitHub into the current working directory.
!git clone https://github.com/Zerve-AI/pypelines.git

Installing pypelines

In [None]:
import os

# Directory that contains the cloned `pypelines` checkout. Leave it empty when the
# repository was cloned into the current working directory.
folder = ''
# os.path.join drops an empty `folder`, so the default resolves to the relative
# path 'pypelines' instead of '/pypelines' at the filesystem root.
os.chdir(os.path.join(folder, 'pypelines'))

In [None]:
!pip install .

LIST OF MODELS

MODELS FOR CLASSIFICATION PROBLEM

In [None]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils


# Ask pypelines for its supported models, restricted to model_type='classification'.
utils.list_supported_models(model_type='classification')

CLASSIFICATION

SINGLE CLASSIFICATION MODEL

Data Load and Model Selection

In [10]:
# pandas is imported here because this is the first cell that uses `pd`; without it
# the cell raises NameError on a fresh kernel.
import pandas as pd

# Load the Titanic sample data; the path is relative to the current working directory.
titanic = pd.read_csv("pypelines/datasets/classification/titanic.csv")

# Configure a classification pipeline for a single model. The same frame is passed
# both as training data and as the data to generate predictions for.
clf_pypelines_all = pipe.SupervisedPipeline(
    data=titanic,
    target='Survived',
    predictions_data=titanic,
    model_type='classification',
    models=['Decision Tree Classifier'],
    nfolds=5,
)

Default Hyperparameters

In [None]:
# Display the hyperparameter settings currently held by the pipeline object.
clf_pypelines_all.get_hyperparameters()

Model training code generation

In [29]:
# Copy the generated training code to the clipboard.
# NOTE(review): presumably this is the code pasted into the training cell further down - confirm.
clf_pypelines_all.code_to_clipboard()

Printing Hyperparameters

In [None]:
# Print the grid-search settings the pipeline holds for the Decision Tree Classifier.
print(clf_pypelines_all.model_grid_search_settings(model_name='Decision Tree Classifier'))

Updating Hyperparameters

In [None]:
# Grid-search settings for the Decision Tree Classifier.
# 'numerical' entries describe a min/max/step range; 'categorical' entries list the
# allowed 'values' and the currently 'selected' ones. 'search' toggles whether the
# parameter takes part in the grid search.
hyperparameter = {
    'numerical': [
        {'search': True, 'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
        {'search': True, 'name': 'min_samples_split', 'min': 2, 'max': 10, 'step': 2},
        {'search': True, 'name': 'min_samples_leaf', 'min': 1, 'max': 10, 'step': 5},
        {'search': True, 'name': 'min_weight_fraction_leaf', 'min': 0.0, 'max': 0.5, 'step': 0.25},
        # scikit-learn requires max_leaf_nodes >= 2 (or None); a range starting at 1
        # makes every fit that uses that value fail.
        {'search': True, 'name': 'max_leaf_nodes', 'min': 2, 'max': 10, 'step': 5},
        {'search': True, 'name': 'min_impurity_decrease', 'min': 0.0, 'max': 0.5, 'step': 0.25}
    ],
    'categorical': [
        {'search': False, 'name': 'criterion', 'selected': ['gini'], 'values': ['gini', 'entropy']},
        {'search': False, 'name': 'splitter', 'selected': ['best'], 'values': ['best', 'random']},
        # NOTE(review): recent scikit-learn releases no longer accept 'auto' for
        # max_features - confirm pypelines does not forward it while 'search' is False.
        {'search': False, 'name': 'max_features', 'selected': ['auto'], 'values': ['auto', 'sqrt', 'log2']},
    ]
}

# Apply the settings to the pipeline and print whatever the setter returns.
print(clf_pypelines_all.set_model_grid_search_settings(hyperparam_dict=hyperparameter, model_name='Decision Tree Classifier'))

Training code for single classification model

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


from sklearn.preprocessing import FunctionTransformer

# target dataframe: titanic
target = "Survived"
features = list(titanic.columns.drop("Survived"))

prediction_df = titanic

# Cast boolean features to integers BEFORE taking the feature view: the view is a
# separate object, so casting afterwards would leave its columns as bool and they
# would be missing from the numerical/categorical/text groups below.
bool_cols = titanic[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    titanic[bool_cols] = titanic[bool_cols].astype(int)
feature_df = titanic[features]

# get numerical and categorical columns
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, titanic.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality and make it text column
# (loop over a snapshot: removing from the list being iterated skips the column
# that follows each removal, so some columns were never checked)
for col in list(categorical_cols):
    if titanic[col].sample(sample_size).nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)


# check text columns for low cardinality and make it categorical columns
for col in list(text_cols):
    if titanic[col].sample(sample_size).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)


def _fill_missing_text(column):
    """Return `column` with missing entries replaced by '' and every value cast to str.

    TfidfVectorizer rejects NaN documents, so text columns containing missing
    values must be filled before they are vectorized.
    """
    return column.fillna('').astype(str)


# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('fill_missing', FunctionTransformer(_fill_missing_text)),
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# (each text column is passed by scalar name so the vectorizer receives a 1-D column)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols],
    ]
)

# train test split
X = titanic[features]
y = titanic[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Decision Tree Classifier #####

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form.
# np.arange excludes its stop value, e.g. np.arange(2, 10, 2) -> 2, 4, 6, 8.
dt_classifier_param_grid = {
    "dt_classifier__max_depth": np.arange(2, 10, 2),
    "dt_classifier__min_samples_split": np.arange(2, 10, 2),
    "dt_classifier__min_samples_leaf": np.arange(1, 10, 5),
    "dt_classifier__min_weight_fraction_leaf": np.arange(0.0, 0.5, 0.25),
    # max_leaf_nodes must be >= 2 (or None); a grid starting at 1 makes every
    # candidate that uses that value fail to fit.
    "dt_classifier__max_leaf_nodes": np.arange(2, 10, 5),
    "dt_classifier__min_impurity_decrease": np.arange(0.0, 0.5, 0.25),
}


# Create the pipeline
dt_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dt_classifier', DecisionTreeClassifier())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
dt_classifier_grid_search = GridSearchCV(estimator=dt_classifier_pipe, param_grid=dt_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
dt_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
dt_classifier_best_estimator = dt_classifier_grid_search.best_estimator_

# Store results as a dataframe
dt_classifier_search_results = pd.DataFrame(dt_classifier_grid_search.cv_results_)

# Model metrics

# Hard class predictions for the hold-out split, kept as a one-column DataFrame
dt_classifier_predictions = pd.DataFrame(dt_classifier_best_estimator.predict(X_test))

# Class probabilities: one DataFrame column per class label, in classes_ order
dt_classifier_predictions_prob = dt_classifier_best_estimator.predict_proba(X_test)
dt_classifier_predictions_prob_df = pd.DataFrame()
for class_position in (0, 1):
    dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[class_position]] = dt_classifier_predictions_prob[:, class_position]


# Score the predictions; ROC AUC uses the probability assigned to classes_[1]
dt_classifier_predicted_labels = dt_classifier_predictions.iloc[:, 0]
dt_classifier_positive_proba = dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[1]]
dt_classifier_accuracy = accuracy_score(y_test, dt_classifier_predicted_labels)
dt_classifier_f1_score = f1_score(y_test, dt_classifier_predicted_labels)
dt_classifier_precision = precision_score(y_test, dt_classifier_predicted_labels)
dt_classifier_recall = recall_score(y_test, dt_classifier_predicted_labels)
dt_classifier_roc_auc_score = roc_auc_score(y_test, dt_classifier_positive_proba)

# Long-format metrics table: one (model, metric, value) row per metric
dt_classifier_performance_metrics = pd.DataFrame(
    [
        ['dt_classifier', 'accuracy', dt_classifier_accuracy],
        ['dt_classifier', 'f1_score', dt_classifier_f1_score],
        ['dt_classifier', 'precision', dt_classifier_precision],
        ['dt_classifier', 'recall', dt_classifier_recall],
        ['dt_classifier', 'roc_auc_score', dt_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, dt_classifier_positive_proba)
roc_auc = auc(fpr, tpr)

# ROC Curve plot, with the diagonal as the random-guess reference
dt_classifier_roc_auc_plot, dt_classifier_roc_auc_plot_ax = plt.subplots()
dt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
dt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
dt_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='dt_classifier ROC Curve',
)
dt_classifier_roc_auc_plot_ax.legend()


# Show only the ROC AUC row of the metrics table
dt_classifier_is_roc_auc_row = dt_classifier_performance_metrics['metric'] == 'roc_auc_score'
print(dt_classifier_performance_metrics[dt_classifier_is_roc_auc_row])

# Lift Chart
# Pair each hold-out label with the probability from the second probability column
aux_df = pd.DataFrame({
    'y_real': y_test,
    'y_proba': dt_classifier_predictions_prob_df.iloc[:, 1].to_numpy(),
})

# Highest predicted probability first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Share of positives in the whole hold-out split (the lift baseline)
is_positive = aux_df['y_real'] == 1
total_positive_ratio = is_positive.sum() / len(aux_df)

# For every row, lift = positive share among rows scored at least as high / baseline
lift_values = []
for cutoff in aux_df['y_proba']:
    at_or_above = aux_df['y_proba'] >= cutoff
    positive_ratio_at_cutoff = is_positive[at_or_above].sum() / at_or_above.sum()
    lift_values.append(positive_ratio_at_cutoff / total_positive_ratio)

# Lift curve figure
dt_classifier_lift_plot, dt_classifier_lift_plot_ax = plt.subplots()
dt_classifier_lift_plot_ax.set(
    xlabel='Proportion',
    ylabel='Lift',
    title='dt_classifier Lift Curve',
)

# x axis spreads the sorted rows evenly over [0, 1]
x_vals = np.linspace(0, 1, num=len(lift_values))
dt_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# dashed reference line at lift = 1
dt_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the comparison step
model_comparison_list.append(dt_classifier_performance_metrics)
##### End of Model Pipeline for Decision Tree Classifier #####
##### Model Comparison #####

# Stack every model's metrics, order by value (highest first), keep the ROC AUC rows
table = pd.concat(model_comparison_list).sort_values(by=['value'], ascending=False)
table = table[table['metric'] == 'roc_auc_score']
print(table)

# The first remaining row belongs to the model with the highest ROC AUC
best_model_name = table['model'].iloc[0]
best_metric_value = table['value'].iloc[0]
best_metric_name = table['metric'].iloc[0]
print(f"The best model is {best_model_name} with {best_metric_value} as {best_metric_name}")


# Predict test data using the best model
# eval looks up the "<model>_best_estimator" variable created in that model's section
best_estimator_variable = best_model_name + "_best_estimator"
test_predictions = eval(best_estimator_variable).predict(prediction_df)
print('Predictions from best model are stored in test_predictions')


MULTIPLE CLASSIFICATION MODEL

Data Load and Model Selection

In [30]:
# Rebuild the pipeline object, this time listing two models to train and compare
# on the same Titanic frame (used as both training and prediction data).
clf_pypelines_all = pipe.SupervisedPipeline(data = titanic,target = 'Survived',predictions_data=titanic
                            , model_type = 'classification'
                            , models = ['Logistic Regression', 'SVC Classifier']
                            , nfolds = 5)

Default Hyperparameters

In [None]:
# Display the hyperparameter settings held by the two-model pipeline object.
clf_pypelines_all.get_hyperparameters()

Model training code generation

In [32]:
# Copy the generated training code for the selected models to the clipboard.
# NOTE(review): presumably this is the code pasted into the training cell below - confirm.
clf_pypelines_all.code_to_clipboard()

Training code for multiple classification model

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


from sklearn.preprocessing import FunctionTransformer

# target dataframe: titanic
target = "Survived"
features = list(titanic.columns.drop("Survived"))

prediction_df = titanic

# Cast boolean features to integers BEFORE taking the feature view: the view is a
# separate object, so casting afterwards would leave its columns as bool and they
# would be missing from the numerical/categorical/text groups below.
bool_cols = titanic[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    titanic[bool_cols] = titanic[bool_cols].astype(int)
feature_df = titanic[features]

# get numerical and categorical columns
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, titanic.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality and make it text column
# (loop over a snapshot: removing from the list being iterated skips the column
# that follows each removal, so some columns were never checked)
for col in list(categorical_cols):
    if titanic[col].sample(sample_size).nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)


# check text columns for low cardinality and make it categorical columns
for col in list(text_cols):
    if titanic[col].sample(sample_size).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)


def _fill_missing_text(column):
    """Return `column` with missing entries replaced by '' and every value cast to str.

    TfidfVectorizer rejects NaN documents, so text columns containing missing
    values must be filled before they are vectorized.
    """
    return column.fillna('').astype(str)


# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('fill_missing', FunctionTransformer(_fill_missing_text)),
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# (each text column is passed by scalar name so the vectorizer receives a 1-D column)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols],
    ]
)

# train test split
X = titanic[features]
y = titanic[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Logistic Regression #####

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Candidate values for the C parameter of the 'log_reg' pipeline step
log_reg_param_grid = {"log_reg__C": np.arange(0.1, 1.0, 0.1)}


# Preprocessing followed by the classifier
log_reg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('log_reg', LogisticRegression()),
])

# 5-fold grid search with candidates ranked by accuracy
log_reg_grid_search = GridSearchCV(
    estimator=log_reg_pipe,
    param_grid=log_reg_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=3,
)
log_reg_grid_search.fit(X_train, y_train)

# Pipeline fitted with the best hyperparameters
log_reg_best_estimator = log_reg_grid_search.best_estimator_

# Per-candidate cross-validation results as a dataframe
log_reg_search_results = pd.DataFrame(log_reg_grid_search.cv_results_)

# Model metrics

# Hard class predictions for the hold-out split, kept as a one-column DataFrame
log_reg_predictions = pd.DataFrame(log_reg_best_estimator.predict(X_test))

# Class probabilities: one DataFrame column per class label, in classes_ order
log_reg_predictions_prob = log_reg_best_estimator.predict_proba(X_test)
log_reg_predictions_prob_df = pd.DataFrame()
for class_position in (0, 1):
    log_reg_predictions_prob_df[log_reg_grid_search.classes_[class_position]] = log_reg_predictions_prob[:, class_position]


# Score the predictions; ROC AUC uses the probability assigned to classes_[1]
log_reg_predicted_labels = log_reg_predictions.iloc[:, 0]
log_reg_positive_proba = log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]]
log_reg_accuracy = accuracy_score(y_test, log_reg_predicted_labels)
log_reg_f1_score = f1_score(y_test, log_reg_predicted_labels)
log_reg_precision = precision_score(y_test, log_reg_predicted_labels)
log_reg_recall = recall_score(y_test, log_reg_predicted_labels)
log_reg_roc_auc_score = roc_auc_score(y_test, log_reg_positive_proba)

# Long-format metrics table: one (model, metric, value) row per metric
log_reg_performance_metrics = pd.DataFrame(
    [
        ['log_reg', 'accuracy', log_reg_accuracy],
        ['log_reg', 'f1_score', log_reg_f1_score],
        ['log_reg', 'precision', log_reg_precision],
        ['log_reg', 'recall', log_reg_recall],
        ['log_reg', 'roc_auc_score', log_reg_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, log_reg_positive_proba)
roc_auc = auc(fpr, tpr)

# ROC Curve plot, with the diagonal as the random-guess reference
log_reg_roc_auc_plot, log_reg_roc_auc_plot_ax = plt.subplots()
log_reg_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
log_reg_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
log_reg_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='log_reg ROC Curve',
)
log_reg_roc_auc_plot_ax.legend()


# Show only the ROC AUC row of the metrics table
log_reg_is_roc_auc_row = log_reg_performance_metrics['metric'] == 'roc_auc_score'
print(log_reg_performance_metrics[log_reg_is_roc_auc_row])

# Lift Chart
# Pair each hold-out label with the probability from the second probability column
aux_df = pd.DataFrame({
    'y_real': y_test,
    'y_proba': log_reg_predictions_prob_df.iloc[:, 1].to_numpy(),
})

# Highest predicted probability first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Share of positives in the whole hold-out split (the lift baseline)
is_positive = aux_df['y_real'] == 1
total_positive_ratio = is_positive.sum() / len(aux_df)

# For every row, lift = positive share among rows scored at least as high / baseline
lift_values = []
for cutoff in aux_df['y_proba']:
    at_or_above = aux_df['y_proba'] >= cutoff
    positive_ratio_at_cutoff = is_positive[at_or_above].sum() / at_or_above.sum()
    lift_values.append(positive_ratio_at_cutoff / total_positive_ratio)

# Lift curve figure
log_reg_lift_plot, log_reg_lift_plot_ax = plt.subplots()
log_reg_lift_plot_ax.set(
    xlabel='Proportion',
    ylabel='Lift',
    title='log_reg Lift Curve',
)

# x axis spreads the sorted rows evenly over [0, 1]
x_vals = np.linspace(0, 1, num=len(lift_values))
log_reg_lift_plot_ax.plot(x_vals, lift_values, color='b')

# dashed reference line at lift = 1
log_reg_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the comparison step
model_comparison_list.append(log_reg_performance_metrics)
##### End of Model Pipeline for Logistic Regression #####
##### Model Pipeline for SVC Classifier #####

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Hyperparameter grid for the 'svc_classifier' pipeline step.
# `degree` is deliberately not searched: SVC only uses it with kernel='poly', so with
# the kernel fixed to 'linear' each degree value would refit an identical model.
# Add "svc_classifier__degree" back if 'poly' is added to the kernel list.
svc_classifier_param_grid = {
    "svc_classifier__C": np.arange(0.1, 1.0, 0.2),
    "svc_classifier__kernel": ['linear'],
    # probability=True is needed so the fitted model exposes predict_proba
    "svc_classifier__probability": [True],
}


# Create the pipeline
svc_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('svc_classifier', SVC())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
svc_classifier_grid_search = GridSearchCV(estimator=svc_classifier_pipe, param_grid=svc_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
svc_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
svc_classifier_best_estimator = svc_classifier_grid_search.best_estimator_

# Store results as a dataframe
svc_classifier_search_results = pd.DataFrame(svc_classifier_grid_search.cv_results_)

# Model metrics

# Hard class predictions for the hold-out split, kept as a one-column DataFrame
svc_classifier_predictions = pd.DataFrame(svc_classifier_best_estimator.predict(X_test))

# Class probabilities: one DataFrame column per class label, in classes_ order
svc_classifier_predictions_prob = svc_classifier_best_estimator.predict_proba(X_test)
svc_classifier_predictions_prob_df = pd.DataFrame()
for class_position in (0, 1):
    svc_classifier_predictions_prob_df[svc_classifier_grid_search.classes_[class_position]] = svc_classifier_predictions_prob[:, class_position]


# Score the predictions; ROC AUC uses the probability assigned to classes_[1]
svc_classifier_predicted_labels = svc_classifier_predictions.iloc[:, 0]
svc_classifier_positive_proba = svc_classifier_predictions_prob_df[svc_classifier_grid_search.classes_[1]]
svc_classifier_accuracy = accuracy_score(y_test, svc_classifier_predicted_labels)
svc_classifier_f1_score = f1_score(y_test, svc_classifier_predicted_labels)
svc_classifier_precision = precision_score(y_test, svc_classifier_predicted_labels)
svc_classifier_recall = recall_score(y_test, svc_classifier_predicted_labels)
svc_classifier_roc_auc_score = roc_auc_score(y_test, svc_classifier_positive_proba)

# Long-format metrics table: one (model, metric, value) row per metric
svc_classifier_performance_metrics = pd.DataFrame(
    [
        ['svc_classifier', 'accuracy', svc_classifier_accuracy],
        ['svc_classifier', 'f1_score', svc_classifier_f1_score],
        ['svc_classifier', 'precision', svc_classifier_precision],
        ['svc_classifier', 'recall', svc_classifier_recall],
        ['svc_classifier', 'roc_auc_score', svc_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, svc_classifier_positive_proba)
roc_auc = auc(fpr, tpr)

# ROC Curve plot, with the diagonal as the random-guess reference
svc_classifier_roc_auc_plot, svc_classifier_roc_auc_plot_ax = plt.subplots()
svc_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
svc_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
svc_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='svc_classifier ROC Curve',
)
svc_classifier_roc_auc_plot_ax.legend()


# Show only the ROC AUC row of the metrics table
svc_classifier_is_roc_auc_row = svc_classifier_performance_metrics['metric'] == 'roc_auc_score'
print(svc_classifier_performance_metrics[svc_classifier_is_roc_auc_row])

# Lift Chart
# Pair each hold-out label with the probability from the second probability column
aux_df = pd.DataFrame({
    'y_real': y_test,
    'y_proba': svc_classifier_predictions_prob_df.iloc[:, 1].to_numpy(),
})

# Highest predicted probability first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Share of positives in the whole hold-out split (the lift baseline)
is_positive = aux_df['y_real'] == 1
total_positive_ratio = is_positive.sum() / len(aux_df)

# For every row, lift = positive share among rows scored at least as high / baseline
lift_values = []
for cutoff in aux_df['y_proba']:
    at_or_above = aux_df['y_proba'] >= cutoff
    positive_ratio_at_cutoff = is_positive[at_or_above].sum() / at_or_above.sum()
    lift_values.append(positive_ratio_at_cutoff / total_positive_ratio)

# Lift curve figure
svc_classifier_lift_plot, svc_classifier_lift_plot_ax = plt.subplots()
svc_classifier_lift_plot_ax.set(
    xlabel='Proportion',
    ylabel='Lift',
    title='svc_classifier Lift Curve',
)

# x axis spreads the sorted rows evenly over [0, 1]
x_vals = np.linspace(0, 1, num=len(lift_values))
svc_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# dashed reference line at lift = 1
svc_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the comparison step
model_comparison_list.append(svc_classifier_performance_metrics)
##### End of Model Pipeline for SVC Classifier #####
##### Model Comparison #####

# Stack every model's metrics, order by value (highest first), keep the ROC AUC rows
table = pd.concat(model_comparison_list).sort_values(by=['value'], ascending=False)
table = table[table['metric'] == 'roc_auc_score']
print(table)

# The first remaining row belongs to the model with the highest ROC AUC
best_model_name = table['model'].iloc[0]
best_metric_value = table['value'].iloc[0]
best_metric_name = table['metric'].iloc[0]
print(f"The best model is {best_model_name} with {best_metric_value} as {best_metric_name}")


# Predict test data using the best model
# eval looks up the "<model>_best_estimator" variable created in that model's section
best_estimator_variable = best_model_name + "_best_estimator"
test_predictions = eval(best_estimator_variable).predict(prediction_df)
print('Predictions from best model are stored in test_predictions')


CLASSIFICATION MODEL - DEFAULT RUN

In [None]:
# Rebuild the pipeline object without a `models` argument.
# NOTE(review): presumably pypelines then falls back to its default model list - confirm.
clf_pypelines_all = pipe.SupervisedPipeline(data = titanic,target = 'Survived',predictions_data=titanic
                            , model_type = 'classification'
                            , nfolds = 5)

Default Hyperparameters

In [None]:
# Query the pipeline's hyperparameter settings, then its model list.
# Only the last expression (the model list) is displayed as the cell output.
clf_pypelines_all.get_hyperparameters()
clf_pypelines_all.model_list()

Model training code generation

In [25]:
# Copy the generated training code for the default run to the clipboard.
# NOTE(review): presumably this is the code pasted into the training cell below - confirm.
clf_pypelines_all.code_to_clipboard()

Model training code generation for classification default run

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


from sklearn.preprocessing import FunctionTransformer

# target dataframe: titanic
target = "Survived"
features = list(titanic.columns.drop("Survived"))

prediction_df = titanic

# Cast boolean features to integers BEFORE taking the feature view: the view is a
# separate object, so casting afterwards would leave its columns as bool and they
# would be missing from the numerical/categorical/text groups below.
bool_cols = titanic[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    titanic[bool_cols] = titanic[bool_cols].astype(int)
feature_df = titanic[features]

# get numerical and categorical columns
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, titanic.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality and make it text column
# (loop over a snapshot: removing from the list being iterated skips the column
# that follows each removal, so some columns were never checked)
for col in list(categorical_cols):
    if titanic[col].sample(sample_size).nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)


# check text columns for low cardinality and make it categorical columns
for col in list(text_cols):
    if titanic[col].sample(sample_size).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)


def _fill_missing_text(column):
    """Return `column` with missing entries replaced by '' and every value cast to str.

    TfidfVectorizer rejects NaN documents, so text columns containing missing
    values must be filled before they are vectorized.
    """
    return column.fillna('').astype(str)


# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('fill_missing', FunctionTransformer(_fill_missing_text)),
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# (each text column is passed by scalar name so the vectorizer receives a 1-D column)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols],
    ]
)

# train test split
X = titanic[features]
y = titanic[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Decision Tree Classifier #####

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form.
# np.arange excludes its stop value, e.g. np.arange(2, 10, 2) -> 2, 4, 6, 8.
dt_classifier_param_grid = {
    "dt_classifier__max_depth": np.arange(2, 10, 2),
    "dt_classifier__min_samples_split": np.arange(2, 10, 2),
    "dt_classifier__min_samples_leaf": np.arange(1, 10, 5),
    "dt_classifier__min_weight_fraction_leaf": np.arange(0.0, 0.5, 0.25),
    # max_leaf_nodes must be >= 2 (or None); a grid starting at 1 makes every
    # candidate that uses that value fail to fit.
    "dt_classifier__max_leaf_nodes": np.arange(2, 10, 5),
    "dt_classifier__min_impurity_decrease": np.arange(0.0, 0.5, 0.25),
}


# Create the pipeline
dt_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dt_classifier', DecisionTreeClassifier())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
dt_classifier_grid_search = GridSearchCV(estimator=dt_classifier_pipe, param_grid=dt_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
dt_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
dt_classifier_best_estimator = dt_classifier_grid_search.best_estimator_

# Store results as a dataframe
dt_classifier_search_results = pd.DataFrame(dt_classifier_grid_search.cv_results_)

# Model metrics

# Hard class predictions for the hold-out split, kept as a one-column DataFrame
dt_classifier_predictions = pd.DataFrame(dt_classifier_best_estimator.predict(X_test))

# Class probabilities: one DataFrame column per class label, in classes_ order
dt_classifier_predictions_prob = dt_classifier_best_estimator.predict_proba(X_test)
dt_classifier_predictions_prob_df = pd.DataFrame()
for class_position in (0, 1):
    dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[class_position]] = dt_classifier_predictions_prob[:, class_position]


# Score the predictions; ROC AUC uses the probability assigned to classes_[1]
dt_classifier_predicted_labels = dt_classifier_predictions.iloc[:, 0]
dt_classifier_positive_proba = dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[1]]
dt_classifier_accuracy = accuracy_score(y_test, dt_classifier_predicted_labels)
dt_classifier_f1_score = f1_score(y_test, dt_classifier_predicted_labels)
dt_classifier_precision = precision_score(y_test, dt_classifier_predicted_labels)
dt_classifier_recall = recall_score(y_test, dt_classifier_predicted_labels)
dt_classifier_roc_auc_score = roc_auc_score(y_test, dt_classifier_positive_proba)

# Long-format metrics table: one (model, metric, value) row per metric
dt_classifier_performance_metrics = pd.DataFrame(
    [
        ['dt_classifier', 'accuracy', dt_classifier_accuracy],
        ['dt_classifier', 'f1_score', dt_classifier_f1_score],
        ['dt_classifier', 'precision', dt_classifier_precision],
        ['dt_classifier', 'recall', dt_classifier_recall],
        ['dt_classifier', 'roc_auc_score', dt_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# Points of the ROC curve and the area under it
fpr, tpr, thresholds = roc_curve(y_test, dt_classifier_positive_proba)
roc_auc = auc(fpr, tpr)

# ROC Curve plot, with the diagonal as the random-guess reference
dt_classifier_roc_auc_plot, dt_classifier_roc_auc_plot_ax = plt.subplots()
dt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
dt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
dt_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='dt_classifier ROC Curve',
)
dt_classifier_roc_auc_plot_ax.legend()


# Show only the ROC AUC row of the metrics table
dt_classifier_is_roc_auc_row = dt_classifier_performance_metrics['metric'] == 'roc_auc_score'
print(dt_classifier_performance_metrics[dt_classifier_is_roc_auc_row])

# Lift Chart
# Pair each hold-out label with the probability from the second probability column
aux_df = pd.DataFrame({
    'y_real': y_test,
    'y_proba': dt_classifier_predictions_prob_df.iloc[:, 1].to_numpy(),
})

# Highest predicted probability first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Share of positives in the whole hold-out split (the lift baseline)
is_positive = aux_df['y_real'] == 1
total_positive_ratio = is_positive.sum() / len(aux_df)

# For every row, lift = positive share among rows scored at least as high / baseline
lift_values = []
for cutoff in aux_df['y_proba']:
    at_or_above = aux_df['y_proba'] >= cutoff
    positive_ratio_at_cutoff = is_positive[at_or_above].sum() / at_or_above.sum()
    lift_values.append(positive_ratio_at_cutoff / total_positive_ratio)

# Lift curve figure
dt_classifier_lift_plot, dt_classifier_lift_plot_ax = plt.subplots()
dt_classifier_lift_plot_ax.set(
    xlabel='Proportion',
    ylabel='Lift',
    title='dt_classifier Lift Curve',
)

# x axis spreads the sorted rows evenly over [0, 1]
x_vals = np.linspace(0, 1, num=len(lift_values))
dt_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# dashed reference line at lift = 1
dt_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the comparison step
model_comparison_list.append(dt_classifier_performance_metrics)
##### End of Model Pipeline for Decision Tree Classifier #####
##### Model Pipeline for Logistic Regression #####

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Candidate values for the C parameter of the 'log_reg' pipeline step
log_reg_param_grid = {"log_reg__C": np.arange(0.1, 1.0, 0.1)}


# Preprocessing followed by the classifier
log_reg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('log_reg', LogisticRegression()),
])

# 5-fold grid search with candidates ranked by accuracy
log_reg_grid_search = GridSearchCV(
    estimator=log_reg_pipe,
    param_grid=log_reg_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=3,
)
log_reg_grid_search.fit(X_train, y_train)

# Pipeline fitted with the best hyperparameters
log_reg_best_estimator = log_reg_grid_search.best_estimator_

# Per-candidate cross-validation results as a dataframe
log_reg_search_results = pd.DataFrame(log_reg_grid_search.cv_results_)

# Model metrics

# Generate Predictions
log_reg_predictions = pd.DataFrame(log_reg_best_estimator.predict(X_test))

log_reg_predictions_prob = log_reg_best_estimator.predict_proba(X_test)
log_reg_predictions_prob_df = pd.DataFrame()
log_reg_predictions_prob_df[log_reg_grid_search.classes_[0]] = log_reg_predictions_prob[:,0]
log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]] = log_reg_predictions_prob[:,1] 


# Generate Model Metrics
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions.iloc[:,0])
log_reg_f1_score = f1_score(y_test, log_reg_predictions.iloc[:,0])
log_reg_precision = precision_score(y_test, log_reg_predictions.iloc[:,0])
log_reg_recall = recall_score(y_test, log_reg_predictions.iloc[:,0])
log_reg_roc_auc_score = roc_auc_score(y_test, log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]])
log_reg_performance_metrics = [['log_reg','accuracy',log_reg_accuracy], 
                                  ['log_reg','f1_score',log_reg_f1_score],
                                  ['log_reg','precision', log_reg_precision],
                                  ['log_reg','recall', log_reg_recall],
                                  ['log_reg','roc_auc_score', log_reg_roc_auc_score]]
log_reg_performance_metrics = pd.DataFrame(log_reg_performance_metrics, columns=['model','metric', 'value'])
fpr, tpr, thresholds = roc_curve(y_test, log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]])
roc_auc = auc(fpr, tpr)

# ROC Curve plot
log_reg_roc_auc_plot, log_reg_roc_auc_plot_ax = plt.subplots()
log_reg_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
log_reg_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
# Set axis labels and title
log_reg_roc_auc_plot_ax.set_xlabel('False Positive Rate')
log_reg_roc_auc_plot_ax.set_ylabel('True Positive Rate')
log_reg_roc_auc_plot_ax.set_title(f'log_reg ROC Curve')
# Add legend
log_reg_roc_auc_plot_ax.legend()


print(log_reg_performance_metrics[log_reg_performance_metrics['metric'] == 'roc_auc_score'])

# Lift Chart
aux_df = pd.DataFrame()
aux_df['y_real'] = y_test
aux_df['y_proba'] = log_reg_predictions_prob_df.iloc[:,1].values

# Sort by predicted probability
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Find the total positive ratio of the whole dataset
total_positive_ratio = sum(aux_df['y_real'] == 1) / aux_df.shape[0]

# For each line of data, get the ratio of positives of the given subset and calculate the lift
lift_values = []
for i in aux_df.index:
    threshold = aux_df.loc[i]['y_proba']
    subset = aux_df[aux_df['y_proba'] >= threshold]
    subset_positive_ratio = sum(subset['y_real'] == 1) / subset.shape[0]
    lift = subset_positive_ratio / total_positive_ratio
    lift_values.append(lift)

# Plot the lift curve
log_reg_lift_plot, log_reg_lift_plot_ax = plt.subplots()
log_reg_lift_plot_ax.set_xlabel('Proportion')
log_reg_lift_plot_ax.set_ylabel('Lift')
log_reg_lift_plot_ax.set_title(f'log_reg Lift Curve')

# plot the lift curve
x_vals = np.linspace(0, 1, num=len(lift_values))
log_reg_lift_plot_ax.plot(x_vals, lift_values, color='b')

# add dashed horizontal line at lift of 1
log_reg_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


model_comparison_list.append(log_reg_performance_metrics)##### End of Model Pipeline for Logistic Regression #####
##### Model Pipeline for Random Forest Classifier #####
# Tunes a RandomForestClassifier inside the shared preprocessing pipeline,
# scores the best candidate on the held-out test split, draws ROC and lift
# curves and registers the metrics for the final model comparison.
# Relies on names created outside this section: np, pd, Pipeline, GridSearchCV,
# preprocessor, X_train, y_train, X_test, y_test and model_comparison_list.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Grid keys use "<pipeline step>__<parameter>" so each value is routed to the
# 'random_forest_classifier' step of the pipeline below.
# NOTE(review): the min_samples_split values are floats; scikit-learn
# presumably reads those as fractions of the training rows rather than counts,
# which makes 0.5-0.9 very restrictive -- confirm this is intended.
random_forest_classifier_param_grid = {
    "random_forest_classifier__n_estimators": np.arange(10, 100, 20),
    "random_forest_classifier__max_depth": np.arange(2, 10, 2),
    "random_forest_classifier__min_samples_split": np.arange(0.5, 1.0, 0.1),
    "random_forest_classifier__min_samples_leaf": np.arange(1, 10, 2),
}


# Create the pipeline: preprocessing followed by the estimator
# NOTE(review): no random_state is passed, so results can differ between runs.
random_forest_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest_classifier', RandomForestClassifier())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
random_forest_classifier_grid_search = GridSearchCV(estimator=random_forest_classifier_pipe, param_grid=random_forest_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
random_forest_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
random_forest_classifier_best_estimator = random_forest_classifier_grid_search.best_estimator_

# Store the per-candidate cross-validation results as a dataframe
random_forest_classifier_search_results = pd.DataFrame(random_forest_classifier_grid_search.cv_results_)

# Model metrics

# Generate Predictions (single-column frame of predicted labels)
random_forest_classifier_predictions = pd.DataFrame(random_forest_classifier_best_estimator.predict(X_test))

# Class probabilities, one column per class label.
# Only the first two classes are kept, i.e. a binary target is assumed.
random_forest_classifier_predictions_prob = random_forest_classifier_best_estimator.predict_proba(X_test)
random_forest_classifier_predictions_prob_df = pd.DataFrame()
random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[0]] = random_forest_classifier_predictions_prob[:,0]
random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[1]] = random_forest_classifier_predictions_prob[:,1]


# Generate Model Metrics
# NOTE(review): f1/precision/recall are called with scikit-learn's defaults,
# which presumably means binary averaging with positive label 1 -- confirm.
random_forest_classifier_accuracy = accuracy_score(y_test, random_forest_classifier_predictions.iloc[:,0])
random_forest_classifier_f1_score = f1_score(y_test, random_forest_classifier_predictions.iloc[:,0])
random_forest_classifier_precision = precision_score(y_test, random_forest_classifier_predictions.iloc[:,0])
random_forest_classifier_recall = recall_score(y_test, random_forest_classifier_predictions.iloc[:,0])
random_forest_classifier_roc_auc_score = roc_auc_score(y_test, random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[1]])

# Long-format metrics table: one row per (model, metric, value)
random_forest_classifier_performance_metrics = [['random_forest_classifier','accuracy',random_forest_classifier_accuracy],
                                                ['random_forest_classifier','f1_score',random_forest_classifier_f1_score],
                                                ['random_forest_classifier','precision', random_forest_classifier_precision],
                                                ['random_forest_classifier','recall', random_forest_classifier_recall],
                                                ['random_forest_classifier','roc_auc_score', random_forest_classifier_roc_auc_score]]
random_forest_classifier_performance_metrics = pd.DataFrame(random_forest_classifier_performance_metrics, columns=['model','metric', 'value'])

# ROC curve points and area, computed from the second class's probabilities
fpr, tpr, thresholds = roc_curve(y_test, random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[1]])
roc_auc = auc(fpr, tpr)

# ROC Curve plot
random_forest_classifier_roc_auc_plot, random_forest_classifier_roc_auc_plot_ax = plt.subplots()
random_forest_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
random_forest_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
# Set axis labels and title
random_forest_classifier_roc_auc_plot_ax.set_xlabel('False Positive Rate')
random_forest_classifier_roc_auc_plot_ax.set_ylabel('True Positive Rate')
random_forest_classifier_roc_auc_plot_ax.set_title('random_forest_classifier ROC Curve')
# Add legend
random_forest_classifier_roc_auc_plot_ax.legend()


print(random_forest_classifier_performance_metrics[random_forest_classifier_performance_metrics['metric'] == 'roc_auc_score'])

# Lift Chart
# Pair every test label with the predicted probability from the second
# probability column.
aux_df = pd.DataFrame()
aux_df['y_real'] = y_test
aux_df['y_proba'] = random_forest_classifier_predictions_prob_df.iloc[:,1].values

# Sort by predicted probability, highest first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Work on plain arrays from here on: positional logic does not depend on the
# index labels being unique and avoids rescanning the frame once per row.
is_positive = (aux_df['y_real'] == 1).to_numpy()
sorted_proba = aux_df['y_proba'].to_numpy()

# Find the total positive ratio of the whole dataset
total_positive_ratio = is_positive.sum() / aux_df.shape[0]

# For each row, the subset "probability >= this row's probability" is the
# leading run of the sorted data up to and including the last tied row.
# Searching the negated (therefore ascending) scores with side='right'
# returns exactly the length of that run.
subset_sizes = np.searchsorted(-sorted_proba, -sorted_proba, side='right')
subset_positives = np.cumsum(is_positive)[subset_sizes - 1]

# Lift = positive ratio inside the subset relative to the overall positive ratio
lift_values = ((subset_positives / subset_sizes) / total_positive_ratio).tolist()

# Plot the lift curve
random_forest_classifier_lift_plot, random_forest_classifier_lift_plot_ax = plt.subplots()
random_forest_classifier_lift_plot_ax.set_xlabel('Proportion')
random_forest_classifier_lift_plot_ax.set_ylabel('Lift')
random_forest_classifier_lift_plot_ax.set_title('random_forest_classifier Lift Curve')

# x axis: evenly spaced positions in [0, 1], one per row in descending score order
x_vals = np.linspace(0, 1, num=len(lift_values))
random_forest_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# add dashed horizontal line at lift of 1 (no better than the overall positive rate)
random_forest_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the final model comparison
model_comparison_list.append(random_forest_classifier_performance_metrics)
##### End of Model Pipeline for Random Forest Classifier #####
##### Model Pipeline for XGBoost Classifier #####
# Tunes an XGBClassifier inside the shared preprocessing pipeline, scores the
# best candidate on the held-out test split, draws ROC and lift curves and
# registers the metrics for the final model comparison.
# Relies on names created outside this section: np, pd, Pipeline, GridSearchCV,
# preprocessor, X_train, y_train, X_test, y_test and model_comparison_list.

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Grid keys use "<pipeline step>__<parameter>" so each value is routed to the
# 'xgboost_classifier' step of the pipeline below.
# NOTE(review): this grid expands to 4*4*4*2*4*2 = 1,024 candidates, each
# fitted on 5 folds, so expect a long run.
xgboost_classifier_param_grid = {
    "xgboost_classifier__learning_rate": np.arange(0.1, 1.0, 0.25),
    "xgboost_classifier__n_estimators": np.arange(100, 500, 100),
    "xgboost_classifier__max_depth": np.arange(2, 10, 2),
    "xgboost_classifier__gamma": np.arange(0.0, 0.5, 0.25),
    "xgboost_classifier__subsample": np.arange(0.1, 1.0, 0.25),
    "xgboost_classifier__colsample_bytree": np.arange(0.5, 1.0, 0.25),
}


# Create the pipeline: preprocessing followed by the estimator
xgboost_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgboost_classifier', XGBClassifier())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
xgboost_classifier_grid_search = GridSearchCV(estimator=xgboost_classifier_pipe, param_grid=xgboost_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
xgboost_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
xgboost_classifier_best_estimator = xgboost_classifier_grid_search.best_estimator_

# Store the per-candidate cross-validation results as a dataframe
xgboost_classifier_search_results = pd.DataFrame(xgboost_classifier_grid_search.cv_results_)

# Model metrics

# Generate Predictions (single-column frame of predicted labels)
xgboost_classifier_predictions = pd.DataFrame(xgboost_classifier_best_estimator.predict(X_test))

# Class probabilities, one column per class label.
# Only the first two classes are kept, i.e. a binary target is assumed.
xgboost_classifier_predictions_prob = xgboost_classifier_best_estimator.predict_proba(X_test)
xgboost_classifier_predictions_prob_df = pd.DataFrame()
xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[0]] = xgboost_classifier_predictions_prob[:,0]
xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[1]] = xgboost_classifier_predictions_prob[:,1]


# Generate Model Metrics
# NOTE(review): f1/precision/recall are called with scikit-learn's defaults,
# which presumably means binary averaging with positive label 1 -- confirm.
xgboost_classifier_accuracy = accuracy_score(y_test, xgboost_classifier_predictions.iloc[:,0])
xgboost_classifier_f1_score = f1_score(y_test, xgboost_classifier_predictions.iloc[:,0])
xgboost_classifier_precision = precision_score(y_test, xgboost_classifier_predictions.iloc[:,0])
xgboost_classifier_recall = recall_score(y_test, xgboost_classifier_predictions.iloc[:,0])
xgboost_classifier_roc_auc_score = roc_auc_score(y_test, xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[1]])

# Long-format metrics table: one row per (model, metric, value)
xgboost_classifier_performance_metrics = [['xgboost_classifier','accuracy',xgboost_classifier_accuracy],
                                          ['xgboost_classifier','f1_score',xgboost_classifier_f1_score],
                                          ['xgboost_classifier','precision', xgboost_classifier_precision],
                                          ['xgboost_classifier','recall', xgboost_classifier_recall],
                                          ['xgboost_classifier','roc_auc_score', xgboost_classifier_roc_auc_score]]
xgboost_classifier_performance_metrics = pd.DataFrame(xgboost_classifier_performance_metrics, columns=['model','metric', 'value'])

# ROC curve points and area, computed from the second class's probabilities
fpr, tpr, thresholds = roc_curve(y_test, xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[1]])
roc_auc = auc(fpr, tpr)

# ROC Curve plot
xgboost_classifier_roc_auc_plot, xgboost_classifier_roc_auc_plot_ax = plt.subplots()
xgboost_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
xgboost_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
# Set axis labels and title
xgboost_classifier_roc_auc_plot_ax.set_xlabel('False Positive Rate')
xgboost_classifier_roc_auc_plot_ax.set_ylabel('True Positive Rate')
xgboost_classifier_roc_auc_plot_ax.set_title('xgboost_classifier ROC Curve')
# Add legend
xgboost_classifier_roc_auc_plot_ax.legend()


print(xgboost_classifier_performance_metrics[xgboost_classifier_performance_metrics['metric'] == 'roc_auc_score'])

# Lift Chart
# Pair every test label with the predicted probability from the second
# probability column.
aux_df = pd.DataFrame()
aux_df['y_real'] = y_test
aux_df['y_proba'] = xgboost_classifier_predictions_prob_df.iloc[:,1].values

# Sort by predicted probability, highest first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Work on plain arrays from here on: positional logic does not depend on the
# index labels being unique and avoids rescanning the frame once per row.
is_positive = (aux_df['y_real'] == 1).to_numpy()
sorted_proba = aux_df['y_proba'].to_numpy()

# Find the total positive ratio of the whole dataset
total_positive_ratio = is_positive.sum() / aux_df.shape[0]

# For each row, the subset "probability >= this row's probability" is the
# leading run of the sorted data up to and including the last tied row.
# Searching the negated (therefore ascending) scores with side='right'
# returns exactly the length of that run.
subset_sizes = np.searchsorted(-sorted_proba, -sorted_proba, side='right')
subset_positives = np.cumsum(is_positive)[subset_sizes - 1]

# Lift = positive ratio inside the subset relative to the overall positive ratio
lift_values = ((subset_positives / subset_sizes) / total_positive_ratio).tolist()

# Plot the lift curve
xgboost_classifier_lift_plot, xgboost_classifier_lift_plot_ax = plt.subplots()
xgboost_classifier_lift_plot_ax.set_xlabel('Proportion')
xgboost_classifier_lift_plot_ax.set_ylabel('Lift')
xgboost_classifier_lift_plot_ax.set_title('xgboost_classifier Lift Curve')

# x axis: evenly spaced positions in [0, 1], one per row in descending score order
x_vals = np.linspace(0, 1, num=len(lift_values))
xgboost_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# add dashed horizontal line at lift of 1 (no better than the overall positive rate)
xgboost_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the final model comparison
model_comparison_list.append(xgboost_classifier_performance_metrics)
##### End of Model Pipeline for XGBoost Classifier #####
##### Model Pipeline for GBT Classifier #####
# Tunes a GradientBoostingClassifier inside the shared preprocessing pipeline,
# scores the best candidate on the held-out test split, draws ROC and lift
# curves and registers the metrics for the final model comparison.
# Relies on names created outside this section: np, pd, Pipeline, GridSearchCV,
# preprocessor, X_train, y_train, X_test, y_test and model_comparison_list.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Grid keys use "<pipeline step>__<parameter>" so each value is routed to the
# 'gbt_classifier' step of the pipeline below.
# The learning-rate grid starts at 0.1 rather than 0.0: a rate of 0.0 scales
# every tree's contribution to nothing, so those candidates cannot learn (and
# some scikit-learn releases reject the value outright).
# NOTE(review): n_estimators (100..9100) and max_depth (1..9001) are kept as
# generated, but the grid is 5*10*5*10 = 2,500 candidates on 5 folds each, and
# depths in the thousands are presumably equivalent to unbounded trees --
# consider narrowing both ranges before running this.
gbt_classifier_param_grid = {
    "gbt_classifier__learning_rate": np.arange(0.1, 1.0, 0.2),
    "gbt_classifier__n_estimators": np.arange(100, 10000, 1000),
    "gbt_classifier__subsample": np.arange(0.1, 1.0, 0.2),
    "gbt_classifier__max_depth": np.arange(1, 10000, 1000),
}


# Create the pipeline: preprocessing followed by the estimator
# NOTE(review): no random_state is passed, so results can differ between runs.
gbt_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('gbt_classifier', GradientBoostingClassifier())
])

# Create the grid search (5-fold cross-validation, candidates ranked by accuracy)
gbt_classifier_grid_search = GridSearchCV(estimator=gbt_classifier_pipe, param_grid=gbt_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=3)
gbt_classifier_grid_search.fit(X_train, y_train)

# Keep the pipeline fitted with the best hyperparameters
gbt_classifier_best_estimator = gbt_classifier_grid_search.best_estimator_

# Store the per-candidate cross-validation results as a dataframe
gbt_classifier_search_results = pd.DataFrame(gbt_classifier_grid_search.cv_results_)

# Model metrics

# Generate Predictions (single-column frame of predicted labels)
gbt_classifier_predictions = pd.DataFrame(gbt_classifier_best_estimator.predict(X_test))

# Class probabilities, one column per class label.
# Only the first two classes are kept, i.e. a binary target is assumed.
gbt_classifier_predictions_prob = gbt_classifier_best_estimator.predict_proba(X_test)
gbt_classifier_predictions_prob_df = pd.DataFrame()
gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[0]] = gbt_classifier_predictions_prob[:,0]
gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[1]] = gbt_classifier_predictions_prob[:,1]


# Generate Model Metrics
# NOTE(review): f1/precision/recall are called with scikit-learn's defaults,
# which presumably means binary averaging with positive label 1 -- confirm.
gbt_classifier_accuracy = accuracy_score(y_test, gbt_classifier_predictions.iloc[:,0])
gbt_classifier_f1_score = f1_score(y_test, gbt_classifier_predictions.iloc[:,0])
gbt_classifier_precision = precision_score(y_test, gbt_classifier_predictions.iloc[:,0])
gbt_classifier_recall = recall_score(y_test, gbt_classifier_predictions.iloc[:,0])
gbt_classifier_roc_auc_score = roc_auc_score(y_test, gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[1]])

# Long-format metrics table: one row per (model, metric, value)
gbt_classifier_performance_metrics = [['gbt_classifier','accuracy',gbt_classifier_accuracy],
                                      ['gbt_classifier','f1_score',gbt_classifier_f1_score],
                                      ['gbt_classifier','precision', gbt_classifier_precision],
                                      ['gbt_classifier','recall', gbt_classifier_recall],
                                      ['gbt_classifier','roc_auc_score', gbt_classifier_roc_auc_score]]
gbt_classifier_performance_metrics = pd.DataFrame(gbt_classifier_performance_metrics, columns=['model','metric', 'value'])

# ROC curve points and area, computed from the second class's probabilities
fpr, tpr, thresholds = roc_curve(y_test, gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[1]])
roc_auc = auc(fpr, tpr)

# ROC Curve plot
gbt_classifier_roc_auc_plot, gbt_classifier_roc_auc_plot_ax = plt.subplots()
gbt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
gbt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
# Set axis labels and title
gbt_classifier_roc_auc_plot_ax.set_xlabel('False Positive Rate')
gbt_classifier_roc_auc_plot_ax.set_ylabel('True Positive Rate')
gbt_classifier_roc_auc_plot_ax.set_title('gbt_classifier ROC Curve')
# Add legend
gbt_classifier_roc_auc_plot_ax.legend()


print(gbt_classifier_performance_metrics[gbt_classifier_performance_metrics['metric'] == 'roc_auc_score'])

# Lift Chart
# Pair every test label with the predicted probability from the second
# probability column.
aux_df = pd.DataFrame()
aux_df['y_real'] = y_test
aux_df['y_proba'] = gbt_classifier_predictions_prob_df.iloc[:,1].values

# Sort by predicted probability, highest first
aux_df = aux_df.sort_values('y_proba', ascending=False)

# Work on plain arrays from here on: positional logic does not depend on the
# index labels being unique and avoids rescanning the frame once per row.
is_positive = (aux_df['y_real'] == 1).to_numpy()
sorted_proba = aux_df['y_proba'].to_numpy()

# Find the total positive ratio of the whole dataset
total_positive_ratio = is_positive.sum() / aux_df.shape[0]

# For each row, the subset "probability >= this row's probability" is the
# leading run of the sorted data up to and including the last tied row.
# Searching the negated (therefore ascending) scores with side='right'
# returns exactly the length of that run.
subset_sizes = np.searchsorted(-sorted_proba, -sorted_proba, side='right')
subset_positives = np.cumsum(is_positive)[subset_sizes - 1]

# Lift = positive ratio inside the subset relative to the overall positive ratio
lift_values = ((subset_positives / subset_sizes) / total_positive_ratio).tolist()

# Plot the lift curve
gbt_classifier_lift_plot, gbt_classifier_lift_plot_ax = plt.subplots()
gbt_classifier_lift_plot_ax.set_xlabel('Proportion')
gbt_classifier_lift_plot_ax.set_ylabel('Lift')
gbt_classifier_lift_plot_ax.set_title('gbt_classifier Lift Curve')

# x axis: evenly spaced positions in [0, 1], one per row in descending score order
x_vals = np.linspace(0, 1, num=len(lift_values))
gbt_classifier_lift_plot_ax.plot(x_vals, lift_values, color='b')

# add dashed horizontal line at lift of 1 (no better than the overall positive rate)
gbt_classifier_lift_plot_ax.axhline(y=1, color='gray', linestyle='--', linewidth=3)


# Register this model's metrics for the final model comparison
model_comparison_list.append(gbt_classifier_performance_metrics)
##### End of Model Pipeline for GBT Classifier #####
##### Model Comparison #####
# Ranks every registered model by ROC AUC and scores prediction_df with the winner.

# Stack the per-model metric tables, order all rows from highest to lowest
# value, then keep only the ROC AUC rows so the first row is the best model.
all_model_metrics = pd.concat(model_comparison_list)
ranked_metrics = all_model_metrics.sort_values(by=['value'], ascending=False)
is_roc_auc_row = ranked_metrics['metric'] == 'roc_auc_score'
table = ranked_metrics[is_roc_auc_row]
print(table)

# Pull the winner's fields out of the first (highest-scoring) row
best_model_name = table['model'].iloc[0]
best_model_value = table['value'].iloc[0]
best_model_metric = table['metric'].iloc[0]
print(f"The best model is {best_model_name} with {best_model_value} as {best_model_metric}")


# Predict with the best model. eval() resolves the variable named
# "<model>_best_estimator"; the name is built from the model labels stored in
# the metrics tables, not from external input.
best_estimator_expression = best_model_name + "_best_estimator"
test_predictions = eval(best_estimator_expression).predict(prediction_df)
print('Predictions from best model are stored in test_predictions')
