In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu, chi2_contingency, ttest_ind
# automated feature engineering
import featuretools as ft

# Silence every warning raised from this point on. Called with only the
# 'ignore' action, this filter is not restricted to pandas: it matches all
# warning categories coming from any module.
import warnings 
warnings.filterwarnings('ignore')

# ANSI escape sequences used to wrap printed text in bold.
start = "\033[1m"  # Bold text
end = "\033[0;0m"  # Reset text
from IPython.core.interactiveshell import InteractiveShell
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

# `warnings` is already imported earlier in this cell; the repeat is harmless.
import warnings
# Explicitly ignore UserWarning as well.
warnings.simplefilter('ignore', UserWarning)
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import sys
# Put the parent directory on the import path — presumably where `helpers` lives; verify.
sys.path.append('../')
# NOTE(review): wildcard import — the names it adds to the namespace cannot be
# determined from this notebook alone.
from helpers import *

In [8]:
# Echo the row/column display limits currently in effect.
pd.options.display.max_rows, pd.options.display.max_columns

# Lift every pandas display limit so DataFrames are printed in full.
# Reference:
# https://thispointer.com/python-pandas-how-to-display-full-dataframe-i-e-print-all-rows-columns-without-truncation/
for display_option in (
    "display.max_rows",      # None -> no cap on printed rows (pandas default: 60)
    "display.max_columns",   # None -> never truncate the column list
    "display.width",         # None -> auto-detect the output width
    "display.max_colwidth",  # None -> never truncate the contents of a cell
):
    pd.set_option(display_option, None)

(None, None)

In [9]:
# Read the four CSV files (train/test features and targets) in a single pass;
# the files are read in the same order as the names on the left-hand side.
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    (
        "../training_data/X_train.csv",
        "../training_data/X_test.csv",
        "../training_data/y_train.csv",
        "../training_data/y_test.csv",
    ),
)

## Baseline Models

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score
import pandas as pd
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier

def evaluate_model(model, model_params, X_train, y_train, X_test, y_test):
    """Fit ``model`` on mean-imputed training data and tabulate its test metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params``, ``fit``, ``predict`` and
        ``predict_proba``. It is configured and fitted in place.
    model_params : dict
        Keyword arguments forwarded to ``model.set_params``.
    X_train, X_test : array-like
        Feature tables. Missing values are replaced by the per-column mean
        learned from ``X_train`` only.
    y_train, y_test : array-like
        Targets for the training and test splits.

    Returns
    -------
    pandas.DataFrame
        The transposed ``classification_report`` dictionary, extended with an
        'Accuracy' row and an 'AUC' row (value stored in the ``precision``
        column, ``recall`` / ``f1-score`` set to NaN) and a ``model`` column
        holding the estimator's class name.
    """
    model.set_params(**model_params)

    # The imputer learns its column means from the training split and reuses
    # them on the test split.
    mean_imputer = SimpleImputer(strategy='mean')
    train_features = mean_imputer.fit_transform(X_train)
    test_features = mean_imputer.transform(X_test)

    model.fit(train_features, y_train)

    predicted_labels = model.predict(test_features)
    # Column 1 of predict_proba is used as the score fed to the AUC.
    positive_scores = model.predict_proba(test_features)[:, 1]

    auc_value = roc_auc_score(y_test, positive_scores)
    accuracy_value = accuracy_score(y_test, predicted_labels)
    report_dict = classification_report(y_test, predicted_labels, output_dict=True)

    # One row per report entry, one column per metric.
    metrics_df = pd.DataFrame(report_dict).T

    # Append the two scalar metrics as extra rows; only `precision` carries a value.
    metrics_df.loc['Accuracy', 'precision'] = accuracy_value
    metrics_df.loc['AUC', 'precision'] = auc_value
    metrics_df.loc[['Accuracy', 'AUC'], ['recall', 'f1-score']] = np.nan

    # Tag every row with the estimator's class name.
    metrics_df['model'] = model.__class__.__name__

    return metrics_df



In [11]:
# import time
# start = time.time()
# xgb_params = {'tree_method': 'gpu_hist'}
# xgb_metrics_df = evaluate_model(XGBClassifier(random_state=50), xgb_params, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[XGB] Time taken in seconds: ", end - start)

# # RandomForestClassifier using 4 cores
# start = time.time()
# rf_params = {'n_jobs': 4}
# rf_metrics_df = evaluate_model(RandomForestClassifier(random_state=50), rf_params, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[RF] Time taken in seconds: ", end - start)

# # LightGBM
# start = time.time()
# lgb_params = {'objective': 'binary', 'metric': 'auc', 'seed': 50, 'verbose': -1}
# lgb_metrics_df = evaluate_model(lgb.LGBMClassifier(), lgb_params, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[LGBM] Time taken in seconds: ", end - start)

# # CART (Decision Tree)
# start = time.time()
# cart_metrics_df = evaluate_model(DecisionTreeClassifier(random_state=50), {}, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[CART] Time taken in seconds: ", end - start)

# # Extra Trees
# start = time.time()
# et_metrics_df = evaluate_model(ExtraTreesClassifier(random_state=50), {}, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[ExtraTrees] Time taken in seconds: ", end - start)

# # CatBoost
# start = time.time()
# catboost_params = {
#     'task_type': 'GPU',
#     'devices': '0',  # Specifies the GPU ID to use. For multiple GPUs, use '0:1:2' for GPUs 0, 1, and 2, for example.
#     'random_seed': 50, 
#     'silent': True
# }

# # Evaluate CatBoost model
# catboost_metrics_df = evaluate_model(CatBoostClassifier(), catboost_params, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[CB] Time taken in seconds: ", end - start)

# #adaboost
# start = time.time()
# ada_metrics_df = evaluate_model(AdaBoostClassifier(random_state=50),{}, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[ADA] Time taken in seconds: ", end - start)

# #ID3
# start = time.time()
# dt_params = {'criterion': 'entropy', 'random_state': 50}
# id3_metrics_df = evaluate_model(DecisionTreeClassifier(), dt_params, X_train, y_train, X_test, y_test)
# end = time.time()
# print("[ID3] Time taken in seconds: ", end - start)
# print(id3_metrics_df)

# xgb_metrics_df
# rf_metrics_df
# lgb_metrics_df
# catboost_metrics_df
# et_metrics_df
# cart_metrics_df
# ada_metrics_df
# id3_metrics_df

#### XGB fine-tuning

In [12]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from skopt.callbacks import DeltaYStopper
import time
import matplotlib.pyplot as plt


class IterationTrackingCallback:
    """Callable that logs and records the best result after each search iteration.

    Attributes:
        start_time: wall-clock timestamp the next elapsed time is measured from.
        best_scores: one entry per call, the sign-flipped ``res.fun``.
        best_parameters: one dict per call, mapping dimension name to the
            matching entry of ``res.x``.
    """

    def __init__(self):
        # Histories grow by one entry every time the callback is invoked.
        self.best_scores = []
        self.best_parameters = []
        # Reference point for the first elapsed-time report.
        self.start_time = time.time()

    def __call__(self, res):
        """Print and store the best score / parameters found in ``res``.

        ``res`` must expose ``fun``, ``x`` and ``space.dimension_names``.
        Returns ``None``.
        """
        elapsed = time.time() - self.start_time
        print(f"\nIteration completed in {elapsed:.2f} seconds.")

        # Flip the sign of the objective value so it reads as a score.
        score = -res.fun
        self.best_scores.append(score)
        print(f"Best score so far: {score:.4f}")

        # Pair each dimension name with the corresponding best value.
        params = {name: value for name, value in zip(res.space.dimension_names, res.x)}
        self.best_parameters.append(params)
        print("Best parameters so far:")
        for name in params:
            print(f"{name}: {params[name]}")

        # Restart the clock for the next iteration.
        self.start_time = time.time()



# Bayesian hyper-parameter search for the XGBoost classifier.
# The estimator is built once; the search injects the tuned hyper-parameters.
model = XGBClassifier(tree_method='gpu_hist', random_state=42)

# Ranges explored by the search, one entry per tuned hyper-parameter.
search_spaces = {
    'n_estimators': Integer(250, 300),
    'max_depth': Integer(3, 6),
    'learning_rate': Real(0.05, 0.2, 'log-uniform'),
    'subsample': Real(0.85, 1.0),
    'colsample_bytree': Real(0.7, 1.0),
    'colsample_bylevel': Real(0.7, 1.0),
    'colsample_bynode': Real(0.7, 1.0),
    'min_child_weight': Integer(1, 3),
    'gamma': Real(0, 5),
    'reg_alpha': Real(1e-5, 1.0, 'log-uniform'),
    'reg_lambda': Real(1e-5, 1.0, 'log-uniform'),
}

opt = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=30,  # Reduced iterations for speed
    scoring='roc_auc',
    cv=5,
    n_jobs=1,  # Keep as 1 for GPU usage
    return_train_score=True,
    refit=True,
    random_state=42
)

# Create the callback immediately before fitting so the first reported
# iteration time does not include the setup above.
iteration_callback = IterationTrackingCallback()
opt.fit(X_train, y_train, callback=[iteration_callback])

# Plot the best score recorded by the callback after each iteration.
plt.figure(figsize=(10, 6))
plt.plot(iteration_callback.best_scores, marker='o')
plt.title('Improvement of Best AUC Score over Iterations')
plt.xlabel('Iteration')
plt.ylabel('Best AUC Score')
plt.grid(True)
plt.show()




Iteration completed in 65.06 seconds.
Best score so far: 0.7830
Best parameters so far:
colsample_bylevel: 0.8230311876559941
colsample_bynode: 0.9183177229531976
colsample_bytree: 0.9798603996543502
gamma: 1.5789979674352437
learning_rate: 0.12660162270835032
max_depth: 4
min_child_weight: 2
n_estimators: 287
reg_alpha: 0.0003329021156509417
reg_lambda: 0.017336360995622795
subsample: 0.9324945912714676

Iteration completed in 58.80 seconds.
Best score so far: 0.7830
Best parameters so far:
colsample_bylevel: 0.8230311876559941
colsample_bynode: 0.9183177229531976
colsample_bytree: 0.9798603996543502
gamma: 1.5789979674352437
learning_rate: 0.12660162270835032
max_depth: 4
min_child_weight: 2
n_estimators: 287
reg_alpha: 0.0003329021156509417
reg_lambda: 0.017336360995622795
subsample: 0.9324945912714676

Iteration completed in 65.12 seconds.
Best score so far: 0.7831
Best parameters so far:
colsample_bylevel: 0.8334497536903456
colsample_bynode: 0.9756167565008131
colsample_bytree: 