In [1]:
import json
import re
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

data = pd.read_csv("./clean_data.csv")
targets = data.pop(list(data.columns)[-1])


In [2]:
X_train, X_test, y_train, y_test = train_test_split(
    data, targets, test_size=0.3, random_state=121, stratify=targets
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.67, random_state=121, stratify=y_test
)
data = {}
data["train"] = [X_train, y_train]
data["val"] = [X_val, y_val]
data["test"] = [X_test, y_test]

unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))
class_weights = {}
total_samples = np.sum(counts)
for cls in class_counts:
    class_weights[cls] = total_samples / (len(class_counts) * class_counts[cls])
sample_weights = np.array([class_weights[cls] for cls in y_train])


In [3]:
def compute_metrics_sklearn(X, y, clf):
    y_pred = clf.predict(X)
    y_pred_prob = clf.predict_proba(X)[:, 1]

    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    auc_roc = roc_auc_score(y, y_pred_prob)
    auc_pr = average_precision_score(y, y_pred_prob)

    metrics_dict = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "auc_roc": auc_roc,
        "auc_pr": auc_pr,
    }
    return metrics_dict


# Testing a basic GBDT as a baseline
clf_1 = HistGradientBoostingClassifier()

clf_1.fit(X_train, y_train, sample_weight=sample_weights)
compute_metrics_sklearn(X=X_test, y=y_test, clf=clf_1)


{'precision': 0.4168126094570928,
 'recall': 0.7555555555555555,
 'f1_score': 0.5372460496613995,
 'auc_roc': 0.954606079602066,
 'auc_pr': 0.7111286389430058}

### Meat of the stuff here

In [7]:
from predictors import XGBoostClassifier, LightGBMClassifier, WeightedEnsembleClassifier
from custom_voter_boost_all_data import all_estimators, compute_metrics

xgb_1 = XGBoostClassifier(data=data)
xgb_1.fit(None, None)

pprint({"Training Subset": f"{xgb_1.score(X=data['train'][0],y=data['train'][1])}"})
pprint({"Validation Subset": f"{xgb_1.score(X=data['val'][0],y=data['val'][1])}"})


[0]	train-aucpr:0.54374	validation-aucpr:0.51460
[9]	train-aucpr:0.85679	validation-aucpr:0.66455
{'Training Subset': "{'precision': 0.43474646716541976, 'recall': "
                    "0.9526411657559198, 'f1_score': 0.5970319634703195, "
                    "'auc_roc': 0.9874339699113004, 'auc_pr': "
                    "0.870033272836601, 'threshold': 0.5278838}"}
{'Validation Subset': "{'precision': 0.33620689655172414, 'recall': "
                      "0.7548387096774194, 'f1_score': 0.46520874751491054, "
                      "'auc_roc': 0.9405523943750855, 'auc_pr': "
                      "0.6720859202648976, 'threshold': 0.5278838}"}


In [8]:
lgbm_1 = LightGBMClassifier(data=data)
lgbm_1.fit(None, None)
pprint({"Training Subset": f"{xgb_1.score(X=data['train'][0],y=data['train'][1])}"})
pprint({"Validation Subset": f"{xgb_1.score(X=data['val'][0],y=data['val'][1])}"})


[LightGBM] [Info] Number of positive: 1098, number of negative: 21689
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 22787, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048185 -> initscore=-2.983315
[LightGBM] [Info] Start training from score -2.983315
{'Training Subset': "{'precision': 0.43474646716541976, 'recall': "
                    "0.9526411657559198, 'f1_score': 0.5970319634703195, "
                    "'auc_roc': 0.9874339699113004, 'auc_pr': "
                    "0.870033272836601, 'threshold': 0.5278838}"}
{'Validation Subset': "{'precision': 0.33620689655172414, 'recall': "
                      "0.7548387096774194, 'f1_score': 0.46520874751491054, "
                      "'auc_roc': 0.9405523943750855, 'auc_pr': "
                      "0.6720859202648976, 'threshold': 0.5278838}"}


In [9]:
voter = WeightedEnsembleClassifier(estimators=all_estimators, weights=None)
wts, mean_threshold = voter.score(data["val"][0], data["val"][1])
voter_2 = WeightedEnsembleClassifier(estimators=all_estimators, weights=wts)
preds = voter_2.predict(X=data["test"][0], threshold=mean_threshold)
pprint(compute_metrics(y_pred=preds, y_test=data["test"][1]))


{'f1_score': 0.6894075403949731,
 'precision': 0.7933884297520661,
 'recall': 0.6095238095238096}
