In [1]:
import json
import re
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

data = pd.read_csv("./clean_data_dropped.csv")
targets = data.pop(list(data.columns)[-1])


In [2]:
X_train, X_test, y_train, y_test = train_test_split(
    data, targets, test_size=0.3, random_state=121, stratify=targets
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.67, random_state=121, stratify=y_test
)
data = {}
data["train"] = [X_train, y_train]
data["val"] = [X_val, y_val]
data["test"] = [X_test, y_test]

unique, counts = np.unique(y_train, return_counts=True)
class_counts = dict(zip(unique, counts))
class_weights = {}
total_samples = np.sum(counts)
for cls in class_counts:
    class_weights[cls] = total_samples / (len(class_counts) * class_counts[cls])
sample_weights = np.array([class_weights[cls] for cls in y_train])


In [3]:
def compute_metrics_sklearn(X, y, clf):
    y_pred = clf.predict(X)
    y_pred_prob = clf.predict_proba(X)[:, 1]

    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    auc_roc = roc_auc_score(y, y_pred_prob)
    auc_pr = average_precision_score(y, y_pred_prob)

    metrics_dict = {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "auc_roc": auc_roc,
        "auc_pr": auc_pr,
    }
    return metrics_dict


# Testing a basic GBDT as a baseline
clf_1 = HistGradientBoostingClassifier()

clf_1.fit(X_train, y_train, sample_weight=sample_weights)
compute_metrics_sklearn(X=X_test, y=y_test, clf=clf_1)


{'precision': 0.2792207792207792,
 'recall': 0.7610619469026548,
 'f1_score': 0.4085510688836105,
 'auc_roc': 0.942224214617912,
 'auc_pr': 0.48108561828753127}

### Meat of the stuff here

In [7]:
from predictors import XGBoostClassifier, LightGBMClassifier, WeightedEnsembleClassifier
from custom_voter_boost_dropped_data import all_estimators, compute_metrics

xgb_1 = XGBoostClassifier(data=data)
xgb_1.fit(None, None)

pprint({"Training Subset": f"{xgb_1.score(X=data['train'][0],y=data['train'][1])}"})
pprint({"Validation Subset": f"{xgb_1.score(X=data['val'][0],y=data['val'][1])}"})


[0]	train-aucpr:0.32792	validation-aucpr:0.30732
[10]	train-aucpr:0.72797	validation-aucpr:0.34680
[13]	train-aucpr:0.83457	validation-aucpr:0.39438
{'Training Subset': "{'precision': 0.38109452736318405, 'recall': "
                    "0.9745547073791349, 'f1_score': 0.5479256080114449, "
                    "'auc_roc': 0.9944126549689318, 'auc_pr': "
                    "0.8572235459576539, 'threshold': 0.6092363}"}
{'Validation Subset': "{'precision': 0.2074074074074074, 'recall': "
                      "0.509090909090909, 'f1_score': 0.2947368421052632, "
                      "'auc_roc': 0.9095604649372642, 'auc_pr': "
                      "0.3975650317034448, 'threshold': 0.6092363}"}


In [11]:
lgbm_1 = LightGBMClassifier(data=data)
lgbm_1.fit(None, None)
pprint({"Training Subset": f"{xgb_1.score(X=data['train'][0],y=data['train'][1])}"})
pprint({"Validation Subset": f"{xgb_1.score(X=data['val'][0],y=data['val'][1])}"})


[LightGBM] [Info] Number of positive: 393, number of negative: 16701
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16065
[LightGBM] [Info] Number of data points in the train set: 17094, number of used features: 63
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.022991 -> initscore=-3.749414
[LightGBM] [Info] Start training from score -3.749414
{'Training Subset': "{'precision': 0.38109452736318405, 'recall': "
                    "0.9745547073791349, 'f1_score': 0.5479256080114449, "
                    "'auc_roc': 0.9944126549689318, 'auc_pr': "
                    "0.8572235459576539, 'threshold': 0.6092363}"}
{'Validation Subset': "{'precision': 0.2074074074074074, 'recall': "
                      "0.509090909090909, 'f1_score': 0.2947368421052632, "
                      "'auc_roc': 0.9095604649372642, 'auc_pr': "
                      "0.3975650317034448, 'threshold': 0.6092

In [12]:
voter = WeightedEnsembleClassifier(estimators=all_estimators, weights=None)
wts, mean_threshold = voter.score(data["val"][0], data["val"][1])
voter_2 = WeightedEnsembleClassifier(estimators=all_estimators, weights=wts)
preds = voter_2.predict(X=data["test"][0], threshold=mean_threshold)
pprint(compute_metrics(y_pred=preds, y_test=data["test"][1]))


{'f1_score': 0.46153846153846156,
 'precision': 0.8372093023255814,
 'recall': 0.3185840707964602}
