In [1]:
import os
import sys
# Make the parent of the current working directory importable, so that
# packages living one level above the notebook can be imported below.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(1, parent_dir)

In [26]:
from sklearn.model_selection import train_test_split
from collections import Counter
from mgs_grf import MGSGRFOverSampler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import lightgbm as lgb
from sklearn.metrics import average_precision_score

In [27]:
## Load the original BankChurners data
from experiments.data.data import load_BankChurners_data_

X, y = load_BankChurners_data_()

## Column indices of X grouped by feature type; they are used further down
## to slice the numeric columns and the columns that get one-hot encoded.
categorical_features = [1, 3, 4, 5, 6]
numeric_features = [0, 2, *range(7, 19)]

In [28]:
## Split the data into train and test sets (20% held out, fixed seed)
## NOTE(review): the split is not stratified; with such a rare positive class,
## consider passing stratify=y so both sets keep the same class ratio - confirm intent.
X_train_imbalanced, X_test, y_train_imbalanced, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print("Bankchurners 1% imbalance ratio : ", Counter(y_train_imbalanced))

Bankchurners 1% imbalance ratio :  Counter({np.int64(0): 6802, np.int64(1): 66})


In [29]:
## Oversample the training split with MGS-GRF (the test split is left untouched)
mgs_grf = MGSGRFOverSampler(
    categorical_features=categorical_features,
    random_state=0,
)
X_train_balanced, y_train_balanced = mgs_grf.fit_resample(
    X_train_imbalanced,
    y_train_imbalanced,
)
## Class counts after resampling
print("Augmented data : ", Counter(y_train_balanced))



Augmented data :  Counter({np.int64(0): 6802, np.int64(1): 6802})


In [30]:
## One-hot encode the categorical columns; the encoder is fitted on the
## balanced training set and then reused, unchanged, for the test set.
enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_balanced_enc = np.hstack(
    (
        X_train_balanced[:, numeric_features],
        enc.fit_transform(X_train_balanced[:, categorical_features]),
    )
)
X_test_enc = np.hstack(
    (
        X_test[:, numeric_features],
        enc.transform(X_test[:, categorical_features]),
    )
)

## Train the final classifier on the augmented (balanced) data
clf = lgb.LGBMClassifier(n_estimators=100, verbosity=-1, random_state=0)
clf.fit(X_train_balanced_enc, y_train_balanced)

## Evaluate on the test set: average precision of the positive-class probabilities
y_pred_probas_mgs_grf = clf.predict_proba(X_test_enc)[:, 1]
pr_auc_mgs_grf = average_precision_score(y_test, y_pred_probas_mgs_grf)
print("PR AUC induced by MGS-GRF : ", pr_auc_mgs_grf)

PR AUC induced by MGS-GRF :  0.5767656364976163




In [31]:
## Comparison when applying no rebalancing strategy
## A dedicated encoder is fitted on the imbalanced training set and used for BOTH
## the train and the test matrices of this baseline, so their one-hot columns are
## guaranteed to line up. This also leaves the `enc` fitted for the MGS-GRF run
## untouched, which keeps the cells independent of execution order.
enc_imbalanced = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_train_imbalanced_enc = np.hstack((X_train_imbalanced[:,numeric_features],
                                    enc_imbalanced.fit_transform(X_train_imbalanced[:,categorical_features])))
X_test_imbalanced_enc = np.hstack((X_test[:,numeric_features],
                                   enc_imbalanced.transform(X_test[:,categorical_features])))

## Same classifier settings as the MGS-GRF run, so only the training data differs
clf_imbalanced = lgb.LGBMClassifier(n_estimators=100, verbosity=-1, random_state=0)
clf_imbalanced.fit(X_train_imbalanced_enc, y_train_imbalanced)

## Evaluate on the test features produced by the encoder this model was trained with
y_pred_probas_imbalanced = clf_imbalanced.predict_proba(X_test_imbalanced_enc)[:,1]
print("PR AUC induced by applying no rebalancing strategy : ", average_precision_score(y_test, y_pred_probas_imbalanced))

PR AUC induced by applying no rebalancing strategy :  0.401426481385112


