In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
df = pd.read_csv('./data.csv', header=0, index_col=0)

In [3]:
enc = OneHotEncoder(handle_unknown='ignore')
tmp = pd.DataFrame(enc.fit_transform(df[['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                         'NAME_HOUSING_TYPE']]).toarray())
tmp.columns = enc.get_feature_names(['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                     'NAME_HOUSING_TYPE'])
tmp = tmp.set_index(df.index)
df = df.replace(['Y', 'F'], 1)
df = df.replace(['N', 'M'], 0)
df = df.join(tmp)

df = df.drop(columns=['OCCUPATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                      'NAME_HOUSING_TYPE'])

In [4]:
x = df.drop(['score', 'score2'], axis=1)
y = df['score2']

train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3)

In [5]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedRandomForestClassifier.html
clf = BalancedRandomForestClassifier(n_estimators=5000)
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86     10838
           1       0.03      0.69      0.05       100

    accuracy                           0.75     10938
   macro avg       0.51      0.72      0.45     10938
weighted avg       0.99      0.75      0.85     10938



In [6]:
roc_auc_score(test_y, pred_y)

0.7212225502860307

In [9]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedBaggingClassifier.html#imblearn.ensemble.BalancedBaggingClassifier
# base estimator: Decision Tree
clf3 = BalancedBaggingClassifier(n_estimators=5000)
clf3.fit(train_x, train_y)
pred3_y = clf3.predict(test_x)
print(classification_report(test_y, pred3_y))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91     10838
           1       0.03      0.57      0.06       100

    accuracy                           0.84     10938
   macro avg       0.51      0.71      0.49     10938
weighted avg       0.99      0.84      0.90     10938



In [10]:
roc_auc_score(test_y, pred3_y)

0.7053727625023066

In [11]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.EasyEnsembleClassifier.html#imblearn.ensemble.EasyEnsembleClassifier
# base estimator: AdaBoostClassifier
clf2 = EasyEnsembleClassifier(n_estimators=100)
clf2.fit(train_x, train_y)
pred2_y = clf2.predict(test_x)
print(classification_report(test_y, pred2_y))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80     10838
           1       0.02      0.67      0.04       100

    accuracy                           0.67     10938
   macro avg       0.51      0.67      0.42     10938
weighted avg       0.99      0.67      0.79     10938



In [12]:
roc_auc_score(test_y, pred2_y)

0.6696097065879313

Setting n_estimators to different values between 100 and 5000, every run of the above classifiers produced an AUC score ranging from 0.66 to 0.79 but not higher than that. Different parameters were used, including setting max_depth to different values for RandomForest, but the score never surpasses what has been encountered already.