In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [2]:
df = pd.read_csv('./data.csv', header=0, index_col=0)

In [3]:
enc = OneHotEncoder(handle_unknown='ignore')
tmp = pd.DataFrame(enc.fit_transform(df[['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                         'NAME_HOUSING_TYPE']]).toarray())
tmp.columns = enc.get_feature_names(['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                                     'NAME_HOUSING_TYPE'])
tmp = tmp.set_index(df.index)
df = df.replace(['Y', 'F'], 1)
df = df.replace(['N', 'M'], 0)
df = df.join(tmp)

df = df.drop(columns=['OCCUPATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
                      'NAME_HOUSING_TYPE'])

In [4]:
x = df.drop(['score', 'score2'], axis=1)
y = df['score2']

train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3)

In [5]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedRandomForestClassifier.html
clf = BalancedRandomForestClassifier(n_estimators=5000)
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84     10855
           1       0.02      0.87      0.05        83

    accuracy                           0.73     10938
   macro avg       0.51      0.80      0.44     10938
weighted avg       0.99      0.73      0.84     10938



In [6]:
roc_auc_score(test_y, pred_y)

0.7971619319285432

In [None]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.EasyEnsembleClassifier.html#imblearn.ensemble.EasyEnsembleClassifier
clf2 = EasyEnsembleClassifier(n_estimators=1000)
clf2.fit(train_x, train_y)
pred2_y = clf2.predict(test_x)
print(classification_report(test_y, pred2_y))

In [None]:
roc_auc_score(test_y, pred2_y)

In [None]:
# documentation: https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.ensemble.BalancedBaggingClassifier.html#imblearn.ensemble.BalancedBaggingClassifier
# base estimator is 
clf3 = BalancedBaggingClassifier(n_estimators=5000)
clf3.fit(train_x, train_y)
pred3_y = clf3.predict(test_x)
print(classification_report(test_y, pred3_y))

In [None]:
roc_auc_score(test_y, pred3_y)