In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Feature engineering
from sklearn.impute  import SimpleImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# sampling
from imblearn.under_sampling import TomekLinks 
from imblearn.combine import SMOTETomek

#Modelling
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix

#from imblearn.ensemble import BalancedRandomForestClassifier

## Common Functions

In [2]:
def calc_sum_capital_paid_account_0_24m(data):
    # add ne variable (sm_capital_paid_account_0_24m)the data
    # sm_capital_paid_account_0_24m = capital_paid_account_0_12m + m_capital_paid_account_12_24m
    data['sum_capital_paid_account_0_24m'] = data.apply(lambda row : row['sum_capital_paid_account_0_12m'] + row['sum_capital_paid_account_12_24m'], axis=1)
    return data

In [3]:
def calc_num_of_paid_inv_0_12m(data):
    # add ne variable (num_of_paid_inv_0_12m)the data
    # num_of_paid_inv_0_12m = num_active_inv / num_active_div_by_paid_inv_0_12m
    data['num_of_paid_inv_0_12m'] = data.apply(lambda row: 0 if row['num_active_inv'] == 0 else row['num_active_inv'] / row['num_active_div_by_paid_inv_0_12m'], axis=1)
    return data

In [4]:
def calc_status_max_active_0_24(data):
    # add ne variable (status_max_active_0_24)the data
    # status_max_active_0_24 is the max(account_worst_status_0_3m, account_worst_status_12_24m, account_worst_status_3_6m, account_worst_status_6_12m)
    data['status_max_active_0_24'] = data.apply(lambda row: max(row['account_worst_status_0_3m'], row['account_worst_status_12_24m'], row['account_worst_status_3_6m'], row['account_worst_status_6_12m']), axis=1)
    return data

## Load the Data

In [5]:
data = pd.read_csv("../data/training_dataset.csv")
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [6]:
data['sum_capital_paid_account_0_12m']

0        0
1        0
2        0
3        0
4        0
        ..
89971    0
89972    0
89973    0
89974    0
89975    0
Name: sum_capital_paid_account_0_12m, Length: 89976, dtype: int64

## Adding new features

In [7]:
calc_sum_capital_paid_account_0_24m(data)

In [8]:
calc_num_of_paid_inv_0_12m(data)

In [9]:
calc_status_max_active_0_24(data)

In [10]:
data.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv,sum_capital_paid_account_0_24m,num_of_paid_inv_0_12m,status_max_active_0_24
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,0,0,0,178839,9.653333,1.0,0,13.0,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,2,0,0,0,49014,13.181389,,0,0.0,1.0
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,2,0,0,0,124839,11.561944,1.0,0,14.0,
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,0,0,0,324676,15.751111,1.0,0,32.0,
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,1,0,0,0,7100,12.698611,,0,0.0,


## Removing unwanted features

In [11]:
unwanted_fets = ['uuid', 'age', 'name_in_email', 'merchant_group', 'time_hours', 'avg_payment_span_0_12m', 'avg_payment_span_0_3m'
                 , 'account_incoming_debt_vs_paid_0_24m', 'num_arch_written_off_0_12m', "num_arch_written_off_12_24m", "recovery_debt", 'account_days_in_dc_12_24m'
                 , 'account_days_in_term_12_24m', 'num_arch_dc_0_12m','num_arch_dc_12_24m' , 'num_active_inv', 'max_paid_inv_0_12m', 'sum_capital_paid_account_0_12m', 'sum_capital_paid_account_12_24m'
                 , 'account_worst_status_0_3m', 'account_worst_status_12_24m', 'account_worst_status_3_6m', 'account_worst_status_6_12m']
data.drop(columns=unwanted_fets, inplace=True)

In [12]:
data.head()

Unnamed: 0,default,account_amount_added_12_24m,account_days_in_rem_12_24m,account_status,merchant_category,has_paid,max_paid_inv_0_24m,num_active_div_by_paid_inv_0_12m,num_arch_ok_0_12m,num_arch_ok_12_24m,...,status_2nd_last_archived_0_24m,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,sum_paid_inv_0_12m,worst_status_active_inv,sum_capital_paid_account_0_24m,num_of_paid_inv_0_12m,status_max_active_0_24
0,0.0,0,0.0,1.0,Dietary supplements,True,31638.0,0.153846,13,14,...,1,1,1,1,1,178839,1.0,0,13.0,1.0
1,0.0,0,0.0,1.0,Books & Magazines,True,13749.0,0.0,9,19,...,1,1,1,2,2,49014,,0,0.0,1.0
2,0.0,0,0.0,,Diversified entertainment,True,29890.0,0.071429,11,0,...,1,1,1,2,2,124839,1.0,0,14.0,
3,0.0,0,,,Diversified entertainment,True,40040.0,0.03125,31,21,...,1,1,1,1,1,324676,1.0,0,32.0,
4,0.0,0,0.0,,Electronic equipment & Related accessories,True,7100.0,0.0,1,0,...,0,0,1,1,1,7100,,0,0.0,


In [13]:
len(data.columns)

23

## Split dataset into train and test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('default', axis=1), # predictors
    data.default, # target
    test_size=0.2,
    random_state=0)  # for reproducibility

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((71980, 22), (17996, 22), (71980,), (17996,))

## Feature engineering

In [15]:
# define list for both categorical and continuous/numerical features
con_fets = ['num_arch_ok_0_12m', 'account_amount_added_12_24m', 'account_days_in_rem_12_24m', 'max_paid_inv_0_24m', 'num_active_div_by_paid_inv_0_12m', 'num_arch_ok_12_24m', 'num_arch_rem_0_12m', 'num_unpaid_bills', 'sum_paid_inv_0_12m', 'sum_capital_paid_account_0_24m', 'num_of_paid_inv_0_12m']
cat_fets = list(set(data.columns) - set(con_fets) - set(['default', 'worst_status_active_inv', 'account_status', 'status_max_active_0_24']))
#, 'worst_status_active_inv', 'account_status', 'status_max_active_0_24' 

In [16]:
# check for missing values in our continuous/numerical features
X_train[con_fets].isnull().mean().sort_values(ascending=False)

num_active_div_by_paid_inv_0_12m    0.231425
account_days_in_rem_12_24m          0.118533
num_of_paid_inv_0_12m               0.021464
num_arch_ok_0_12m                   0.000000
account_amount_added_12_24m         0.000000
max_paid_inv_0_24m                  0.000000
num_arch_ok_12_24m                  0.000000
num_arch_rem_0_12m                  0.000000
num_unpaid_bills                    0.000000
sum_paid_inv_0_12m                  0.000000
sum_capital_paid_account_0_24m      0.000000
dtype: float64

In [17]:
# check for missing values in our categorical features
X_train[cat_fets].isnull().mean().sort_values(ascending=False)

# missing categorical data will be handeled by the model

status_max_archived_0_6_months     0.0
status_last_archived_0_24m         0.0
status_max_archived_0_24_months    0.0
status_3rd_last_archived_0_24m     0.0
merchant_category                  0.0
has_paid                           0.0
status_2nd_last_archived_0_24m     0.0
status_max_archived_0_12_months    0.0
dtype: float64

In [18]:
# Fill missing numerical values 
imputer = SimpleImputer(strategy='most_frequent')
X_train[con_fets] = imputer.fit_transform(X_train[con_fets])
X_test[con_fets] = imputer.transform(X_test[con_fets])

In [19]:
scaler = MinMaxScaler()
X_train[con_fets] = scaler.fit_transform(X_train[con_fets])
X_test[con_fets] = scaler.transform(X_test[con_fets])

In [20]:
# encode merchant_category

ordinal_enc = OneHotEncoder()
X_train = ordinal_enc.fit_transform(X_train)
X_test = ordinal_enc.transform(X_test)

In [21]:
X_train.columns

Index(['account_amount_added_12_24m', 'account_days_in_rem_12_24m',
       'account_status', 'has_paid', 'max_paid_inv_0_24m',
       'num_active_div_by_paid_inv_0_12m', 'num_arch_ok_0_12m',
       'num_arch_ok_12_24m', 'num_arch_rem_0_12m', 'num_unpaid_bills',
       'status_last_archived_0_24m', 'status_2nd_last_archived_0_24m',
       'status_3rd_last_archived_0_24m', 'status_max_archived_0_6_months',
       'status_max_archived_0_12_months', 'status_max_archived_0_24_months',
       'sum_paid_inv_0_12m', 'worst_status_active_inv',
       'sum_capital_paid_account_0_24m', 'num_of_paid_inv_0_12m',
       'status_max_active_0_24', 'merchant_category_Diversified entertainment',
       'merchant_category_Diversified electronics',
       'merchant_category_Automotive Parts & Accessories',
       'merchant_category_Personal care & Body improvement',
       'merchant_category_Youthful Shoes & Clothing',
       'merchant_category_Books & Magazines',
       'merchant_category_Concept stores 

## Undersampling

In [23]:
tl = SMOTETomek()#TomekLinks(sampling_strategy='not majority')
X_res, y_res = tl.fit_resample(X_train.fillna(0), y_train)


In [24]:
X_res.shape, X_train.shape

((141784, 78), (71980, 78))

## Modelling

### Baseline model

In [25]:
xgb_model = XGBClassifier(learning_rate=0.001,
                            max_depth = 1, 
                            n_estimators = 100)
xgb_model.fit(X_res, y_res)
#scale_pos_weight=0                              





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=1, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
xgb_predict = xgb_model.predict(X_test)
confusion_mat = confusion_matrix(y_test,xgb_predict)
print("confusion_matrix ", confusion_mat)
class_report = classification_report(y_test,xgb_predict)
print("classification_report", class_report)
print("Accuracy per class", 100 * confusion_mat.diagonal()/confusion_mat.sum(axis=1))

confusion_matrix  [[10629  7104]
 [   54   209]]
classification_report               precision    recall  f1-score   support

         0.0       0.99      0.60      0.75     17733
         1.0       0.03      0.79      0.06       263

    accuracy                           0.60     17996
   macro avg       0.51      0.70      0.40     17996
weighted avg       0.98      0.60      0.74     17996

Accuracy per class [59.9390966  79.46768061]


In [27]:
"""
from numpy import mean
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier


# define model
model = XGBClassifier()
# define grid
weights = [1, 10, 25, 50, 75, 99, 100, 1000]
param_grid = dict(scale_pos_weight=weights)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid search
scoring=['accuracy','precision_macro','recall_macro']
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='precision_macro')
# execute the grid search
grid_result = grid.fit(X_train, y_train)
# report the best configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# report all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
"""

'\nfrom numpy import mean\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.model_selection import RepeatedStratifiedKFold\nfrom xgboost import XGBClassifier\n\n\n# define model\nmodel = XGBClassifier()\n# define grid\nweights = [1, 10, 25, 50, 75, 99, 100, 1000]\nparam_grid = dict(scale_pos_weight=weights)\n# define evaluation procedure\ncv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)\n# define grid search\nscoring=[\'accuracy\',\'precision_macro\',\'recall_macro\']\ngrid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring=\'precision_macro\')\n# execute the grid search\ngrid_result = grid.fit(X_train, y_train)\n# report the best configuration\nprint("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))\n# report all configurations\nmeans = grid_result.cv_results_[\'mean_test_score\']\nstds = grid_result.cv_results_[\'std_test_score\']\nparams = grid_result.cv_results_[\'params\']\nfor mean, stdev, 

In [28]:
xgb_grid_predict = grid.predict(X_test)
confusion_mat = confusion_matrix(y_test,xgb_grid_predict)
print("confusion_matrix ", confusion_mat)
class_report = classification_report(y_test,xgb_grid_predict)
print("classification_report", class_report)
print("Accuracy per class", 100 * confusion_mat.diagonal()/confusion_mat.sum(axis=1))

NameError: name 'grid' is not defined

In [31]:
# easy ensemble for imbalanced classification
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier


model = RandomForestClassifier(n_estimators=10, class_weight='balanced_subsample')
# balanced_subsample
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X_train.fillna(0), y_train, scoring='roc_auc', cv=cv, n_jobs=-1)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

Mean ROC AUC: 0.729


In [32]:

model.fit(X_res, y_res)
xgb_grid_predict = model.predict(X_test.fillna(0))
confusion_mat = confusion_matrix(y_test,xgb_grid_predict)
print("confusion_matrix ", confusion_mat)
class_report = classification_report(y_test,xgb_grid_predict)
print("classification_report", class_report)
print("Accuracy per class", 100 * confusion_mat.diagonal()/confusion_mat.sum(axis=1))

confusion_matrix  [[16375  1358]
 [  194    69]]
classification_report               precision    recall  f1-score   support

         0.0       0.99      0.92      0.95     17733
         1.0       0.05      0.26      0.08       263

    accuracy                           0.91     17996
   macro avg       0.52      0.59      0.52     17996
weighted avg       0.97      0.91      0.94     17996

Accuracy per class [92.34196132 26.23574144]
