In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Load the dataset

In [28]:
df = pd.read_csv("../data/dataset.csv", sep=";")
df.head()

Unnamed: 0,uuid,default,account_amount_added_12_24m,account_days_in_dc_12_24m,account_days_in_rem_12_24m,account_days_in_term_12_24m,account_incoming_debt_vs_paid_0_24m,account_status,account_worst_status_0_3m,account_worst_status_12_24m,...,status_3rd_last_archived_0_24m,status_max_archived_0_6_months,status_max_archived_0_12_months,status_max_archived_0_24_months,recovery_debt,sum_capital_paid_account_0_12m,sum_capital_paid_account_12_24m,sum_paid_inv_0_12m,time_hours,worst_status_active_inv
0,63f69b2c-8b1c-4740-b78d-52ed9a4515ac,0.0,0,0.0,0.0,0.0,0.0,1.0,1.0,,...,1,1,1,1,0,0,0,178839,9.653333,1.0
1,0e961183-8c15-4470-9a5e-07a1bd207661,0.0,0,0.0,0.0,0.0,,1.0,1.0,1.0,...,1,1,2,2,0,0,0,49014,13.181389,
2,d8edaae6-4368-44e0-941e-8328f203e64e,0.0,0,0.0,0.0,0.0,,,,,...,1,1,2,2,0,0,0,124839,11.561944,1.0
3,0095dfb6-a886-4e2a-b056-15ef45fdb0ef,0.0,0,,,,,,,,...,1,1,1,1,0,0,0,324676,15.751111,1.0
4,c8f8b835-5647-4506-bf15-49105d8af30b,0.0,0,0.0,0.0,0.0,,,,,...,0,1,1,1,0,0,0,7100,12.698611,


In [29]:
# TODO : Undersample maj and oversample min

df.default.value_counts(normalize=True)

0.0    0.985685
1.0    0.014315
Name: default, dtype: float64

In [30]:
df.duplicated().sum()

0

In [31]:
df.isnull().sum().sort_values(ascending=False) / len(df)

worst_status_active_inv                0.695317
account_worst_status_12_24m            0.667770
account_worst_status_6_12m             0.603645
account_incoming_debt_vs_paid_0_24m    0.593292
account_worst_status_3_6m              0.577159
account_status                         0.543861
account_worst_status_0_3m              0.543861
avg_payment_span_0_3m                  0.493168
avg_payment_span_0_12m                 0.238417
num_active_div_by_paid_inv_0_12m       0.229445
num_arch_written_off_12_24m            0.180823
num_arch_written_off_0_12m             0.180823
account_days_in_rem_12_24m             0.118388
account_days_in_term_12_24m            0.118388
account_days_in_dc_12_24m              0.118388
default                                0.100024
sum_paid_inv_0_12m                     0.000000
sum_capital_paid_account_12_24m        0.000000
sum_capital_paid_account_0_12m         0.000000
recovery_debt                          0.000000
status_max_archived_0_24_months        0

In [91]:
test_df = df[df.default.isna()]

In [36]:
df.columns[df.isnull().any()]

Index(['default', 'account_days_in_dc_12_24m', 'account_days_in_rem_12_24m',
       'account_days_in_term_12_24m', 'account_incoming_debt_vs_paid_0_24m',
       'account_status', 'account_worst_status_0_3m',
       'account_worst_status_12_24m', 'account_worst_status_3_6m',
       'account_worst_status_6_12m', 'avg_payment_span_0_12m',
       'avg_payment_span_0_3m', 'num_active_div_by_paid_inv_0_12m',
       'num_arch_written_off_0_12m', 'num_arch_written_off_12_24m',
       'worst_status_active_inv'],
      dtype='object')

# Baseline

## Handling missing data

### categorical

In [37]:
cat_null_columns = ['account_status', 'account_worst_status_0_3m',
       'account_worst_status_12_24m', 'account_worst_status_3_6m',
       'account_worst_status_6_12m', 'worst_status_active_inv']

In [38]:
df.dropna(subset=["default"], inplace=True)

In [39]:
df[cat_null_columns] = df[cat_null_columns].fillna(0)

In [40]:
df[cat_null_columns].isnull().sum().sort_values(ascending=False) / len(df)

account_status                 0.0
account_worst_status_0_3m      0.0
account_worst_status_12_24m    0.0
account_worst_status_3_6m      0.0
account_worst_status_6_12m     0.0
worst_status_active_inv        0.0
dtype: float64

### numerical

In [41]:
num_null_col = df.columns[df.isnull().any()]
num_null_col

Index(['account_days_in_dc_12_24m', 'account_days_in_rem_12_24m',
       'account_days_in_term_12_24m', 'account_incoming_debt_vs_paid_0_24m',
       'avg_payment_span_0_12m', 'avg_payment_span_0_3m',
       'num_active_div_by_paid_inv_0_12m', 'num_arch_written_off_0_12m',
       'num_arch_written_off_12_24m'],
      dtype='object')

In [42]:
df[num_null_col].isnull().sum().sort_values(ascending=False) / len(df)

account_incoming_debt_vs_paid_0_24m    0.593014
avg_payment_span_0_3m                  0.493265
avg_payment_span_0_12m                 0.238597
num_active_div_by_paid_inv_0_12m       0.229595
num_arch_written_off_0_12m             0.181215
num_arch_written_off_12_24m            0.181215
account_days_in_dc_12_24m              0.118732
account_days_in_rem_12_24m             0.118732
account_days_in_term_12_24m            0.118732
dtype: float64

In [43]:
df.drop(columns=["account_incoming_debt_vs_paid_0_24m", "avg_payment_span_0_3m", "avg_payment_span_0_12m"], inplace=True)

In [44]:
num_null_col = df.columns[df.isnull().any()]
df[num_null_col].isnull().sum().sort_values(ascending=False) / len(df)

num_active_div_by_paid_inv_0_12m    0.229595
num_arch_written_off_0_12m          0.181215
num_arch_written_off_12_24m         0.181215
account_days_in_dc_12_24m           0.118732
account_days_in_rem_12_24m          0.118732
account_days_in_term_12_24m         0.118732
dtype: float64

In [45]:
# Impute median

imputer = SimpleImputer(strategy="median")
df[num_null_col] = imputer.fit_transform(df[num_null_col])

In [46]:
df.set_index("uuid", inplace=True)

## Feature Encoding

In [47]:
cat_col = df.select_dtypes("object").columns
cat_col

Index(['merchant_category', 'merchant_group', 'name_in_email'], dtype='object')

In [48]:
encoder = OneHotEncoder(sparse_output=False, drop="if_binary").set_output(transform="pandas")
encoder.fit(df[cat_col])
df[encoder.get_feature_names_out()] = encoder.transform(df[cat_col])

In [49]:
df.drop(columns=cat_col, inplace=True)

## Train test split

In [50]:
X = df.drop(columns=["default"])
y = df["default"]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
X_train.shape, X_test.shape, y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

((62983, 112),
 (26993, 112),
 0.0    0.985679
 1.0    0.014321
 Name: default, dtype: float64,
 0.0    0.9857
 1.0    0.0143
 Name: default, dtype: float64)

## Feature Scaling

In [52]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Balancing

In [53]:
round(y_train.value_counts(normalize=True)*100, 2)

0.0    98.57
1.0     1.43
Name: default, dtype: float64

In [54]:
y_train.value_counts()

0.0    62081
1.0      902
Name: default, dtype: int64

In [55]:
under = RandomUnderSampler(sampling_strategy=0.2)

In [56]:
X_train_scaled, y_train = under.fit_resample(X_train_scaled, y_train)

In [57]:
over = SMOTE(sampling_strategy=0.4)
X_train_scaled, y_train = over.fit_resample(X_train_scaled, y_train)

In [58]:
y_train.value_counts(normalize=True)

0.0    0.714286
1.0    0.285714
Name: default, dtype: float64

## Estimator

In [59]:
model = RidgeClassifier(class_weight="balanced")
cross_val_score(model, X_train_scaled, y_train, scoring="recall").mean()

0.8054493690366267

In [92]:
model.fit(X_train_scaled, y_train)

In [95]:
y_pred = model.predict(X_test_scaled)

In [97]:
from sklearn.metrics import recall_score

In [98]:
recall_score(y_test, y_pred)

0.49740932642487046

In [100]:
cross_val_score(model, X_test_scaled, y_test, scoring="recall")

array([0.        , 0.05194805, 0.06410256, 0.06493506, 0.1038961 ])

## Grid Search

In [74]:
models = [
    RidgeClassifier(class_weight="balanced"),
    KNeighborsClassifier(),
    # SVC(),
    LogisticRegression(max_iter=10000),
    xgb.XGBClassifier(),
    GradientBoostingClassifier()
]

scores = {
    "model_name" : [],
    "cv_score": []
}

for model in models:
    scores["model_name"].append(str(model).split("(")[0])
    scores["cv_score"].append(cross_val_score(model, X_train_scaled, y_train, scoring="recall").mean())

pd.DataFrame(scores).sort_values(by="cv_score", ascending=False)

Unnamed: 0,model_name,cv_score
0,RidgeClassifier,0.805449
1,KNeighborsClassifier,0.752236
3,XGBClassifier,0.745671
4,GradientBoostingClassifier,0.689674
2,LogisticRegression,0.636365


### LogisticRegression

In [310]:
model = LogisticRegression(max_iter=100000, solver='liblinear')

params = {
    "C": np.logspace(-3,3,7),
    "penalty": ["l1", "l2"]
}

search = GridSearchCV(model, param_grid=params, scoring="recall", verbose=2, n_jobs=-1)
search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] END ................................C=0.001, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001, penalty=l2; total time=   0.1s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END .................................C=0.01, penalty=l1; total time=   0.0s
[CV] END ................................C=0.001

In [311]:
search.best_params_, search.best_score_

({'C': 0.001, 'penalty': 'l1'}, 0.7494752231455832)

### KNN

In [312]:
model = KNeighborsClassifier()

params = {
    "n_neighbors": list(range(2,10))
}

search = GridSearchCV(model, param_grid=params, scoring="recall", verbose=2, n_jobs=-1)
search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=2; total time=   0.3s[CV] END ......................................n_neighbors=3; total time=   0.2s

[CV] END ......................................n_neighbors=3; total time=   0.1s
[CV] END ......................................n_neighbors=2; total time=   0.3s
[CV] END ......................................n_neighbors=3; total time=   0.2s
[CV] END ......................................n_neighbors=4; total time=   0.1s
[CV] END ......................................n_

In [314]:
search.best_params_, search.best_score_

({'n_neighbors': 3}, 0.7905247768544168)

### RidgeClassifier

In [66]:
np.logspace(-3,3,7)

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [71]:
model = RidgeClassifier(class_weight="balanced")

params={
    "alpha": np.logspace(-3,3,7)
}

search = GridSearchCV(model, param_grid=params, scoring="recall", verbose=2, n_jobs=-1)
search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ........................................alpha=0.001; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END .........................................alpha=0.01; total time=   0.0s
[CV] END ........................................

In [72]:
search.best_params_, search.best_score_

({'alpha': 100.0}, 0.815981840566328)

### XGBoostClassifier

In [324]:
model = xgb.XGBClassifier()

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

search = GridSearchCV(model, param_grid=params, scoring="recall", verbose=2, n_jobs=-1)
search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.8; total time=   2.8s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.6; total time=   2.9s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.6; total time=   2.9s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.6; total time=   3.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.8; total time=   3.0s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.6; total time=   3.1s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.8; total time=   3.1s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child_weight=1, subsample=0.6; total time=   3.1s
[CV] END colsample_bytree=0.6, gamma=0.5, max_depth=3, min_child

In [325]:
search.best_params_, search.best_score_

({'colsample_bytree': 1.0,
  'gamma': 1.5,
  'max_depth': 5,
  'min_child_weight': 1,
  'subsample': 0.8},
 0.741244998461065)

### GradientBoostingClassifier

In [327]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 100, 500, 1000]
    }

In [328]:
model = GradientBoostingClassifier()

params = {
    "learning_rate": parameters["learning_rate"],
    "max_depth": parameters["max_depth"],
    "n_estimators": parameters["n_estimators"]
        }

search = GridSearchCV(model, param_grid=params, scoring="recall", verbose=2, n_jobs=-1)
search.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 84 candidates, totalling 420 fits
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time=   0.5s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time=   0.5s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time=   0.4s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time=   0.4s
[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=10; total time=   0.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=500; total time=  26.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_e

In [329]:
search.best_params_, search.best_score_

({'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 500},
 0.7473437980917205)

## Pipeline

In [88]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector

In [83]:
cat_null_transformer = SimpleImputer(strategy="constant", fill_value=0)
cat_null_transformer

In [84]:
num_null_transformer = SimpleImputer(strategy="median")
num_null_transformer

In [87]:
null_preprocessor = make_column_transformer(
    (cat_null_transformer, cat_null_columns),
    (num_null_transformer, num_null_col),
    remainder="passthrough"
)
null_preprocessor

In [90]:
cat_col_selector = make_column_selector(dtype_include="object")
cat_transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False, drop="if_binary"), cat_col_selector)
)
cat_transformer