In [1]:
import os

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("display.float_format", lambda x: f'{x:.4f}')

import warnings
warnings.filterwarnings("ignore")

import numpy as np

In [2]:
def std_col_names(df):
    """
    - Convert feature names to lower case
    """
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    return df


In [3]:
# Load DF
df = std_col_names(pd.read_csv('./data/train.csv'))
df.head(5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0


Number of missing values for each feature: 

    owns_car                547  
    no_of_children          774    
    no_of_days_employed     463  
    total_family_members     83  
    migrant_worker           87  
    yearly_debt_payments     95  
    credit_score              8  
    dtype: int64

In [4]:
train_df = df.copy()

In [5]:
init_len = train_df.shape[0]

train_df.dropna(subset=['yearly_debt_payments', 'credit_score'], inplace= True)
post_len = train_df.shape[0]

print(
f'Initial number of records: {init_len}\nNumber of records after dropping:{post_len}'
)

Initial number of records: 45528
Number of records after dropping:45425


In [6]:
train_df['owns_car'].value_counts()

N    29669
Y    15209
Name: owns_car, dtype: int64

In [7]:
train_df['owns_car'].fillna("N", inplace= True)
train_df.fillna(0, inplace= True)

In [8]:
no_null_check = train_df.columns[train_df.isnull().any()].to_list() == []
assert no_null_check == True

In [9]:
default_df = train_df[train_df['credit_card_default'] == 1]

credlim_occ = default_df[['occupation_type', 'credit_limit_used(%)']].groupby(by= 'occupation_type').min().sort_values(by= 'credit_limit_used(%)',ascending= False).rename(columns= {'credit_limit_used(%)': 'occ_credlim'})

credscore_occ = default_df[['occupation_type', 'credit_score']].groupby(by= 'occupation_type').max().sort_values(by= 'credit_score').rename(columns= {'credit_score': 'occ_credscore'})   

In [51]:
credlim_occ.head()

Unnamed: 0_level_0,occ_credlim
occupation_type,Unnamed: 1_level_1
IT staff,77
Secretaries,75
Waiters/barmen staff,71
Cooking staff,71
Security staff,71


In [54]:
credscore_occ.head()

Unnamed: 0_level_0,occ_credscore
occupation_type,Unnamed: 1_level_1
IT staff,569.0
HR staff,652.0
Secretaries,685.0
Realty agents,687.0
Private service staff,690.0


In [10]:
train_df = pd.merge(train_df, credlim_occ, how= 'outer', on= 'occupation_type')
train_df = pd.merge(train_df, credscore_occ, how= 'outer', on= 'occupation_type')

In [11]:
train_df.sample(5, random_state=42)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default,occ_credlim,occ_credscore
16256,CST_144540,Sam,37,M,N,N,0.0,98724.5,1296.0,Laborers,2.0,0.0,24303.25,12624.24,33,908.0,0,0,0,70,699.0
42371,CST_142468,Richard,36,M,Y,Y,0.0,184122.83,1419.0,Security staff,2.0,1.0,35713.34,20781.94,29,856.0,0,0,0,71,699.0
24659,CST_116396,Lawrence Hurley,23,F,N,Y,0.0,251246.29,1283.0,Core staff,1.0,0.0,25878.5,72383.22,58,783.0,0,0,0,70,699.0
43684,CST_151340,Blenkinsop,37,F,N,Y,1.0,115582.46,119.0,Cooking staff,3.0,0.0,34318.88,14142.47,34,782.0,0,0,0,71,697.0
24751,CST_120011,Elizabeth Pineau,24,M,Y,Y,0.0,244530.35,6913.0,Core staff,2.0,0.0,8790.6,32145.95,56,861.0,0,0,0,70,699.0


In [12]:
min_credlim_default = round(default_df['credit_limit_used(%)'].min(), 2)
print(f'Minimum credit limit at which customer has defaulted is : {min_credlim_default}')

max_credscore_default = round(default_df['credit_score'].max(), 2)
print(f'Max credit score for which customer has defaulted is : {max_credscore_default}')

Minimum credit limit at which customer has defaulted is : 70
Max credit score for which customer has defaulted is : 699.0


In [13]:
train_df['above_min_credlim_def'] = (train_df['credit_limit_used(%)']>=min_credlim_default).astype('int')
train_df['below_min_credscore_def'] = (train_df['credit_score']<=max_credscore_default).astype('int')

In [14]:
train_df['above_min_credlim_occ'] = (train_df['credit_limit_used(%)']>=train_df['occ_credlim']).astype('int')
train_df['below_min_credscore_occ'] = (train_df['credit_score']<=train_df['occ_credscore']).astype('int')

In [15]:
train_df['months_employed'] = train_df['no_of_days_employed'] / 30

train_df['credlim_to_income'] =train_df['credit_limit'] / train_df['net_yearly_income'] 

train_df['debt_to_income'] = train_df['yearly_debt_payments'] / train_df['net_yearly_income']


In [16]:
train_df['num_gender'] = train_df['gender'].map({'XNA':-1, 'F': 0, 'M': 1})

mapper_yn = {'Y': 1, 'N': 0}
train_df['num_car'] = train_df['owns_car'].map(mapper_yn, na_action= 'ignore')
train_df['num_house'] = train_df['owns_house'].map(mapper_yn)

In [17]:
train_df.columns

Index(['customer_id', 'name', 'age', 'gender', 'owns_car', 'owns_house',
       'no_of_children', 'net_yearly_income', 'no_of_days_employed',
       'occupation_type', 'total_family_members', 'migrant_worker',
       'yearly_debt_payments', 'credit_limit', 'credit_limit_used(%)',
       'credit_score', 'prev_defaults', 'default_in_last_6months',
       'credit_card_default', 'occ_credlim', 'occ_credscore',
       'above_min_credlim_def', 'below_min_credscore_def',
       'above_min_credlim_occ', 'below_min_credscore_occ', 'months_employed',
       'credlim_to_income', 'debt_to_income', 'num_gender', 'num_car',
       'num_house'],
      dtype='object')

In [18]:
cols = [
  'age', 'num_gender', 'num_car', 'num_house',
       'no_of_children', 'total_family_members', 'migrant_worker', 
    'prev_defaults', 'default_in_last_6months',
    'above_min_credlim_def', 'above_min_credlim_occ', 'below_min_credscore_def', 'below_min_credscore_occ', 'months_employed',
       'credlim_to_income', 'debt_to_income'
]

target = 'credit_card_default'

In [19]:
len(cols)

16

In [20]:
print(f'Number of features: {len(cols)}')

Number of features: 16


In [21]:
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(train_df[cols], train_df[target], n_neighbors= 5, random_state= 42, discrete_features= True)
mi_df = pd.DataFrame(mi, index= cols, columns=['mutual_info']).reset_index().sort_values(by='mutual_info', ascending= False)


In [22]:
mi_df

Unnamed: 0,index,mutual_info
14,credlim_to_income,0.2817
15,debt_to_income,0.2817
7,prev_defaults,0.1592
8,default_in_last_6months,0.1462
12,below_min_credscore_occ,0.1305
11,below_min_credscore_def,0.1297
10,above_min_credlim_occ,0.0908
9,above_min_credlim_def,0.0906
13,months_employed,0.0676
1,num_gender,0.0016


In [23]:
fin_cols= mi_df[mi_df['mutual_info'] >0.05]['index'].to_list()
fin_cols

['credlim_to_income',
 'debt_to_income',
 'prev_defaults',
 'default_in_last_6months',
 'below_min_credscore_occ',
 'below_min_credscore_def',
 'above_min_credlim_occ',
 'above_min_credlim_def',
 'months_employed']

In [24]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction import DictVectorizer

In [25]:
train_df.shape[0]

45425

In [26]:
train_df['credit_card_default'].value_counts()

0    41737
1     3688
Name: credit_card_default, dtype: int64

In [27]:
over = SMOTE(random_state= 42, sampling_strategy=0.33)
x_over, y_over = over.fit_resample(train_df[fin_cols], train_df['credit_card_default'])

In [28]:
under = RandomUnderSampler(random_state= 42, sampling_strategy= 0.5)
x_ou, y_ou = under.fit_resample(x_over, y_over)

In [29]:
train_dict = x_ou.to_dict(orient= 'records')
dv = DictVectorizer(sparse= False)
train_dv = dv.fit_transform(train_dict)


## Modelling

In [30]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [31]:
xgb_cv_params = {
    'max_depth': [3,4,5,6,7], 
    'min_child_weight': [0,1,2,3,4],
    'subsample': [0.8,0.9,1], 
    'max_delta_step': [0,2, 4, 6],
    'learning_rate': [0.05, 0.1, 0.25, 0.3, 0.4, 0.5, 0.75],
    'n_estimators' : [50, 75, 100, 150],
    'alpha': [0, 1, 3, 5],
    'lambda': [0, 1, 3, 5],
    'booster': ['gbtree', 'dart']
}   

# Parameters fixed
fix_params = {'objective': 'binary:logistic', 'eval_metric': 'aucpr', 'use_label_encoder': False, 'tree_method': 'gpu_hist', 'scale_pos_weight': 2}  

# Create RandomizedSearchCV object
csv = RandomizedSearchCV(xgb.XGBClassifier(**fix_params), xgb_cv_params, scoring = 'f1_macro', cv = 5, verbose=1, n_jobs= 4, n_iter=20, random_state=42)

In [32]:
# %%timeit
# csv.fit(x_ou, y_ou)

In [35]:
fin_params = {'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.5,
 'booster': 'dart',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'enable_categorical': False,
 'gamma': 0,
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.25,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 2,
 'monotone_constraints': '()',
 'n_estimators': 150,
 'n_jobs': 8,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 0,
 'reg_alpha': 1,
 'reg_lambda': 0,
 'scale_pos_weight': 2,
 'subsample': 1,
 'validate_parameters': 1,
 'eval_metric': 'aucpr',
 'lambda': 0,
 'alpha': 1}

In [36]:
# Train model on entire set
dtrain = xgb.DMatrix(x_ou, y_ou)
fin_xgb = xgb.train(fin_params, dtrain, num_boost_round = 200)

Parameters: { "enable_categorical", "n_estimators", "use_label_encoder" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




## Predicting on test set

In [37]:
# Load DF
test_df = std_col_names(pd.read_csv('./data/test.csv'))
test_df.head(5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
0,CST_142525,Siva,52,F,Y,N,0.0,232640.53,998.0,Unknown,2.0,0.0,14406.73,26524.4,4,779.0,0,0
1,CST_129215,Scott,48,F,N,N,1.0,284396.79,1338.0,Unknown,3.0,0.0,57479.99,68998.72,70,806.0,0,0
2,CST_138443,Victoria,50,F,N,N,1.0,149419.28,1210.0,Unknown,3.0,0.0,21611.01,25187.8,71,528.0,2,0
3,CST_123812,John McCrank,30,F,N,N,1.0,160437.54,503.0,Laborers,2.0,1.0,28990.76,29179.39,9,815.0,0,0
4,CST_144450,Martinne,52,M,N,Y,0.0,233480.37,157.0,Laborers,2.0,1.0,54213.72,82331.82,82,613.0,1,1


In [42]:
# Find out missing column names
missing_cols = test_df.columns[test_df.isnull().any()].to_list()
print('Features with missing values: ')
missing_cols

# Evaluate count of missing values
missing_cols_count = test_df[missing_cols].isnull().sum()
print('Number of missing values for each feature: ')
missing_cols_count

Features with missing values: 
Number of missing values for each feature: 


owns_car                132
no_of_children          190
no_of_days_employed     105
total_family_members     31
migrant_worker           26
yearly_debt_payments     22
credit_score              3
dtype: int64

### Approach 1

In [None]:
['credlim_to_income',
 'debt_to_income',
 'prev_defaults',
 'default_in_last_6months',
 'below_min_credscore_occ',
 'below_min_credscore_def',
 'above_min_credlim_occ',
 'above_min_credlim_def',
 'months_employed']

In [47]:
test_df['credit_score'].fillna(700, inplace= True)
test_df['yearly_debt_payments'].fillna(train_df['yearly_debt_payments'].mean(), inplace= True)
test_df['months_employed'] = test_df['no_of_days_employed']/30
test_df['months_employed'].fillna(train_df['months_employed'].mode(), inplace= True)

In [62]:
test_df['credlim_to_income'] =test_df['credit_limit'] / test_df['net_yearly_income'] 
test_df['debt_to_income'] = test_df['yearly_debt_payments'] / test_df['net_yearly_income']


In [49]:
test_df['above_min_credlim_def'] = (test_df['credit_limit_used(%)']>=min_credlim_default).astype('int')
test_df['below_min_credscore_def'] = (test_df['credit_score']<=max_credscore_default).astype('int')



In [55]:
test_df = pd.merge(test_df, credlim_occ, how= 'outer', on= 'occupation_type')
test_df = pd.merge(test_df, credscore_occ, how= 'outer', on= 'occupation_type')

In [58]:
test_df.sample(7)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,months_employed,above_min_credlim_def,below_min_credscore_def,occ_credlim,occ_credscore
5851,CST_163891,Ablan,30,M,N,N,0.0,173474.14,1903.0,Sales staff,1.0,0.0,33776.06,24527.11,38,917.0,0,0,63.4333,0,0,70,699.0
4703,CST_114298,Lucia Mutikani,37,F,Y,Y,0.0,118182.12,2628.0,Laborers,2.0,1.0,28176.17,27030.51,80,914.0,0,0,87.6,1,0,70,699.0
4980,CST_142699,Karen,53,M,N,N,1.0,216892.11,125.0,Laborers,3.0,0.0,54873.97,37919.84,4,919.0,0,0,4.1667,0,0,70,699.0
954,CST_110831,Sinead Carew,30,F,Y,N,0.0,123142.78,365250.0,Unknown,1.0,0.0,18677.15,23864.81,45,919.0,0,0,12175.0,0,0,70,699.0
1964,CST_153375,Wills,42,F,N,Y,0.0,156829.02,365251.0,Unknown,1.0,0.0,24947.35,28077.16,86,947.0,0,0,12175.0333,1,0,70,699.0
9571,CST_149829,Marsh,36,M,Y,Y,2.0,272584.32,1651.0,Drivers,4.0,1.0,42028.13,32839.08,27,946.0,0,0,55.0333,0,0,70,699.0
6465,CST_105836,Phil Wahba,23,F,N,N,2.0,143947.79,2422.0,Sales staff,3.0,0.0,41501.12,27268.18,12,758.0,0,0,80.7333,0,0,70,699.0


In [60]:
test_df['above_min_credlim_occ'] = (test_df['credit_limit_used(%)']>=test_df['occ_credlim']).astype('int')
test_df['below_min_credscore_occ'] = (test_df['credit_score']<=test_df['occ_credscore']).astype('int')

In [64]:
test_df[fin_cols].head()

Unnamed: 0,credlim_to_income,debt_to_income,prev_defaults,default_in_last_6months,below_min_credscore_occ,below_min_credscore_def,above_min_credlim_occ,above_min_credlim_def,months_employed
0,0.114,0.0619,0,0,0,0,0,0,33.2667
1,0.2426,0.2021,0,0,0,0,1,1,44.6
2,0.1686,0.1446,2,0,1,1,1,1,40.3333
3,0.2013,0.3196,0,0,0,0,0,0,10.6333
4,0.2378,0.2587,0,0,0,0,1,1,15.0
