In [1]:
import os

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("display.float_format", lambda x: f'{x:.4f}')

import warnings
warnings.filterwarnings("ignore")

import numpy as np

In [2]:
def std_col_names(df):
    """
    - Convert feature names to lower case
    """
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    return df


In [3]:
# Load DF
df = std_col_names(pd.read_csv('./data/train.csv'))
df.head(5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0


Number of missing values for each feature: 

    owns_car                547  
    no_of_children          774    
    no_of_days_employed     463  
    total_family_members     83  
    migrant_worker           87  
    yearly_debt_payments     95  
    credit_score              8  
    dtype: int64

In [4]:
train_df = df.copy()

In [46]:
df.columns

Index(['customer_id', 'name', 'age', 'gender', 'owns_car', 'owns_house',
       'no_of_children', 'net_yearly_income', 'no_of_days_employed',
       'occupation_type', 'total_family_members', 'migrant_worker',
       'yearly_debt_payments', 'credit_limit', 'credit_limit_used(%)',
       'credit_score', 'prev_defaults', 'default_in_last_6months',
       'credit_card_default'],
      dtype='object')

In [5]:
train_df['owns_car'].fillna("N", inplace= True)
train_df.fillna(0, inplace= True)

In [6]:
no_null_check = train_df.columns[train_df.isnull().any()].to_list() == []
assert no_null_check == True

In [7]:
default_df = train_df[train_df['credit_card_default'] == 1]

credlim_occ = default_df[['occupation_type', 'credit_limit_used(%)']].groupby(by= 'occupation_type').min().sort_values(by= 'credit_limit_used(%)',ascending= False).rename(columns= {'credit_limit_used(%)': 'occ_credlim'})

credscore_occ = default_df[['occupation_type', 'credit_score']].groupby(by= 'occupation_type').max().sort_values(by= 'credit_score').rename(columns= {'credit_score': 'occ_credscore'})   

In [8]:
train_df = pd.merge(train_df, credlim_occ, how= 'outer', on= 'occupation_type')
train_df = pd.merge(train_df, credscore_occ, how= 'outer', on= 'occupation_type')

In [9]:
min_credlim_default = round(default_df['credit_limit_used(%)'].min(), 2)
print(f'Minimum credit limit at which customer has defaulted is : {min_credlim_default}')

max_credscore_default = round(default_df['credit_score'].max(), 2)
print(f'Max credit score for which customer has defaulted is : {max_credscore_default}')

Minimum credit limit at which customer has defaulted is : 70
Max credit score for which customer has defaulted is : 699.0


In [10]:
train_df['above_min_credlim_def'] = (train_df['credit_limit_used(%)']>=min_credlim_default).astype('int')
train_df['below_min_credscore_def'] = (train_df['credit_score']<=max_credscore_default).astype('int')

In [11]:
train_df['above_min_credlim_occ'] = (train_df['credit_limit_used(%)']>=train_df['occ_credlim']).astype('int')
train_df['below_min_credscore_occ'] = (train_df['credit_score']<=train_df['occ_credscore']).astype('int')

In [12]:
train_df['months_employed'] = train_df['no_of_days_employed'] / 30

train_df['credlim_to_income'] =train_df['credit_limit'] / train_df['net_yearly_income'] 

train_df['debt_to_income'] = train_df['yearly_debt_payments'] / train_df['net_yearly_income']


In [13]:
train_df['num_gender'] = train_df['gender'].map({'XNA':-1, 'F': 0, 'M': 1})

mapper_yn = {'Y': 1, 'N': 0}
train_df['num_car'] = train_df['owns_car'].map(mapper_yn, na_action= 'ignore')
train_df['num_house'] = train_df['owns_house'].map(mapper_yn)

In [14]:
cols = [
  'age', 'num_gender', 'num_car', 'num_house',
       'no_of_children', 'total_family_members', 'migrant_worker', 
    'prev_defaults', 'default_in_last_6months',
    'above_min_credlim_def', 'above_min_credlim_occ', 'below_min_credscore_def', 'below_min_credscore_occ', 'months_employed',
       'credlim_to_income', 'debt_to_income', 'credit_limit', 'yearly_debt_payments'
]

target = 'credit_card_default'

In [15]:
print(f'Number of features: {len(cols)}')

Number of features: 18


In [16]:
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(train_df[cols], train_df[target], n_neighbors= 5, random_state= 42, discrete_features= True)
mi_df = pd.DataFrame(mi, index= cols, columns=['mutual_info']).reset_index().sort_values(by='mutual_info', ascending= False)


In [17]:
mi_df

Unnamed: 0,index,mutual_info
14,credlim_to_income,0.2817
15,debt_to_income,0.2811
16,credit_limit,0.281
17,yearly_debt_payments,0.2801
7,prev_defaults,0.1591
8,default_in_last_6months,0.1463
12,below_min_credscore_occ,0.1305
11,below_min_credscore_def,0.1297
10,above_min_credlim_occ,0.0908
9,above_min_credlim_def,0.0907


In [18]:
fin_cols= mi_df[mi_df['mutual_info'] >0.05]['index'].to_list()
fin_cols

['credlim_to_income',
 'debt_to_income',
 'credit_limit',
 'yearly_debt_payments',
 'prev_defaults',
 'default_in_last_6months',
 'below_min_credscore_occ',
 'below_min_credscore_def',
 'above_min_credlim_occ',
 'above_min_credlim_def',
 'months_employed']

In [19]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction import DictVectorizer

In [20]:
over = SMOTE(random_state= 42, sampling_strategy=0.25)
x_over, y_over = over.fit_resample(train_df[cols], train_df['credit_card_default'])

under = RandomUnderSampler(random_state= 42, sampling_strategy= 0.5)
x_ou, y_ou = under.fit_resample(x_over, y_over)

## Modelling

In [21]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [22]:
train_dict = x_ou.to_dict(orient= 'records')
dv = DictVectorizer(sparse= False)
train_dv = dv.fit_transform(train_dict)
dtrain = xgb.DMatrix(train_dv, y_ou)


In [23]:
xgb_cv_params = {
    'max_depth': [4,5,6,7,8], 
    'min_child_weight': [0,1,2,3,4],
    'subsample': [0.75, 0.8,0.9,1], 
    'max_delta_step': [0,2, 4, 6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.25, 0.3, 0.4, 0.5, 0.75],
    'n_estimators' : [75, 100, 150, 200],
    'alpha': [0, 1, 2, 3, 4, 5],
    'lambda': [0, 1, 2, 3, 4, 5],
    'gamma': [0, 0.5, 1, 2],
    'sample_type': ['uniform', 'weighted'],
    'rate_drop': [0, 0.25, .5, .7, 1],
    'one_drop': [0, 1],
    'skip_drop': [0, 0.25, .5, .7, 1]
}   

# Parameters fixed
fix_params = {'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_label_encoder': False, 'tree_method': 'gpu_hist', 'scale_pos_weight': 2, 'booster': 'dart'}  

# Create RandomizedSearchCV object
csv = RandomizedSearchCV(xgb.XGBClassifier(**fix_params), xgb_cv_params, scoring = 'f1_macro', cv = 5, verbose=1, n_jobs= -1, n_iter=30, random_state=42)

In [24]:
# %%timeit
# # csv.fit(x_ou, y_ou)
# csv.fit(train_dv, y_ou)

In [25]:
fin_cols_params = {'subsample': 0.75,
 'skip_drop': 0,
 'sample_type': 'weighted',
 'rate_drop': 0,
 'one_drop': 0,
 'n_estimators': 75,
 'min_child_weight': 3,
 'max_depth': 5,
 'max_delta_step': 8,
 'learning_rate': 0.5,
 'lambda': 3,
 'gamma': 1,
 'alpha': 3}

In [26]:
# Train model on entire set
fin_xgb = xgb.train(fin_cols_params, dtrain, num_boost_round = 200)

Parameters: { "n_estimators", "one_drop", "rate_drop", "sample_type", "skip_drop" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




## Predicting on test set

In [27]:
# Load DF
test_df = std_col_names(pd.read_csv('./data/test.csv'))
test_df.head(5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months
0,CST_142525,Siva,52,F,Y,N,0.0,232640.53,998.0,Unknown,2.0,0.0,14406.73,26524.4,4,779.0,0,0
1,CST_129215,Scott,48,F,N,N,1.0,284396.79,1338.0,Unknown,3.0,0.0,57479.99,68998.72,70,806.0,0,0
2,CST_138443,Victoria,50,F,N,N,1.0,149419.28,1210.0,Unknown,3.0,0.0,21611.01,25187.8,71,528.0,2,0
3,CST_123812,John McCrank,30,F,N,N,1.0,160437.54,503.0,Laborers,2.0,1.0,28990.76,29179.39,9,815.0,0,0
4,CST_144450,Martinne,52,M,N,Y,0.0,233480.37,157.0,Laborers,2.0,1.0,54213.72,82331.82,82,613.0,1,1


In [28]:
# Find out missing column names
missing_cols = test_df.columns[test_df.isnull().any()].to_list()
print('Features with missing values: ')
missing_cols

# Evaluate count of missing values
missing_cols_count = test_df[missing_cols].isnull().sum()
print('Number of missing values for each feature: ')
missing_cols_count

Features with missing values: 
Number of missing values for each feature: 


owns_car                132
no_of_children          190
no_of_days_employed     105
total_family_members     31
migrant_worker           26
yearly_debt_payments     22
credit_score              3
dtype: int64

In [29]:
test_df['credit_score'].fillna(700, inplace= True)
test_df['yearly_debt_payments'].fillna(train_df['yearly_debt_payments'].mean(), inplace= True)
test_df['months_employed'] = test_df['no_of_days_employed']/30

In [30]:
test_df['credlim_to_income'] =test_df['credit_limit'] / test_df['net_yearly_income'] 
test_df['debt_to_income'] = test_df['yearly_debt_payments'] / test_df['net_yearly_income']


In [31]:
test_df['above_min_credlim_def'] = (test_df['credit_limit_used(%)']>=min_credlim_default).astype('int')
test_df['below_min_credscore_def'] = (test_df['credit_score']<=max_credscore_default).astype('int')



In [32]:
test_df = pd.merge(test_df, credlim_occ, how= 'outer', on= 'occupation_type')
test_df = pd.merge(test_df, credscore_occ, how= 'outer', on= 'occupation_type')

In [33]:
test_df['above_min_credlim_occ'] = (test_df['credit_limit_used(%)']>=test_df['occ_credlim']).astype('int')
test_df['below_min_credscore_occ'] = (test_df['credit_score']<=test_df['occ_credscore']).astype('int')

In [34]:
test_df['num_gender'] = test_df['gender'].map({'XNA':-1, 'F': 0, 'M': 1})

mapper_yn = {'Y': 1, 'N': 0}
test_df['num_car'] = test_df['owns_car'].map(mapper_yn, na_action= 'ignore')
test_df['num_house'] = test_df['owns_house'].map(mapper_yn)

In [35]:
test_df['months_employed'].fillna(train_df['months_employed'].quantile(0.5), inplace= True)

In [36]:
test_df.fillna(0, inplace= True)

In [37]:
test_df[cols].isna().sum()

age                        0
num_gender                 0
num_car                    0
num_house                  0
no_of_children             0
total_family_members       0
migrant_worker             0
prev_defaults              0
default_in_last_6months    0
above_min_credlim_def      0
above_min_credlim_occ      0
below_min_credscore_def    0
below_min_credscore_occ    0
months_employed            0
credlim_to_income          0
debt_to_income             0
credit_limit               0
yearly_debt_payments       0
dtype: int64

In [38]:
test_dict = test_df[cols].to_dict(orient= 'records')
test_dv = dv.transform(test_dict)

In [39]:
test_dv.shape

(11383, 18)

In [47]:
dtest = xgb.DMatrix(test_dv)

In [52]:
def get_sub(arr, outfile= 'sub.csv'):
    """
    Convert predicted output to a dataframe and extract to local
    """
    out_df = pd.DataFrame({'customer_id': test_df['customer_id'], 'credit_card_default': arr})
    out_df.to_csv('./subs/'+outfile, index= False)
    return out_df

In [53]:
test_pred = fin_xgb.predict(dtest)

In [54]:
thresh = 0.72
test_pred[test_pred>thresh] = 1
test_pred[test_pred <= thresh] = 0
test_pred = test_pred.astype('int')

In [55]:
get_sub(test_pred, 'colmodel_72.csv')

Unnamed: 0,customer_id,credit_card_default
0,CST_142525,0
1,CST_129215,0
2,CST_138443,1
3,CST_156027,0
4,CST_114556,0
...,...,...
11378,CST_117188,0
11379,CST_120435,0
11380,CST_144525,0
11381,CST_144529,0
