In [3]:
import os

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("display.float_format", lambda x: f'{x:.3f}')

import warnings
warnings.filterwarnings("ignore")

import numpy as np

In [4]:
def std_col_names(df):
    """
    - Convert feature names to lower case
    """
    df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')
    return df


In [16]:
# Load DF
df = std_col_names(pd.read_csv('./data/train.csv'))
df.head(5)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,CST_115179,ita Bose,46,F,N,Y,0.0,107934.04,612.0,Unknown,1.0,1.0,33070.28,18690.93,73,544.0,2,1,1
1,CST_121920,Alper Jonathan,29,M,N,Y,0.0,109862.62,2771.0,Laborers,2.0,0.0,15329.53,37745.19,52,857.0,0,0,0
2,CST_109330,Umesh Desai,37,M,N,Y,0.0,230153.17,204.0,Laborers,2.0,0.0,48416.6,41598.36,43,650.0,0,0,0
3,CST_128288,Rie,39,F,N,Y,0.0,122325.82,11941.0,Core staff,2.0,0.0,22574.36,32627.76,20,754.0,0,0,0
4,CST_151355,McCool,46,M,Y,Y,0.0,387286.0,1459.0,Core staff,1.0,0.0,38282.95,52950.64,75,927.0,0,0,0


Number of missing values for each feature: 

    owns_car                547  
    no_of_children          774    
    no_of_days_employed     463  
    total_family_members     83  
    migrant_worker           87  
    yearly_debt_payments     95  
    credit_score              8  
    dtype: int64

In [17]:
train_df = df.copy()

In [18]:
init_len = train_df.shape[0]

train_df.dropna(subset=['yearly_debt_payments', 'credit_score'], inplace= True)
post_len = train_df.shape[0]

print(
f'Initial number of records: {init_len}\nNumber of records after dropping:{post_len}'
)

Initial number of records: 45528
Number of records after dropping:45425


In [19]:
train_df['owns_car'].value_counts()

N    29669
Y    15209
Name: owns_car, dtype: int64

In [26]:
train_df['owns_car'].fillna("N", inplace= True)
train_df.fillna(0, inplace= True)

In [29]:
no_null_check = train_df.columns[train_df.isnull().any()].to_list() == []
assert no_null_check == True

In [30]:
default_df = train_df[train_df['credit_card_default'] == 1]

credlim_occ = default_df[['occupation_type', 'credit_limit_used(%)']].groupby(by= 'occupation_type').min().sort_values(by= 'credit_limit_used(%)',ascending= False).rename(columns= {'credit_limit_used(%)': 'occ_credlim'})

credscore_occ = default_df[['occupation_type', 'credit_score']].groupby(by= 'occupation_type').max().sort_values(by= 'credit_score').rename(columns= {'credit_score': 'occ_credscore'})   

In [31]:
train_df = pd.merge(train_df, credlim_occ, how= 'outer', on= 'occupation_type')
train_df = pd.merge(train_df, credscore_occ, how= 'outer', on= 'occupation_type')

In [32]:
train_df.sample(5, random_state=42)

Unnamed: 0,customer_id,name,age,gender,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default,occ_credlim,occ_credscore
16256,CST_144540,Sam,37,M,N,N,0.0,98724.5,1296.0,Laborers,2.0,0.0,24303.25,12624.24,33,908.0,0,0,0,70,699.0
42371,CST_142468,Richard,36,M,Y,Y,0.0,184122.83,1419.0,Security staff,2.0,1.0,35713.34,20781.94,29,856.0,0,0,0,71,699.0
24659,CST_116396,Lawrence Hurley,23,F,N,Y,0.0,251246.29,1283.0,Core staff,1.0,0.0,25878.5,72383.22,58,783.0,0,0,0,70,699.0
43684,CST_151340,Blenkinsop,37,F,N,Y,1.0,115582.46,119.0,Cooking staff,3.0,0.0,34318.88,14142.47,34,782.0,0,0,0,71,697.0
24751,CST_120011,Elizabeth Pineau,24,M,Y,Y,0.0,244530.35,6913.0,Core staff,2.0,0.0,8790.6,32145.95,56,861.0,0,0,0,70,699.0


In [33]:
min_credlim_default = round(default_df['credit_limit_used(%)'].min(), 2)
print(f'Minimum credit limit at which customer has defaulted is : {min_credlim_default}')

max_credscore_default = round(default_df['credit_score'].max(), 2)
print(f'Max credit score for which customer has defaulted is : {max_credscore_default}')

Minimum credit limit at which customer has defaulted is : 70
Max credit score for which customer has defaulted is : 699.0


In [34]:
train_df['above_min_credlim_def'] = (train_df['credit_limit_used(%)']>=min_credlim_default).astype('int')
train_df['below_min_credscore_def'] = (train_df['credit_score']<=max_credscore_default).astype('int')

In [35]:
train_df['above_min_credlim_occ'] = (train_df['credit_limit_used(%)']>=train_df['occ_credlim']).astype('int')
train_df['below_min_credscore_occ'] = (train_df['credit_score']<=train_df['occ_credscore']).astype('int')

In [36]:
train_df['months_employed'] = train_df['no_of_days_employed'] / 30

train_df['credlim_to_income'] =train_df['credit_limit'] / train_df['net_yearly_income'] 

train_df['debt_to_income'] = train_df['yearly_debt_payments'] / train_df['net_yearly_income']


In [37]:
train_df['num_gender'] = train_df['gender'].map({'XNA':-1, 'F': 0, 'M': 1})

mapper_yn = {'Y': 1, 'N': 0}
train_df['num_car'] = train_df['owns_car'].map(mapper_yn, na_action= 'ignore')
train_df['num_house'] = train_df['owns_house'].map(mapper_yn)

In [39]:
train_df.columns

Index(['customer_id', 'name', 'age', 'gender', 'owns_car', 'owns_house',
       'no_of_children', 'net_yearly_income', 'no_of_days_employed',
       'occupation_type', 'total_family_members', 'migrant_worker',
       'yearly_debt_payments', 'credit_limit', 'credit_limit_used(%)',
       'credit_score', 'prev_defaults', 'default_in_last_6months',
       'credit_card_default', 'occ_credlim', 'occ_credscore',
       'above_min_credlim_def', 'below_min_credscore_def',
       'above_min_credlim_occ', 'below_min_credscore_occ', 'months_employed',
       'credlim_to_income', 'debt_to_income', 'num_gender', 'num_car',
       'num_house'],
      dtype='object')

In [40]:
cols = [
  'age', 'num_gender', 'num_car', 'num_house',
       'no_of_children', 'total_family_members', 'migrant_worker', 
    'prev_defaults', 'default_in_last_6months',
    'above_min_credlim_def', 'above_min_credlim_occ', 'below_min_credscore_def', 'below_min_credscore_occ', 'months_employed',
       'credlim_to_income', 'debt_to_income'
]

target = 'credit_card_default'

In [41]:
print(f'Number of features: {len(cols)}')

Number of features: 16


In [48]:
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(train_df[cols], train_df[target], n_neighbors= 5, random_state= 42, discrete_features= True)
mi_df = pd.DataFrame(mi, index= cols, columns=['mutual_info']).reset_index().sort_values(by='mutual_info', ascending= False)


In [50]:
mi_df[mi_df['mutual_info'] >0]['index'].to_list()

['credlim_to_income',
 'debt_to_income',
 'prev_defaults',
 'default_in_last_6months',
 'below_min_credscore_occ',
 'below_min_credscore_def',
 'above_min_credlim_occ',
 'above_min_credlim_def',
 'months_employed',
 'num_gender',
 'migrant_worker',
 'no_of_children',
 'total_family_members',
 'age',
 'num_car',
 'num_house']