# Credit Card Defaulter
1) ID - identifier of the customer
2) Default - whether the customer has defaulted (Yes/No)
3) Student - whether the customer is a student (Yes/No)
4) Balance - balance on the customer's account
5) Income - the customer's income

In [2]:
import pandas as pd

In [3]:
# Load the raw customer table; the path is relative to the current working directory.
df = pd.read_csv('credit_card_defaulter.csv')

In [4]:
df

Unnamed: 0.1,Unnamed: 0,default,student,balance,income
0,1,No,No,729.526495,44361.62507
1,2,No,Yes,817.180407,12106.13470
2,3,No,No,1073.549164,31767.13895
3,4,No,No,529.250605,35704.49394
4,5,No,No,785.655883,38463.49588
...,...,...,...,...,...
9995,9996,No,No,711.555020,52992.37891
9996,9997,No,No,757.962918,19660.72177
9997,9998,No,No,845.411989,58636.15698
9998,9999,No,No,1569.009053,36669.11236


In [5]:
df['default'].unique()

array(['No', 'Yes'], dtype=object)

In [6]:
df['student'].unique()

array(['No', 'Yes'], dtype=object)

In [7]:
# Encode the 'No'/'Yes' labels as 0/1 with an explicit mapping.
# map() avoids the silent object -> int downcasting that replace() relies on
# (deprecated in recent pandas), and astype() fails fast if any label other
# than 'No'/'Yes' is present instead of leaving a mixed-type column behind.
df['default'] = df['default'].map({'No': 0, 'Yes': 1}).astype('int64')

In [8]:
# Encode the 'No'/'Yes' labels as 0/1 with an explicit mapping.
# map() avoids the silent object -> int downcasting that replace() relies on
# (deprecated in recent pandas), and astype() fails fast if any label other
# than 'No'/'Yes' is present instead of leaving a mixed-type column behind.
df['student'] = df['student'].map({'No': 0, 'Yes': 1}).astype('int64')

In [9]:
# The 'Unnamed: 0' column is not used as a feature, so remove it.
df = df.drop(columns=['Unnamed: 0'])

In [10]:
df.head()

Unnamed: 0,default,student,balance,income
0,0,0,729.526495,44361.62507
1,0,1,817.180407,12106.1347
2,0,0,1073.549164,31767.13895
3,0,0,529.250605,35704.49394
4,0,0,785.655883,38463.49588


# Phik

In [11]:
import phik

In [12]:
df.phik_matrix()

interval columns not set, guessing: ['default', 'student', 'balance', 'income']


Unnamed: 0,default,student,balance,income
default,1.0,0.052353,0.71238,0.032708
student,0.052353,1.0,0.257984,0.972309
balance,0.71238,0.257984,1.0,0.168739
income,0.032708,0.972309,0.168739,1.0


In [13]:
df.phik_matrix()['default'].sort_values(ascending=False)

interval columns not set, guessing: ['default', 'student', 'balance', 'income']


default    1.000000
balance    0.712380
student    0.052353
income     0.032708
Name: default, dtype: float64

In [14]:
df.isna().sum()

default    0
student    0
balance    0
income     0
dtype: int64

In [15]:
df['default'].value_counts()

0    9667
1     333
Name: default, dtype: int64

In [15]:
df['default'].mean()

0.0333

# Training, validation and test samples

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# 60% of the rows go to training; the other 40% is held in `test` for now.
# Stratifying on the target keeps the default rate the same in both parts.
train, test = train_test_split(
    df,
    train_size=0.6,
    random_state=42,
    stratify=df['default'],
)

In [18]:
# Halve the 40% remainder into validation and final test sets
# (each 20% of the original data), again stratified on the target.
val, test = train_test_split(
    test,
    train_size=0.5,
    random_state=42,
    stratify=test['default'],
)

In [19]:
df.default.mean()

0.0333

# Human learning

In [20]:
# Pool the training and validation rows; the group-rate lookup in this section
# is computed on this combined frame.
train_full = pd.concat([train,val])

In [21]:
train_full.groupby('balance')['default'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
balance,Unnamed: 1_level_1,Unnamed: 2_level_1
0.000000,389,0.0
0.023816,1,0.0
1.611176,1,0.0
1.976692,1,0.0
2.843015,1,0.0
...,...,...
2415.316994,1,1.0
2461.506979,1,1.0
2502.684931,1,1.0
2578.469022,1,1.0


In [22]:
# Bucket balance into 250-wide bins from 250 up to 2500, with open-ended
# first and last bins so every value falls into some bucket.
train_full['balance_group'] = pd.cut(
    train_full['balance'],
    [-float('inf'), *range(250, 2501, 250), float('inf')],
)

In [23]:
train_full.groupby('balance_group')['default'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
balance_group,Unnamed: 1_level_1,Unnamed: 2_level_1
"(-inf, 250.0]",979,0.0
"(250.0, 500.0]",1080,0.0
"(500.0, 750.0]",1450,0.001379
"(750.0, 1000.0]",1590,0.001887
"(1000.0, 1250.0]",1309,0.012223
"(1250.0, 1500.0]",863,0.048667
"(1500.0, 1750.0]",452,0.143805
"(1750.0, 2000.0]",199,0.40201
"(2000.0, 2250.0]",63,0.746032
"(2250.0, 2500.0]",12,0.75


In [24]:
# Empirical default rate per balance bucket; this lookup table is the "model".
# balance_group is categorical, so observed=False is passed explicitly: it keeps
# the current behaviour (every bucket gets a row) and does not depend on the
# pandas default for categorical groupers, which is deprecated and due to change.
model = (
    train_full
    .groupby('balance_group', observed=False)['default']
    .mean()
    .reset_index()
)

In [25]:
# The per-bucket mean of `default` is used as the score, so name it accordingly.
model = model.rename(columns={'default': 'score_balance'})

In [26]:
# Attach each row's bucket score; a left join keeps every row of train_full.
train_full = train_full.merge(
    model,
    how='left',
    on='balance_group',
)

In [27]:
train_full.groupby('score_balance')['default'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
score_balance,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,2059,0.0
0.001379,1450,0.001379
0.001887,1590,0.001887
0.012223,1309,0.012223
0.048667,863,0.048667
0.143805,452,0.143805
0.40201,199,0.40201
0.746032,63,0.746032
0.75,12,0.75
1.0,3,1.0


In [28]:
train_full.head()

Unnamed: 0,default,student,balance,income,balance_group,score_balance
0,0,0,1491.479282,54835.77005,"(1250.0, 1500.0]",0.048667
1,0,0,813.200651,49477.51151,"(750.0, 1000.0]",0.001887
2,0,0,1406.947652,27667.83603,"(1250.0, 1500.0]",0.048667
3,0,0,252.106954,38494.65469,"(250.0, 500.0]",0.0
4,0,1,1276.667762,19073.10852,"(1250.0, 1500.0]",0.048667


In [29]:
train_full.isna().sum()

default          0
student          0
balance          0
income           0
balance_group    0
score_balance    0
dtype: int64

# Baseline

In [30]:
from sklearn.metrics import log_loss

In [31]:
# Constant baseline: every row gets the overall default rate of train_full as its score.
train_full['score_mean'] = train_full['default'].mean()

In [32]:
log_loss(train_full['default'],train_full['score_mean'])

0.1462850230726558

In [33]:
log_loss(train_full['default'],train_full['score_balance'])

0.0817439427656515

In [34]:
train_full

Unnamed: 0,default,student,balance,income,balance_group,score_balance,score_mean
0,0,0,1491.479282,54835.77005,"(1250.0, 1500.0]",0.048667,0.033375
1,0,0,813.200651,49477.51151,"(750.0, 1000.0]",0.001887,0.033375
2,0,0,1406.947652,27667.83603,"(1250.0, 1500.0]",0.048667,0.033375
3,0,0,252.106954,38494.65469,"(250.0, 500.0]",0.000000,0.033375
4,0,1,1276.667762,19073.10852,"(1250.0, 1500.0]",0.048667,0.033375
...,...,...,...,...,...,...,...
7995,0,0,783.087084,36917.10317,"(750.0, 1000.0]",0.001887,0.033375
7996,0,0,194.554390,38794.14591,"(-inf, 250.0]",0.000000,0.033375
7997,0,1,743.415446,19610.17998,"(500.0, 750.0]",0.001379,0.033375
7998,0,0,364.773884,31857.18073,"(250.0, 500.0]",0.000000,0.033375


# Uplift

In [35]:
def uplift(df, score, pct, target='default'):
    """Return the lift of ``score`` within the top ``pct`` share of rows.

    Rows are ranked by ``score`` from highest to lowest, the first
    ``round(len(df) * pct)`` rows are kept, and the share of all positive
    ``target`` values captured by those rows is divided by ``pct``.
    A result of 1.0 means the ranking captures positives at the same rate
    as the share of rows inspected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the score column and the 0/1 target column.
        It is not modified.
    score : str
        Name of the column to rank by.
    pct : float
        Share of rows to inspect, in the interval (0, 1].
    target : str, default 'default'
        Name of the 0/1 outcome column.

    Returns
    -------
    float
        The lift, or NaN when ``df`` contains no positive targets
        (the captured share is undefined in that case).

    Raises
    ------
    ValueError
        If ``pct`` is not in the interval (0, 1].
    """
    if not 0 < pct <= 1:
        raise ValueError(f"pct must be in the interval (0, 1], got {pct!r}")

    positives_total = df[target].sum()
    if positives_total == 0:
        # Nothing to capture: return NaN explicitly instead of dividing by zero.
        return float('nan')

    # A stable sort keeps tied scores in their original row order, so the
    # result is reproducible even when many rows share the same score
    # (e.g. a constant baseline score).
    ranked = df.sort_values(score, ascending=False, kind='mergesort')
    positives_found = ranked.head(round(len(df) * pct))[target].sum()

    return (positives_found / positives_total) / pct

In [36]:
uplift(train_full,'score_balance',0.2)

4.606741573033708

In [37]:
uplift(train_full,'score_mean',0.2)

0.8614232209737828

In [38]:
# Bucket the test balances with the same bin edges that were used for train_full,
# so the bucket labels line up with the lookup table when merging.
# assign() returns a new frame instead of writing a column into the frame that
# came out of train_test_split, which avoids pandas' SettingWithCopyWarning.
test = test.assign(
    balance_group=pd.cut(
        test['balance'],
        [-float('inf'), 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500, float('inf')],
    )
)

In [39]:
# Look up the bucket score for every test row; a left join keeps all test rows.
test = test.merge(
    model,
    how='left',
    on='balance_group',
)

In [40]:
test.head()

Unnamed: 0,default,student,balance,income,balance_group,score_balance
0,0,0,69.561798,49074.2476,"(-inf, 250.0]",0.0
1,0,0,536.214278,47956.52456,"(500.0, 750.0]",0.001379
2,0,0,717.66739,27956.27352,"(500.0, 750.0]",0.001379
3,0,0,530.547329,44627.56264,"(500.0, 750.0]",0.001379
4,0,0,998.39307,46051.92286,"(750.0, 1000.0]",0.001887


In [41]:
test.isna().sum()

default          0
student          0
balance          0
income           0
balance_group    0
score_balance    0
dtype: int64

In [42]:
log_loss(train_full['default'],train_full['score_balance'])

0.0817439427656515

In [43]:
log_loss(test['default'],test['score_balance'])

0.0762208202534586

In [46]:
uplift(train_full,'score_balance',0.2)

4.606741573033708

In [45]:
uplift(test,'score_balance',0.2)

4.621212121212121

# Add the student feature

In [48]:
# Empirical default rate per (balance bucket, student flag) combination.
# balance_group is categorical, so observed=False is passed explicitly: it keeps
# the current behaviour and does not depend on the pandas default for
# categorical groupers, which is deprecated and due to change.
model = (
    train_full
    .groupby(['balance_group', 'student'], observed=False)['default']
    .mean()
    .reset_index()
)

In [49]:
# The per-group mean of `default` is used as the score, so name it accordingly.
model = model.rename(columns={'default': 'score_balance_student'})

In [50]:
model

Unnamed: 0,balance_group,student,score_balance_student
0,"(-inf, 250.0]",0,0.0
1,"(-inf, 250.0]",1,0.0
2,"(250.0, 500.0]",0,0.0
3,"(250.0, 500.0]",1,0.0
4,"(500.0, 750.0]",0,0.001859
5,"(500.0, 750.0]",1,0.0
6,"(750.0, 1000.0]",0,0.002662
7,"(750.0, 1000.0]",1,0.0
8,"(1000.0, 1250.0]",0,0.015205
9,"(1000.0, 1250.0]",1,0.006608


In [51]:
# Attach the (bucket, student) score to every row of train_full.
train_full = train_full.merge(
    model,
    how='left',
    on=['balance_group', 'student'],
)

In [52]:
train_full.head()

Unnamed: 0,default,student,balance,income,balance_group,score_balance,score_mean,score_balance_student
0,0,0,1491.479282,54835.77005,"(1250.0, 1500.0]",0.048667,0.033375,0.052239
1,0,0,813.200651,49477.51151,"(750.0, 1000.0]",0.001887,0.033375,0.002662
2,0,0,1406.947652,27667.83603,"(1250.0, 1500.0]",0.048667,0.033375,0.052239
3,0,0,252.106954,38494.65469,"(250.0, 500.0]",0.0,0.033375,0.0
4,0,1,1276.667762,19073.10852,"(1250.0, 1500.0]",0.048667,0.033375,0.042813


In [53]:
# Attach the (bucket, student) score to every test row.
test = test.merge(
    model,
    how='left',
    on=['balance_group', 'student'],
)

In [54]:
test.isna().sum()

default                  0
student                  0
balance                  0
income                   0
balance_group            0
score_balance            0
score_balance_student    0
dtype: int64

In [55]:
log_loss(train_full['default'],train_full['score_balance_student'])

0.08022205818331367

In [56]:
log_loss(test['default'],test['score_balance_student'])

0.0922737551077118

In [57]:
uplift(train_full,'score_balance_student',0.2)

4.606741573033708

In [58]:
uplift(test,'score_balance_student',0.2)

4.621212121212121

# Machine learning

In [59]:
from catboost import CatBoostClassifier

In [81]:
# Names of the feature columns passed to the classifier.
X = [
    'student',
    'balance',
    'income',
]

# Name of the target column, kept in a one-element list.
y = ['default']

In [82]:
# Keyword arguments for the classifier constructor.
params = dict(
    verbose=100,
    random_seed=42,
    learning_rate=0.005,
)

In [83]:
# Build the classifier from the keyword arguments collected in `params`.
# NOTE: this rebinds `model`, which earlier cells used for the grouped score lookup table.
model = CatBoostClassifier(**params)

In [84]:
# Fit on the training split only; the validation split is passed as eval_set
# so its loss is reported next to the training loss in the log.
model.fit(
    train[X],
    train[y],
    eval_set=(val[X], val[y]),
)

0:	learn: 0.6803857	test: 0.6806247	best: 0.6806247 (0)	total: 4.94ms	remaining: 4.93s
100:	learn: 0.1848119	test: 0.1882758	best: 0.1882758 (100)	total: 529ms	remaining: 4.71s
200:	learn: 0.1043676	test: 0.1081916	best: 0.1081916 (200)	total: 1.09s	remaining: 4.34s
300:	learn: 0.0866074	test: 0.0908129	best: 0.0908129 (300)	total: 1.65s	remaining: 3.84s
400:	learn: 0.0803082	test: 0.0856249	best: 0.0856249 (400)	total: 2.22s	remaining: 3.31s
500:	learn: 0.0770833	test: 0.0838703	best: 0.0838703 (500)	total: 2.79s	remaining: 2.78s
600:	learn: 0.0750613	test: 0.0832033	best: 0.0832015 (599)	total: 3.36s	remaining: 2.23s
700:	learn: 0.0737807	test: 0.0831076	best: 0.0830599 (652)	total: 3.92s	remaining: 1.67s
800:	learn: 0.0728527	test: 0.0830918	best: 0.0830599 (652)	total: 4.49s	remaining: 1.11s
900:	learn: 0.0720229	test: 0.0831897	best: 0.0830599 (652)	total: 5.05s	remaining: 555ms
999:	learn: 0.0712328	test: 0.0833193	best: 0.0830599 (652)	total: 5.61s	remaining: 0us

bestTest = 0.0

<catboost.core.CatBoostClassifier at 0x2ac4aacbeb0>

In [85]:
def print_metrics(df,score):
    """Print two quality metrics for the ``score`` column of ``df``.

    First the log loss of ``score`` against the ``default`` column,
    then the uplift of ``score`` at the top 20% of rows.
    """
    loss_value = log_loss(df['default'], df[score])
    print(loss_value)

    lift_value = uplift(df, score, 0.2)
    print(lift_value)

In [86]:
# Keep column 1 of predict_proba as the score
# (presumably the probability of default == 1 — confirm via model.classes_).
test['score_cat_balance_student_income'] = (
    model.predict_proba(test[X])[:, 1]
)

In [87]:
print_metrics(test,'score_cat_balance_student_income')

0.07645542690782194
4.621212121212121
