In [1]:
from functools import reduce

import warnings
import cufflinks as cf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score,davies_bouldin_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.impute import SimpleImputer
from scikitplot.metrics import plot_roc,plot_ks_statistic
from sklearn.ensemble import RandomForestClassifier

# Notebook-wide configuration.
# Show every column when a DataFrame is rendered.
pd.options.display.max_columns = None
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Switch cufflinks/plotly to offline mode so .iplot() renders inside the notebook.
cf.go_offline()

In [2]:
# Directory holding the GiveMeSomeCredit CSV files (also used later for the test file).
ruta = '/home/asm/amv/credit_scoring/datos/GiveMeSomeCredit'

# Read the training set; the first CSV column is only a row label, so it is
# consumed as the index and then discarded in favour of a clean 0..n-1 index.
df = pd.read_csv(f'{ruta}/cs-training.csv', index_col=0).reset_index(drop=True)

# Explicit 1-based identifier as the first column.
df.insert(0, 'ID', df.index + 1)
df.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
# Column roles used throughout the notebook.
target = ['SeriousDlqin2yrs']  # binary label

um = ['ID']  # row identifier column

varc = ['RevolvingUtilizationOfUnsecuredLines']  # real-valued feature

vard = [  # integer-valued features
    'NumberOfTimes90DaysLate',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'age',
]

# Keep only the columns we work with. The explicit .copy() makes this an
# independent frame, so adding columns to it later (e.g. the cluster label)
# is a well-defined operation rather than a write to a slice of the raw frame
# (which pandas flags with SettingWithCopyWarning).
df = df[um + target + varc + vard].copy()

## k-means clustering

### Imputación y estandarización de datos

In [4]:
# Median imputation followed by standardisation of the feature columns.
# Both fitted transformers (imp, sc) are kept as notebook-level objects.
imp = SimpleImputer(strategy='median')
sc = StandardScaler()

X = pd.DataFrame(
    sc.fit_transform(imp.fit_transform(df[varc + vard])),
    columns=varc + vard,
)

# Carry the identifier and the label alongside the scaled features.
X[um + target] = df[um + target]
X.head()

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,NumberOfTimes90DaysLate,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTime60-89DaysPastDueNotWorse,age,ID,SeriousDlqin2yrs
0,-0.02115,-0.063793,0.376593,-0.057852,-0.49386,1,1
1,-0.020385,-0.063793,-0.100419,-0.057852,-0.832342,2,0
2,-0.021582,0.176056,0.138087,-0.057852,-0.967735,3,0
3,-0.023281,-0.063793,-0.100419,-0.057852,-1.509307,4,0
4,-0.020585,-0.063793,0.138087,-0.057852,-0.223074,5,0


### Elección del número de clusters

In [5]:
# Fit k-means for k = 2..10 on the scaled features and record the
# Davies-Bouldin score of each partition.
db_score = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=1234).fit(X[varc + vard])
    db_score.append({
        'k': k,
        'Davies-Bouldin Score': davies_bouldin_score(X[varc + vard], km.labels_),
    })

# One row per candidate k.
db_score = pd.DataFrame(db_score, columns=['k', 'Davies-Bouldin Score'])
db_score

Unnamed: 0,k,Davies-Bouldin Score
0,2,0.040136
1,3,0.448335
2,4,0.521157
3,5,0.522159
4,6,0.547173
5,7,0.47932
6,8,0.50547
7,9,0.464584
8,10,0.505885


In [6]:
# Interactive chart of the Davies-Bouldin score for each candidate k.
db_score.iplot(
    x='k',
    y='Davies-Bouldin Score',
    mode='markers+lines',
    title='Davies-Bouldin Score',
)

## Descripción de los clusters

In [7]:
# Chosen number of clusters.
k = 2

# Refit k-means with the chosen k on the scaled features and attach the
# resulting label to the unscaled frame.
km = KMeans(n_clusters=k, random_state=1234).fit(X[varc + vard])
df['cluster'] = km.labels_
df.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,NumberOfTimes90DaysLate,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTime60-89DaysPastDueNotWorse,age,cluster
0,1,1,0.766127,0,2,0,45,0
1,2,0,0.957151,0,0,0,40,0
2,3,0,0.65818,1,1,0,38,0
3,4,0,0.23381,0,0,0,30,0
4,5,0,0.907239,0,1,0,49,0


In [8]:
# Share of rows falling in each cluster.
df['cluster'].value_counts(normalize=True)

cluster
0    0.998207
1    0.001793
Name: proportion, dtype: float64

In [9]:
# Row-normalised cross-tabulation: target rate (0 / 1) within each cluster.
pd.crosstab(
    index=df['cluster'],
    columns=df['SeriousDlqin2yrs'],
    normalize='index',
)

SeriousDlqin2yrs,0,1
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.934022,0.065978
1,0.453532,0.546468


In [10]:
# Target rate (0 / 1) over the whole population, for comparison with the clusters.
df.loc[:, target].value_counts(normalize=True)

SeriousDlqin2yrs
0                   0.93316
1                   0.06684
Name: proportion, dtype: float64

In [11]:
# Summary statistics of the features for cluster 0.
df.loc[df['cluster'] == 0, varc + vard].describe().round(2)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,NumberOfTimes90DaysLate,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTime60-89DaysPastDueNotWorse,age
count,149731.0,149731.0,149731.0,149731.0,149731.0
mean,6.06,0.09,0.25,0.06,52.33
std,249.98,0.49,0.7,0.33,14.75
min,0.0,0.0,0.0,0.0,0.0
25%,0.03,0.0,0.0,0.0,41.0
50%,0.15,0.0,0.0,0.0,52.0
75%,0.56,0.0,0.0,0.0,63.0
max,50708.0,17.0,13.0,11.0,109.0


In [12]:
# Summary statistics of the features for cluster 1.
df.loc[df['cluster'] == 1, varc + vard].describe().round(2)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,NumberOfTimes90DaysLate,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTime60-89DaysPastDueNotWorse,age
count,269.0,269.0,269.0,269.0,269.0
mean,1.0,97.96,97.96,97.96,34.25
std,0.0,0.27,0.27,0.27,13.06
min,1.0,96.0,96.0,96.0,21.0
25%,1.0,98.0,98.0,98.0,24.0
50%,1.0,98.0,98.0,98.0,29.0
75%,1.0,98.0,98.0,98.0,43.0
max,1.0,98.0,98.0,98.0,79.0


* cluster 0: Adultos mayores que han presentado pocos atrasos en el pago de sus créditos y tienen un RevolvingUtilizationOfUnsecuredLines bajo (tienen límite de crédito alto)
* cluster 1: Personas jóvenes que han presentado atrasos en el pago de sus créditos (los tres contadores de atraso valen entre 96 y 98) y tienen un RevolvingUtilizationOfUnsecuredLines de 1 (el saldo de su tarjeta de crédito es igual a su límite de crédito)

## Segmentación

In [13]:
# One frame per cluster, each re-indexed from 0.
s0, s1 = (
    df.loc[df['cluster'] == label].reset_index(drop=True)
    for label in (0, 1)
)

### Modelo del segmento 0

In [14]:
# 70/30 split of segment 0, stratified on the target, with fresh 0-based indices.
train0, test0 = (
    part.reset_index(drop=True)
    for part in train_test_split(
        s0,
        test_size=0.3,
        random_state=1234,
        stratify=s0[target],
    )
)
train0.shape, test0.shape

((104811, 8), (44920, 8))

In [15]:
# Random-forest search space; the same dict is reused for the segment 1 search.
grid = {
    'n_estimators': range(20, 201, 20),
    'max_depth': range(2, 6),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None],
    'max_features': ['sqrt', 'log2'],
    'n_jobs': [-1],
    'random_state': [1234],
}

# Randomised search (50 sampled configurations) scored by ROC AUC on segment 0.
modelo0 = RandomForestClassifier()
gs0 = RandomizedSearchCV(
    estimator=modelo0,
    param_distributions=grid,
    n_iter=50,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=1234,
)
gs0.fit(train0[varc + vard], train0[target].to_numpy().ravel())

# Keep the best estimator found by the search as the segment 0 model.
modelo0 = gs0.best_estimator_
modelo0

In [16]:
# ROC AUC of the segment 0 model on (train, test), using the class-1 probability.
tuple(
    roc_auc_score(part[target], modelo0.predict_proba(part[varc + vard])[:, 1])
    for part in (train0, test0)
)

(0.854689524426914, 0.8576704175919018)

### Modelo del segmento 1

In [17]:
# 70/30 split of segment 1, stratified on the target, with fresh 0-based indices.
train1, test1 = (
    part.reset_index(drop=True)
    for part in train_test_split(
        s1,
        test_size=0.3,
        random_state=1234,
        stratify=s1[target],
    )
)
train1.shape, test1.shape

((188, 8), (81, 8))

In [18]:
# Randomised search (50 sampled configurations) scored by ROC AUC on segment 1,
# over the same search space (`grid`) used for segment 0.
modelo1 = RandomForestClassifier()
gs1 = RandomizedSearchCV(
    estimator=modelo1,
    param_distributions=grid,
    n_iter=50,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=1234,
)
gs1.fit(train1[varc + vard], train1[target].to_numpy().ravel())

# Keep the best estimator found by the search as the segment 1 model.
modelo1 = gs1.best_estimator_
modelo1

In [19]:
# ROC AUC of the segment 1 model on (train, test), using the class-1 probability.
tuple(
    roc_auc_score(part[target], modelo1.predict_proba(part[varc + vard])[:, 1])
    for part in (train1, test1)
)

(0.7019988577955454, 0.7137592137592137)

## Aplicación de los modelos

In [20]:
# Score every row of each segment with that segment's own model.
# predict_proba returns one column per class, stored as proba_0 / proba_1.
s0[['proba_0', 'proba_1']] = modelo0.predict_proba(s0.loc[:, varc + vard])
s1[['proba_0', 'proba_1']] = modelo1.predict_proba(s1.loc[:, varc + vard])

In [21]:
# Stack both scored segments back into a single table with a fresh 0-based index.
tad = pd.concat([s0, s1], ignore_index=True)
tad.head()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,NumberOfTimes90DaysLate,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTime60-89DaysPastDueNotWorse,age,cluster,proba_0,proba_1
0,1,1,0.766127,0,2,0,45,0,0.784511,0.215489
1,2,0,0.957151,0,0,0,40,0,0.911253,0.088747
2,3,0,0.65818,1,1,0,38,0,0.629909,0.370091
3,4,0,0.23381,0,0,0,30,0,0.973538,0.026462
4,5,0,0.907239,0,1,0,49,0,0.826748,0.173252


In [22]:
# ROC AUC of the combined segment-wise scores over the full labelled table
# (this table includes the rows the models were trained on).
roc_auc_score(y_true=tad[target], y_score=tad['proba_1'])

0.8570430552388557

In [23]:
# Score the unlabeled test file with the segment-specific models and write
# the submission CSV (columns: Id, Probability).
pred = pd.read_csv(ruta + '/cs-test.csv')

# k-means was fitted on median-imputed, standardised features, so the test
# features must go through the same fitted imputer and scaler before calling
# km.predict. Feeding raw values would measure distances to the centroids in
# a different feature space and could assign rows to the wrong segment.
X_pred = pd.DataFrame(
    sc.transform(imp.transform(pred[varc + vard])),
    columns=varc + vard,
    index=pred.index,
)
pred['cluster'] = km.predict(X_pred)

pred['Id'] = pred.index + 1

# The random forests were trained on the unscaled features, so they keep
# receiving the raw columns; each row takes the class-1 probability from the
# model of its own segment.
pred['Probability'] = np.where(
    pred['cluster'] == 0,
    modelo0.predict_proba(pred[varc + vard])[:, 1],
    modelo1.predict_proba(pred[varc + vard])[:, 1],
)

pred[['Id', 'Probability']].to_csv(ruta + '/pred_segmentacion_kmeans.csv', index=False)
pred.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,cluster,Id,Probability
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0,0,1,0.08842
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0,0,2,0.032755
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0,0,3,0.011657
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0,0,4,0.075617
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0,0,5,0.090374
