In [1]:
from functools import reduce

import cufflinks as cf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

# Notebook-wide configuration. Every call below is a global side effect that
# stays active for the rest of the session.
# Apply seaborn's default plotting theme.
sns.set()
# Presumably switches cufflinks/plotly to offline rendering -- confirm against the cufflinks docs.
cf.go_offline()
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences every warning for the whole session, including
# library deprecation and data-conversion warnings.
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the GiveMeSomeCredit CSV files read by this notebook
# (cs-training.csv and cs-test.csv).
# The original location is kept as the default so existing runs are unchanged;
# set the CREDIT_SCORING_DATA_DIR environment variable to point the notebook at
# another copy of the data without editing this cell.
import os

ruta=os.environ.get('CREDIT_SCORING_DATA_DIR','/home/asm/amv/credit_scoring/datos/GiveMeSomeCredit')

In [3]:
# Read the training file using its first CSV column as a throw-away index,
# then rebuild a clean 1..n integer index named 'ID'.
df=(
    pd.read_csv(ruta+'/cs-training.csv',index_col=0)
    .reset_index(drop=True)
    .pipe(lambda datos:datos.assign(ID=datos.index+1))
    .set_index('ID')
)
df.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
# Column roles used throughout the notebook. The order inside each list is
# significant: varc+vard is the feature order handed to the model.
target=['SeriousDlqin2yrs']

varc=[
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]

vard=[
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
    'age',
    'NumberOfOpenCreditLinesAndLoans',
]

In [5]:
# Every column of df must appear exactly once across target, varc and vard.
# A count-only comparison lets a misspelled or duplicated name slip through
# (the totals can still match), so the names themselves are compared as well.
_declaradas=varc+vard+target
assert len(_declaradas)==len(df.columns)
assert len(_declaradas)==len(set(_declaradas)),'a column is declared more than once'
assert set(_declaradas)==set(df.columns),set(_declaradas)^set(df.columns)

In [6]:
# 70/30 split with a fixed seed, stratified on the target column.
train,test=train_test_split(
    df,
    test_size=0.3,
    random_state=24,
    stratify=df[target],
)
train.shape,test.shape

((105000, 11), (45000, 11))

In [7]:
# Summary statistics of the predictors on the training split, with extra tail percentiles.
train.loc[:,varc+vard].describe(
    percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99]
).round(2)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,DebtRatio,MonthlyIncome,NumberOfTime30-59DaysPastDueNotWorse,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,age,NumberOfOpenCreditLinesAndLoans
count,105000.0,105000.0,84322.0,105000.0,105000.0,105000.0,105000.0,102278.0,105000.0,105000.0
mean,6.09,349.42,6661.11,0.41,0.26,1.02,0.23,0.76,52.3,8.45
std,270.81,2168.13,12302.88,4.1,4.08,1.13,4.06,1.12,14.78,5.15
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0
5%,0.0,0.0,1300.0,0.0,0.0,0.0,0.0,0.0,29.0,2.0
10%,0.0,0.03,2009.1,0.0,0.0,0.0,0.0,0.0,33.0,3.0
25%,0.03,0.17,3400.0,0.0,0.0,0.0,0.0,0.0,41.0,5.0
50%,0.15,0.37,5400.0,0.0,0.0,1.0,0.0,0.0,52.0,8.0
75%,0.56,0.86,8253.25,0.0,0.0,2.0,0.0,1.0,63.0,11.0


In [8]:
def outliers(df:pd.DataFrame,lower:float=.005,upper:float=.995)->pd.DataFrame:
    """Return a copy of ``df`` without the rows that hold an extreme value.

    A row is dropped when at least one of its columns is strictly below that
    column's ``lower`` quantile or strictly above its ``upper`` quantile. The
    quantiles are computed per column on ``df`` itself.

    Parameters
    ----------
    df : pd.DataFrame
        Data to filter. Every column takes part in the check. It is not modified.
    lower, upper : float
        Quantile levels used as the lower and upper cut-offs.

    Returns
    -------
    pd.DataFrame
        The surviving rows, with the same columns and index labels as ``df``.

    Notes
    -----
    Missing values never compare as below or above a bound, so a NaN on its
    own does not cause a row to be dropped.
    """
    l,u=df.quantile(lower),df.quantile(upper)
    # A boolean mask replaces the former temporary 'extremo' column, so an input
    # column that happens to carry that name is neither overwritten nor dropped.
    extremo=(df.lt(l,axis=1)|df.gt(u,axis=1)).any(axis=1)
    # .copy() hands back an independent frame that callers can assign into.
    return df.loc[~extremo].copy()

In [9]:
# Drop the rows with extreme values from the training split (default cut-offs
# of outliers) and look at the resulting distribution.
train_=outliers(df=train)
train_.describe(
    percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.90,0.95,0.99]
).round(2)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,100967.0,100967.0,100967.0,100967.0,100967.0,81459.0,100967.0,100967.0,100967.0,100967.0,98434.0
mean,0.06,0.31,52.3,0.23,296.72,6285.21,8.35,0.07,0.99,0.05,0.75
std,0.24,0.35,14.51,0.63,860.24,4240.89,4.81,0.36,0.99,0.25,1.09
min,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,0.0,0.0,29.0,0.0,0.01,1333.0,2.0,0.0,0.0,0.0,0.0
10%,0.0,0.0,33.0,0.0,0.04,2080.0,3.0,0.0,0.0,0.0,0.0
25%,0.0,0.03,41.0,0.0,0.18,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.15,52.0,0.0,0.36,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.54,63.0,0.0,0.83,8168.0,11.0,0.0,2.0,0.0,1.0


In [10]:
# Fit one imputer per column group on the filtered training data only:
# column mean for varc, most frequent value for vard.
imp_c=SimpleImputer(strategy='mean').fit(train_[varc])
imp_d=SimpleImputer(strategy='most_frequent').fit(train_[vard])
imp_c,imp_d

(SimpleImputer(), SimpleImputer(strategy='most_frequent'))

In [11]:
# Fill the missing values of the filtered training frame in place, using the
# imputers that were fitted on this same frame.
train_[varc]=imp_c.transform(train_[varc])
train_[vard]=imp_d.transform(train_[vard])
# Re-inspect the distribution after imputation.
train_.describe(percentiles=[.01,.05,.10,.25,.50,.75,.90,.95,.99]).round(2)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0,100967.0
mean,0.06,0.31,52.3,0.23,296.72,6285.21,8.35,0.07,0.99,0.05,0.73
std,0.24,0.35,14.51,0.63,860.24,3809.22,4.81,0.36,0.99,0.25,1.08
min,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,0.0,0.0,29.0,0.0,0.01,1545.0,2.0,0.0,0.0,0.0,0.0
10%,0.0,0.0,33.0,0.0,0.04,2348.0,3.0,0.0,0.0,0.0,0.0
25%,0.0,0.03,41.0,0.0,0.18,3900.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.15,52.0,0.0,0.36,6285.21,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.54,63.0,0.0,0.83,7375.0,11.0,0.0,2.0,0.0,1.0


In [12]:
from xgboost import XGBClassifier

In [13]:
# Randomised hyper-parameter search for an XGBoost classifier, scored by
# 5-fold cross-validated ROC AUC on the cleaned and imputed training data.
xgb=XGBClassifier()
# NOTE(review): learning_rate=10 lies outside the [0, 1] range that the XGBoost
# parameter reference gives for eta, and n_jobs=-1 is requested both here (per
# model) and in the search itself (per fit), which can oversubscribe the CPU.
# Both are left unchanged so the sampled candidates -- and therefore the
# selected model -- stay the same; confirm whether they are intended.
grid={'n_estimators':range(20,501,20),'max_depth':range(1,6),'learning_rate':[.0001,.001,.01,.1,1,10],'n_jobs':[-1],
      'subsample':[.4,.5,.6,.7,.8,.9,1],'random_state':[0]}
xgb_grid=RandomizedSearchCV(xgb,grid,cv=5,scoring='roc_auc',n_iter=20,n_jobs=-1,random_state=24)
# target is a one-element list, so train_[target] is an (n, 1) DataFrame.
# Estimators expect a 1-D label vector, so it is flattened before fitting.
xgb_grid.fit(train_[varc+vard],train_[target].values.ravel())
# Keep the best candidate, refitted on the whole training frame by the search.
model=xgb_grid.best_estimator_
model

In [14]:
# ROC AUC of the selected model on the rows it was fitted on.
roc_auc_score(
    y_true=train_[target],
    y_score=model.predict_proba(train_[varc+vard])[:,1],
)

0.8619901728865242

In [15]:
# Impute the hold-out split with the imputers fitted on the training data.
# The work happens on a copy, so test itself is left untouched.
test_imp=test.copy()
test_imp[varc]=imp_c.transform(test_imp[varc])
test_imp[vard]=imp_d.transform(test_imp[vard])
test_imp.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
103371,0,0.016135,61.0,0.0,0.020947,4200.0,19.0,0.0,0.0,0.0,1.0
48446,1,0.718657,55.0,0.0,0.705937,12833.0,9.0,0.0,2.0,0.0,0.0
74172,0,0.168523,38.0,0.0,0.330087,4825.0,11.0,0.0,2.0,0.0,1.0
112404,0,0.233777,48.0,1.0,0.433428,6000.0,9.0,0.0,2.0,0.0,2.0
127704,0,0.999911,49.0,1.0,0.146971,5000.0,2.0,2.0,0.0,1.0,0.0


In [16]:
# ROC AUC of the selected model on the imputed hold-out split.
roc_auc_score(
    y_true=test_imp[target],
    y_score=model.predict_proba(test_imp[varc+vard])[:,1],
)

0.8690647216427749

In [17]:
# Score the file to be submitted. It is read without index_col, so the CSV's
# first column arrives as 'Unnamed: 0'.
pred=pd.read_csv(ruta+'/cs-test.csv')
# Copy that column under the name used in the submission file.
pred['Id']=pred['Unnamed: 0']
# Impute with the imputers fitted on the training data; this must happen
# before predict_proba is called on these columns.
pred[varc]=imp_c.transform(pred[varc])
pred[vard]=imp_d.transform(pred[vard])
# Column 1 of predict_proba is the probability of the positive class.
pred['Probability']=model.predict_proba(pred[varc+vard])[:,1]
# NOTE(review): this displays the whole frame; pred.head() would keep the output short.
pred

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,Id,Probability
0,1,,0.885519,43.0,0.0,0.177513,5700.00000,4.0,0.0,0.0,0.0,0.0,1,0.073320
1,2,,0.463295,57.0,0.0,0.527237,9141.00000,15.0,0.0,4.0,0.0,2.0,2,0.063096
2,3,,0.043275,59.0,0.0,0.687648,5083.00000,12.0,0.0,1.0,0.0,2.0,3,0.016297
3,4,,0.280308,38.0,1.0,0.925961,3200.00000,7.0,0.0,2.0,0.0,0.0,4,0.073488
4,5,,1.000000,27.0,0.0,0.019917,3865.00000,4.0,0.0,0.0,0.0,1.0,5,0.095443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101498,101499,,0.282653,24.0,0.0,0.068522,1400.00000,5.0,0.0,0.0,0.0,0.0,101499,0.046608
101499,101500,,0.922156,36.0,3.0,0.934217,7615.00000,8.0,0.0,2.0,0.0,4.0,101500,0.354349
101500,101501,,0.081596,70.0,0.0,836.000000,6285.20711,3.0,0.0,0.0,0.0,0.0,101501,0.009356
101501,101502,,0.335457,56.0,0.0,3568.000000,6285.20711,8.0,0.0,2.0,1.0,3.0,101502,0.047492


In [18]:
# Write the submission file: only the identifier and the predicted probability, without the index.
pred.to_csv('xgboost.csv',columns=['Id','Probability'],index=False)