## Finding important features
This notebook contains codes on training some models based on the columns that previous RF model had found important.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/cs-training.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,149998,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [4]:
[df[col].fillna(df[col].median(), inplace=True) for col in df.columns]

[None, None, None, None, None, None, None, None, None, None, None, None]

In [5]:
df

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,149998,0,0.246044,58,0,3870.000000,5400.0,18,0,1,0,0.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [6]:
df.dtypes

Unnamed: 0                                int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

In [7]:
x = df[["age", "NumberOfTimes90DaysLate", "MonthlyIncome", "RevolvingUtilizationOfUnsecuredLines", "DebtRatio"]].values
x

array([[4.50000000e+01, 0.00000000e+00, 9.12000000e+03, 7.66126609e-01,
        8.02982129e-01],
       [4.00000000e+01, 0.00000000e+00, 2.60000000e+03, 9.57151019e-01,
        1.21876201e-01],
       [3.80000000e+01, 1.00000000e+00, 3.04200000e+03, 6.58180140e-01,
        8.51133750e-02],
       ...,
       [5.80000000e+01, 0.00000000e+00, 5.40000000e+03, 2.46043918e-01,
        3.87000000e+03],
       [3.00000000e+01, 0.00000000e+00, 5.71600000e+03, 0.00000000e+00,
        0.00000000e+00],
       [6.40000000e+01, 0.00000000e+00, 8.15800000e+03, 8.50282951e-01,
        2.49908077e-01]])

In [8]:
y = df.iloc[:,1].astype("uint8")
y

0         1
1         0
2         0
3         0
4         0
         ..
149995    0
149996    0
149997    0
149998    0
149999    0
Name: SeriousDlqin2yrs, Length: 150000, dtype: uint8

In [9]:
from sklearn.metrics import auc, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import KFold


In [10]:
# Regression models for comparison
models = [
          RandomForestRegressor(),
          GradientBoostingRegressor()
         ]

n_splits = 5
shuffle = True

# splitting the dataset
kf = KFold(n_splits=n_splits, shuffle=shuffle)
kf.get_n_splits(x)

results = {}

for model in models:  
    model_auc = []
    for train_index, test_index in kf.split(x):
        # Instantiate and fit Regressor Model
        reg_model = model
        reg_model.fit(x[train_index], y[train_index])

        # Make predictions with model
        y_test_preds = reg_model.predict(x[test_index])

        # Grab model name and store results associated with model
        name = str(model).split("(")[0]
        auc_res = roc_auc_score(y[test_index], y_test_preds)
        
        model_auc.append(auc_res)
    results[name] = "{:.4f}".format(np.mean(model_auc))

In [12]:
results

{'RandomForestRegressor': '0.7861', 'GradientBoostingRegressor': '0.8314'}

# Results
The idea was not good :(