## Using GB and nearest neighbour imputator
This notebook contains the code for using Gradient boosting and nearest neighbours model

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/cs-training.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,149998,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [4]:
x = df.iloc[:,2:].values
x

array([[ 0.76612661, 45.        ,  2.        , ...,  6.        ,
         0.        ,  2.        ],
       [ 0.95715102, 40.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.65818014, 38.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.24604392, 58.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , 30.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85028295, 64.        ,  0.        , ...,  2.        ,
         0.        ,  0.        ]])

In [5]:
# Replacing nan values with iterative imputer
import numpy as np
from sklearn.impute import KNNImputer

imp_mean = KNNImputer()
x = imp_mean.fit_transform(x)
x

array([[ 0.76612661, 45.        ,  2.        , ...,  6.        ,
         0.        ,  2.        ],
       [ 0.95715102, 40.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.65818014, 38.        ,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.24604392, 58.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , 30.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.85028295, 64.        ,  0.        , ...,  2.        ,
         0.        ,  0.        ]])

In [6]:
y = df.iloc[:,1].astype("uint8")
y

0         1
1         0
2         0
3         0
4         0
         ..
149995    0
149996    0
149997    0
149998    0
149999    0
Name: SeriousDlqin2yrs, Length: 150000, dtype: uint8

In [7]:
from sklearn.metrics import auc, roc_auc_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import pearsonr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
import numpy as np
from sklearn.model_selection import KFold


In [8]:
# Regression models for comparison, let's try 300 estimators
models = [ 
          GradientBoostingRegressor(n_estimators=300), 
         ]

n_splits = 5
shuffle = True


# splitting the dataset
kf = KFold(n_splits=n_splits, shuffle=shuffle)
kf.get_n_splits(x)

results = {}

for model in models:  
    model_auc = []
    for train_index, test_index in kf.split(x):
        # Instantiate and fit Regressor Model
        reg_model = model
        reg_model.fit(x[train_index], y[train_index])

        # Make predictions with model
        y_test_preds = reg_model.predict(x[test_index])

        # Grab model name and store results associated with model
        name = str(model).split("(")[0]
        auc_res = roc_auc_score(y[test_index], y_test_preds)
        
        model_auc.append(auc_res)
    results[name] = "{:.4f}".format(np.mean(model_auc))

In [9]:
results

{'GradientBoostingRegressor': '0.8643'}

# Predict on test

For this purpose, I will train on the whole data and predict on the test

In [10]:
final_model = model.fit(x, y)

In [11]:
df_test = pd.read_csv("../data/cs-test.csv")

In [12]:
df_test

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,5,,1.000000,27,0,0.019917,3865.0,4,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
101498,101499,,0.282653,24,0,0.068522,1400.0,5,0,0,0,0.0
101499,101500,,0.922156,36,3,0.934217,7615.0,8,0,2,0,4.0
101500,101501,,0.081596,70,0,836.000000,,3,0,0,0,
101501,101502,,0.335457,56,0,3568.000000,,8,0,2,1,3.0


In [None]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [13]:
x_test = df_test.iloc[:,2:] # selecting all the columns except the first 2
# Replacing nan values with iterative imputer
imp_mean = IterativeImputer()
x_test = imp_mean.fit_transform(x_test)
x_test

array([[8.85519080e-01, 4.30000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.63295269e-01, 5.70000000e+01, 0.00000000e+00, ...,
        4.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       [4.32750360e-02, 5.90000000e+01, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 2.00000000e+00],
       ...,
       [8.15963730e-02, 7.00000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 2.47675398e-01],
       [3.35456547e-01, 5.60000000e+01, 0.00000000e+00, ...,
        2.00000000e+00, 1.00000000e+00, 3.00000000e+00],
       [4.41841663e-01, 2.90000000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [14]:
predictions = pd.DataFrame(final_model.predict(x_test)) # predicting the results and saving it as df

predictions[predictions <= 0] = 0 # replace values less than 0 with 0
predictions[predictions >= 1] = 1 # replace values more than 1 with 1

predictions['Id'] = df_test["Unnamed: 0"] # adding the Id to the df
predictions.columns = ["Probability", "Id"] # renaming it to Id

# Switching the columns 
columns_titles = ["Id","Probability"]
predictions=predictions.reindex(columns=columns_titles)


In [15]:
predictions

Unnamed: 0,Id,Probability
0,1,0.071931
1,2,0.041822
2,3,0.016305
3,4,0.076266
4,5,0.103605
...,...,...
101498,101499,0.031508
101499,101500,0.347268
101500,101501,0.000000
101501,101502,0.126650


In [16]:
predictions.to_csv("output_iterative_imputations.csv", index=False) # save outputs to CSV for submission