# Cross-Validation and Hyper-Parameter Tuning

In [0]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Memory management
import gc 

In [0]:
# One-time setup of google-drive-ocamlfuse on the Colab VM; left commented out,
# uncomment these four lines when the tool is not installed yet.
#!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
#!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
#!apt-get update -qq 2>&1 > /dev/null
#!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user, then fetch the application-default credentials
# whose client id/secret are handed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run headless with stdin closed and show only the output line
# containing "URL" (the link to open in order to obtain a verification code).
# NOTE(review): creds.client_secret is interpolated into the shell command line.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code interactively without echoing it.
vcode = getpass.getpass()
# Second pass: pipe the code into the tool to finish authorization.
# NOTE(review): {vcode} is pasted unquoted into a shell command; shell
# metacharacters in the entered value would be interpreted by the shell.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

In [0]:
# Mount Google Drive on the local `drive` directory via FUSE.
# NOTE(review): the recorded output shows "mountpoint is not empty", i.e. this
# cell was re-run while `drive` already had content; the message is emitted by fuse.
!google-drive-ocamlfuse drive
# List the notebook folder to confirm the Drive contents are reachable.
!ls drive/"Colab Notebooks"

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option
 all-HCDR
'COLAB HCDR.ipynb'
 HCDR2.ipynb
'Home Credit Default Risk (d66df98a).ipynb'
'Home Credit Default Risk.ipynb'
 log_reg_baseline.csv
 log_reg_corrs_removed_with_bureau.csv
 log_reg_eng.csv
 log_reg_with_bureau.csv
 random_forest_baseline.csv
 random_forest_removed_with_bureau.csv
 random_forest_with_bureau.csv


In [0]:
# Load the train and test tables from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/Colab Notebooks/all-HCDR/train_bureau_raw.csv',
        'drive/Colab Notebooks/all-HCDR/test_bureau_raw.csv',
    )
)

In [0]:
# Label-encode every object-typed column that has at most two distinct values.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()   # one encoder instance, refit for each qualifying column
le_count = 0          # number of columns that ended up label encoded

for col in train:
    # Only string/object columns are candidates.
    if train[col].dtype != 'object':
        continue
    # Columns with more than two distinct values are left untouched here.
    if len(train[col].unique()) > 2:
        continue

    # Learn the mapping on the training column only, then reuse that same
    # mapping for the test column so both frames share identical codes.
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [0]:
# Expand the remaining categorical columns into indicator (dummy) columns.
# Each frame is encoded on its own, so the resulting column sets may differ.
train, test = pd.get_dummies(train), pd.get_dummies(test)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

Training Features shape:  (307511, 458)
Testing Features shape:  (48744, 454)


In [0]:
# Set the labels aside first: the alignment below drops 'TARGET' from `train`.
train_labels = train['TARGET']

# Inner join on the column axis keeps only the columns present in both frames.
# 'TARGET' is deliberately not re-attached, so `train` holds features only.
train, test = train.align(test, axis = 1, join = 'inner')

print('Training Data Shape: ', train.shape)
print('Testing Data Shape: ', test.shape)

Training Data Shape:  (307511, 454)
Testing Data Shape:  (48744, 454)


In [0]:
# Fill missing values with the per-column median, then rescale every feature
# to the [0, 1] range. After this cell `train` is a NumPy array, not a DataFrame.
from sklearn.preprocessing import MinMaxScaler

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22, so the
# original import fails on current installs. `SimpleImputer` is the replacement;
# the fallback keeps the cell working on old environments as well.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy = 'median')
scaler = MinMaxScaler(feature_range = (0, 1))

# Both transformers are fitted on the training features only and stay
# available (`imputer`, `scaler`) for transforming other data later.
# NOTE(review): `test` is not transformed in this cell.
train = imputer.fit_transform(train)
train = scaler.fit_transform(train)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression with C = 1 and a fixed seed for repeatable results.
log_reg = LogisticRegression(C = 1, random_state=23)

# 10-fold cross-validation on the full training matrix, scored by accuracy.
scores = cross_val_score(
    log_reg,
    train,
    train_labels,
    scoring='accuracy',
    cv=10,
)

# Per-fold scores followed by their mean and spread.
print(scores)
print("Mean: {}".format(scores.mean()))
print("Std:  {}".format(scores.std()))

[0.91886707 0.91938736 0.91919225 0.91886707 0.91945239 0.91938474
 0.91918699 0.91941463 0.9195122  0.91925203]
Mean: 0.9192516718146552
Std:  0.00021780574587714404


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    train_labels,
    test_size=0.3,
    random_state=4330,
)

In [0]:
# Candidate regularization settings. Each value is passed to LogisticRegression
# as `C`, which scikit-learn defines as the INVERSE of regularization strength,
# so larger values here mean weaker regularization.
alpha = [0.01, 0.1, 1, 10, 100]

for a in alpha:
    logistic = LogisticRegression(C = a, random_state=23)
    logistic.fit(x_train, y_train)

    print('alpha:', a)
    # Accuracy on the fitting split first, then on the held-out split.
    print(logistic.score(x_train, y_train), logistic.score(x_test, y_test))
    print()

alpha: 0.01
0.9192778864334261 0.91919049580506

alpha: 0.1
0.9193104057010922 0.9190495805059943

alpha: 1
0.9193429249687582 0.91919049580506

alpha: 10
0.9194033178944239 0.919125457974722

alpha: 100
0.9193707986267577 0.919157976889891



# Заключение

Крос-валидацията ни показва, че моделът е стабилен и няма overfitting. Промяната на регуларизационния параметър не води до съществени изменения в резултатите.