# Finding best parameters using GridSearchCV

## Navigation
<ul>
<li><a href="#grid">Grid Search to find best parameter</a></li>
<li><a href="#lr">Logistic Regression</a></li>
<li><a href="#knn">K-Nearest Neighbors</a></li>
<li><a href="#rf">Random Forest</a></li>
<li><a href="#xgb">XGBoost</a></li>
    </ul>

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import imblearn.over_sampling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline

import time

df = pd.read_csv("../data/processed/clean.csv")

In [43]:
# split data
X = df.drop("LOAN_DEFAULT" , axis = 1)
y = df["LOAN_DEFAULT"]
X_train , X_val , y_train , y_val = train_test_split(X, y, test_size = 0.2, random_state=7)
# X_train_sub_set , X_vald_sub_set , y_train_sub_set , y_vald_sub_set = train_test_split(x[0:1000], y[0:1000] , test_size = 0.2, random_state=42)

In [49]:
####### SUB #########
X_train_sub , X_val_sub , y_train_sub , y_val_sub = train_test_split(X, y, test_size = 0.5, random_state=7)

# ROS ratio argument
n_pos_train = np.sum(y_train_sub == 1)
n_neg_train = np.sum(y_train_sub == 0)

ratio = {1 : n_pos_train * 4, 0 : n_neg_train} 


# randomly oversample positive samples 4 times

smote = imblearn.over_sampling.SMOTE(sampling_strategy=ratio, random_state = 10)
    
X_tr_smote, y_tr_smote = smote.fit_resample(X_train_sub, y_train_sub)

  f"After over-sampling, the number of samples ({n_samples})"


## Scale data using Z score

In [34]:
# scaling data
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_val_scaled = ss.transform(X_val)

# GridSearch finding best parameters
<a id='grid'></a>
<a href="#">Back to top</a>

> "SUB" tags means that the execution of GridSearchCV on this part has taken subset of data and not all based on 0.5 split to speed the process

## Logistics Regression
<a id='lr'></a>
<a href="#">Back to top</a>

In [None]:
# using F1 score
param_grid = {
      'C' : np.linspace(0.01,50,100),
    'solver': ['lbfgs', 'saga'],
      'penalty' : ['l1', 'l2'],
    
}

lr = LogisticRegression(n_jobs=-1)
lr.fit(ss.fit_transform(X_train_scaled), y_train)

grid = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1)
grid.fit(X_train , y_train)
# view the complete results
# print("grid_cv" , grid.cv_results_)
# examine the best model
print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

print("*"*50)
print("*"*50)

# using accuracy
param_grid = {
      'C' : np.linspace(0.01,50,100),
    'solver': ['lbfgs', 'saga'],
      'penalty' : ['l1', 'l2'],
    
}

model = LogisticRegression(n_jobs=-1)
model.fit(ss.fit_transform(X_train), y_train)

grid = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1)
grid.fit(X_train , y_train)
## view the complete results
## print("grid_cv" , grid.cv_results_)
## examine the best model
print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

In [52]:
################### SUB ###################
param_grid = {
      'C' : np.linspace(0.01,50,100),
    'solver': ['lbfgs', 'saga'],
      'penalty' : ['l1', 'l2'],
    
}

model = LogisticRegression(n_jobs=-1)
model.fit(ss.fit_transform(X_tr_smote), y_tr_smote)

grid = GridSearchCV(model, param_grid, scoring='accuracy', cv=2, n_jobs=-1)
grid.fit(X_tr_smote , y_tr_smote)
## view the complete results
## print("grid_cv" , grid.cv_results_)
## examine the best model
print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

200 fits failed out of a total of 800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "/home/abdulium/miniconda3/envs/t5/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/abdulium/miniconda3/envs/t5/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/abdulium/miniconda3/envs/t5/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or '

Best params:  {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best estimator:  LogisticRegression(C=0.01, n_jobs=-1)
Best score:  0.5470481440890926


## K-Nearest Neighbor
<a id='knn'></a>
<a href="#">Back to top</a>

In [17]:
start = time.time()

# define param_grid
param_grid = {
    'n_neighbors': list(range(3,26)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# grid search
grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose =1, n_jobs=-1)

grid.fit(X_train_scaled, y_train)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 5 folds for each of 92 candidates, totalling 460 fits
Best params:  {'metric': 'euclidean', 'n_neighbors': 24, 'weights': 'uniform'}
Best estimator:  KNeighborsClassifier(metric='euclidean', n_neighbors=24)
Best score:  0.7773039734622502
3961.502706050873


In [57]:
################### SUB ###################
start = time.time()

# define param_grid
param_grid = {
    'n_neighbors': list(range(3,26)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# grid search
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=2, verbose =1, n_jobs=-1)

grid.fit(ss.fit_transform(X_tr_smote), y_tr_smote)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 2 folds for each of 92 candidates, totalling 184 fits
Best params:  {'metric': 'manhattan', 'n_neighbors': 22, 'weights': 'distance'}
Best estimator:  KNeighborsClassifier(metric='manhattan', n_neighbors=22, weights='distance')
Best score:  0.6597422868719073
51.98113213380178


## Random Forest
<a id='rf'></a>
<a href="#">Back to top</a>

In [31]:
start = time.time()

# define param_grid
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [5, 15, 25],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10]

}

# grid search
grid = GridSearchCV(RandomForestClassifier(random_state=7), param_grid, verbose =1, n_jobs=-1)

grid.fit(X_train, y_train)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best params:  {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best estimator:  RandomForestClassifier(max_depth=15, min_samples_leaf=2, min_samples_split=10,
                       random_state=7)
Best score:  0.7791774911012929
33.56065210898717


In [56]:
################### SUB ###################
start = time.time()

# define param_grid
param_grid = {
    "n_estimators": list(range(100,300,50)),
    "max_depth": list(range(5,30,5)),
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [2, 5, 10]

}

# grid search
grid = GridSearchCV(RandomForestClassifier(random_state=7), param_grid, cv=2, verbose =1, n_jobs=-1)

grid.fit(X_tr_smote, y_tr_smote)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 2 folds for each of 180 candidates, totalling 360 fits
Best params:  {'max_depth': 25, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 250}
Best estimator:  RandomForestClassifier(max_depth=25, min_samples_leaf=2, n_estimators=250,
                       random_state=7)
Best score:  0.6726113380492178
11.624450532595317


## XGBoost
<a id='xgb'></a>
<a href="#">Back to top</a>

In [41]:
start = time.time()

param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [5, 15, 25],
    "learning_rate": np.linspace(0.1, 1, 15),
    "objective":["binary:logistic"]

}

# grid search
grid = GridSearchCV(XGBClassifier(random_state=7), param_grid, verbose =1, n_jobs=-1)

grid.fit(X_train, y_train)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 5 folds for each of 135 candidates, totalling 675 fits




Best params:  {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'objective': 'binary:logistic'}
Best estimator:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=7,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Best score:  0.7788039359748878
261.58992855151496


In [61]:
################### SUB ###################

start = time.time()

param_grid = {
    "n_estimators": list(range(100,300,50)),
    "max_depth": [5, 15, 25],
    "learning_rate": np.linspace(0.1, 1, 5)

}

# grid search
grid = GridSearchCV(XGBClassifier(random_state=7), param_grid, cv=2, verbose =1, n_jobs=-1)

grid.fit(X_tr_smote, y_tr_smote)

print("Best params: ", grid.best_params_)
print("Best estimator: ", grid.best_estimator_)
print("Best score: ", grid.best_score_)

end = time.time()
print((end - start)/60)

Fitting 2 folds for each of 60 candidates, totalling 120 fits




Best params:  {'learning_rate': 0.325, 'max_depth': 25, 'n_estimators': 250}
Best estimator:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.325, max_delta_step=0,
              max_depth=25, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=250, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=7,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
Best score:  0.7191502551669933
26.475709172089896
