# DSC 550 

### Week 8:  Exercise 8.2

Author: Kimberly Cable<br>
Date: 7-30-2022

## Exercise 8.2: Best model Selection and Hyperparameter Tuning

In [1]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.pipeline import Pipeline, FeatureUnion 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

**1. Import the dataset and ensure that it loaded properly.**

In [2]:
# Import Loan Training data
loan_df = pd.read_csv("Loan_Train.csv")
loan_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
loan_df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [4]:
loan_df.describe(include = ['O'])

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
count,614,601,611,599,614,582,614,614
unique,614,2,2,4,2,2,3,2
top,LP001002,Male,Yes,0,Graduate,No,Semiurban,Y
freq,1,489,398,345,480,500,233,422


**2. Prepare the data for modeling by performing the following steps:**
* Drop the column “Loan_ID.”
* Drop any rows with missing data.
* Convert the categorical features into dummy variables.

In [5]:
# drop Load_ID
loan_df.drop('Loan_ID', axis = 1, inplace = True)

In [6]:
loan_df.shape

(614, 12)

In [7]:
loan_df.dropna(inplace = True)
loan_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [8]:
loan_df.shape

(480, 12)

In [9]:
# Select categorical columns
categorical_columns = loan_df.select_dtypes('object').columns.difference(['Loan_Status'])
categorical_columns

Index(['Dependents', 'Education', 'Gender', 'Married', 'Property_Area',
       'Self_Employed'],
      dtype='object')

In [10]:
loan_dummies = pd.get_dummies(loan_df, columns = categorical_columns)
loan_dummies.shape

(480, 21)

In [11]:
loan_dummies.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,...,Education_Not Graduate,Gender_Female,Gender_Male,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_No,Self_Employed_Yes
1,4583,1508.0,128.0,360.0,1.0,N,0,1,0,0,...,0,0,1,0,1,1,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,Y,1,0,0,0,...,0,0,1,0,1,0,0,1,0,1
3,2583,2358.0,120.0,360.0,1.0,Y,1,0,0,0,...,1,0,1,0,1,0,0,1,1,0
4,6000,0.0,141.0,360.0,1.0,Y,1,0,0,0,...,0,0,1,1,0,0,0,1,1,0
5,5417,4196.0,267.0,360.0,1.0,Y,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1


In [12]:
# encode Loan_Status
loan_status = {'N': 0, 'Y': 1}
loan_dummies = loan_dummies.replace(loan_status)
loan_dummies.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,...,Education_Not Graduate,Gender_Female,Gender_Male,Married_No,Married_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Self_Employed_No,Self_Employed_Yes
1,4583,1508.0,128.0,360.0,1.0,0,0,1,0,0,...,0,0,1,0,1,1,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,...,0,0,1,0,1,0,0,1,0,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,...,1,0,1,0,1,0,0,1,1,0
4,6000,0.0,141.0,360.0,1.0,1,1,0,0,0,...,0,0,1,1,0,0,0,1,1,0
5,5417,4196.0,267.0,360.0,1.0,1,0,0,1,0,...,0,0,1,0,1,0,0,1,0,1


**3. Split the data into a training and test set, where the “Loan_Status” column is the target.**

In [13]:
features = loan_dummies.loc[:, loan_dummies.columns != 'Loan_Status']
target = loan_dummies['Loan_Status']

In [14]:
# Split data into training and test data
features_train, features_test, target_train, target_test = train_test_split(features, target, 
                                                                            test_size = 0.2, random_state = 15)

In [15]:
print(f"features_train: {features_train.shape}")
print(f"features_test: {features_test.shape}")
print(f"target_train: {target_train.shape}")
print(f"target_test: {target_test.shape}")

features_train: (384, 20)
features_test: (96, 20)
target_train: (384,)
target_test: (96,)


**4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).**

In [16]:
# Create Min-Max Scaler
standardizer = MinMaxScaler()

In [17]:
# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs = -1)

In [18]:
# Create a pipeline
pipe = Pipeline([('standarizer', standardizer), ('knn', knn)])

**5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.**

In [19]:
# Fit the training data
pipe.fit(features_train, target_train)

Pipeline(steps=[('standarizer', MinMaxScaler()),
                ('knn', KNeighborsClassifier(n_jobs=-1))])

In [20]:
# Get accuracy of test data
pipe.score(features_test, target_test)

0.7395833333333334

**6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).**

In [21]:
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

**7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.**

In [22]:
classifier_knn = GridSearchCV(pipe, search_space, cv = 5, verbose = 0).fit(features_train, target_train)

**8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.**

In [23]:
classifier_knn.score(features_test, target_test)

0.7083333333333334

**9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.**

In [24]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

In [25]:
# Create dictionary with candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression(max_iter = 1000)],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}]

In [26]:
# Create grid search
gridsearch_all = GridSearchCV(pipe, search_space, cv = 5, verbose = 0).fit(features_train, target_train)

50 fits failed out of a total of 145.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Osgiliath\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Osgiliath\anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\Osgiliath\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Osgiliath\anaconda3\lib\site-packages\sklearn\linear_model\_logis

**10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.**

In [27]:
gridsearch_all.best_params_

{'classifier': LogisticRegression(C=7.742636826811269, max_iter=1000),
 'classifier__C': 7.742636826811269,
 'classifier__penalty': 'l2'}

In [28]:
gridsearch_all.score(features_test, target_test)

0.8125

**11. Summarize your results.**

Using a KNN classifier the accuracy score was 0.71 but with a Logistic Regression model the accuracy rose to 0.81. This indicates the Logistic Regression is a better model to correctly predict the loan status.