## Name: Anjani Bonda
## Assignment: Week8 - Model Selection and Hyperparameter Tuning
## Date: July 30, 2022

In [1]:
# Import the required libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

1. Import the dataset and ensure that it loaded properly.

In [2]:
# Import the dataset
df = pd.read_csv("/Users/anjanibonda/Data-Science/DSC550/Week8_Model_Selection_Hyperparameter_Tuning/Loan_Train.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# Check datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [4]:
# The column 'Dependents' is of type object which in reality should be integer. Rest of the columns look good.
# Check the values of Dependents

df['Dependents'].value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [7]:
# Remap the values of '3+' under 'Dependents' column
dep_map = {'0':'None','1':'One','2':'Two','3+':'More Than two'}
df['No_Dependents'] = df['Dependents'].map(dep_map)

In [8]:
df['No_Dependents'].value_counts()

None             345
One              102
Two              101
More Than two     51
Name: No_Dependents, dtype: int64

In [9]:
# We can delete Dependents and rename No_dependents as Dependents
df.drop('Dependents',axis=1,inplace=True)

2. Prepare the data for modeling by performing the following steps:

    Drop the column “Load_ID.”

    Drop any rows with missing data.

    Convert the categorical features into dummy variables.

In [10]:
df.drop('Loan_ID',axis=1,inplace=True)

In [11]:
# Check for missing data
df.isnull().sum()

Gender               13
Married               3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
No_Dependents        15
dtype: int64

In [12]:
df.shape

(614, 12)

In [13]:
# Drop any rows with missing data
df = df.dropna()

In [14]:
df.shape

(480, 12)

In [15]:
# Convert the categorical features into dummy variables.
df = pd.get_dummies(df,drop_first=True)

In [16]:
df.shape

(480, 15)

In [17]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y,No_Dependents_None,No_Dependents_One,No_Dependents_Two
1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0,0,1,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,1,0,1,1,1,0,0
3,2583,2358.0,120.0,360.0,1.0,1,1,1,0,0,1,1,1,0,0
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,1,1,1,0,0
5,5417,4196.0,267.0,360.0,1.0,1,1,0,1,0,1,1,0,0,1


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 480 entries, 1 to 613
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          480 non-null    int64  
 1   CoapplicantIncome        480 non-null    float64
 2   LoanAmount               480 non-null    float64
 3   Loan_Amount_Term         480 non-null    float64
 4   Credit_History           480 non-null    float64
 5   Gender_Male              480 non-null    uint8  
 6   Married_Yes              480 non-null    uint8  
 7   Education_Not Graduate   480 non-null    uint8  
 8   Self_Employed_Yes        480 non-null    uint8  
 9   Property_Area_Semiurban  480 non-null    uint8  
 10  Property_Area_Urban      480 non-null    uint8  
 11  Loan_Status_Y            480 non-null    uint8  
 12  No_Dependents_None       480 non-null    uint8  
 13  No_Dependents_One        480 non-null    uint8  
 14  No_Dependents_Two        4

3. Split the data into a training and test set, where the “Loan_Status” column is the target.

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Prepare the features into X and y variables
X = df.drop('Loan_Status_Y',axis=1)
y = df['Loan_Status_Y']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

4. Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [23]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [24]:
# Instantiate the parameters
scaler = MinMaxScaler()
knn = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
pipe = Pipeline([("scaler",scaler),("knn",knn)])

5. Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model.

In [25]:
# Fit the model
kmodel = pipe.fit(X_train,y_train)

In [26]:
kmodel

In [27]:
y_pred = kmodel.predict(X_test)

In [34]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix

In [35]:
accuracy_score(y_test,y_pred)

0.7708333333333334

6. Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [36]:
search_space = [{"knn__n_neighbors":[1,2,3,4,5,6,7,8,9,10]}]

7. Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [43]:
kmodel_2 = GridSearchCV(pipe,search_space,cv=5,verbose=0).fit(X_train,y_train)

In [44]:
kmodel_2

In [46]:
kmodel_2.best_estimator_

8. Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [47]:
y_pred2 = kmodel_2.predict(X_test)

In [48]:
accuracy_score(y_test,y_pred2)

0.7604166666666666

9. Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [50]:
# Create pipeline
pipe2 = Pipeline([("scaler",MinMaxScaler()),
                  ("classifier",RandomForestClassifier())])

In [51]:
search_space2 = [{"classifier":[LogisticRegression()],
                  "classifier__penalty":['l1','l2'],
                  "classifier__C":np.logspace(0,4,10)},
                 {"classifier":[KNeighborsClassifier()],
                  "classifier__n_neighbors":[1,2,3,4,5,6,7,8,9,10],
                  "classifier__weights":['uniform','distance']},
                 {"classifier":[RandomForestClassifier()],
                  "classifier__n_estimators":[10,100,1000],
                  "classifier__max_features": [1,2,3],
                  "classifier__max_depth":[5,10,None]}]

In [52]:
Clf = GridSearchCV(pipe2,search_space2,cv=5,verbose=0)
Clf.fit(X_train,y_train)

50 fits failed out of a total of 335.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/anjanibonda/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/anjanibonda/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/anjanibonda/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/anjanibonda/opt/anaconda3

10. What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [53]:
Clf.best_estimator_

In [54]:
# Instantiate the parameters

scaler = MinMaxScaler()
Log_pipe = Pipeline([("scaler",scaler),("classifier",LogisticRegression(C=2.7826))])

In [55]:
# Fit the model
BestModel = Log_pipe.fit(X_train,y_train)

In [56]:
y_pred3 = BestModel.predict(X_test)

In [57]:
accuracy_score(y_test,y_pred3)

0.8229166666666666

11. Summarize your results.

Using the K-Nearest Neighbor with n=5 yields accuracy score of 77.08%, when used a gridsearch to search for the optimal number of n, 10 was picked but yielded poor result of 76.04% which is less than that of n=5. 
Expanding the search space to include 2 other algorithms: RandomForest and LogisticRegression, and allowing GridSearch to perform the hyperparameter tuning, Logistic regression model is returned as the best model with C = 2.7826.

Therefore, Fitting logistic regression model on this dataset produced a higher accuracy score of 82.3%