In [37]:
#load libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
from sklearn import preprocessing, datasets
labelencoder = LabelEncoder()
label_encoder = preprocessing.LabelEncoder()
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

Import the dataset and ensure that it loaded properly.

In [38]:
#adding data to a dataframe
df = pd.read_csv("Loan_Train.csv")

In [39]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Prepare the data for modeling by performing the following steps:
Drop the column “Loan_ID.”
Drop any rows with missing data.
Convert the categorical features into dummy variables.

In [40]:
#dropping the loan_id column
df = df.drop('Loan_ID', axis=1)

In [41]:
#looking at columns with missing data
df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [42]:
#removing any null values
df = df.dropna(axis = 0, how ='any')
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [43]:
df['Loan_Status'] =  label_encoder.fit_transform(df['Loan_Status'])

In [44]:
#keeping a list of the column names
df_col = list(df.columns)

In [45]:
#for loop to transform data to numerical and keep it in the same column
for i in range(len(df_col)):
    df[df_col[i]] = LabelEncoder().fit_transform(df[df_col[i]])

In [46]:
#verifying that all categories are numerical/dummy variables
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,0,0,247,50,76,7,1,0,0
2,1,1,0,0,1,112,0,23,7,1,2,1
3,1,1,0,1,0,74,135,68,7,1,2,1
4,1,0,0,0,0,305,0,89,7,1,2,1
5,1,1,2,0,1,281,196,159,7,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,101,0,26,7,1,0,1
610,1,1,3,0,0,219,0,7,4,1,0,1
611,1,1,1,0,0,344,3,154,7,1,2,1
612,1,1,2,0,0,336,0,126,7,1,2,1


Split the data into a training and test set, where the “Loan_Status” column is the target.

In [47]:
#splitting data into one with just loan_status and one without
x = df.drop(['Loan_Status'], axis=1)
y = df['Loan_Status']

In [48]:
#train/test split with 30/70 split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

Create a pipeline with a min-max scaler and a KNN classifier (see section 15.3 in the Machine Learning with Python Cookbook).

In [49]:
#create knn classifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs = -1)

In [50]:
#create a pipeline
pipe = Pipeline([('scaler', MinMaxScaler()), ("knn", knn)])

Fit a default KNN classifier to the data with this pipeline. Report the model accuracy on the test set. Note: Fitting a pipeline model works just like fitting a regular model

In [51]:
#fitting pipeline model
pipe.fit(x_train, y_train)

In [52]:
#accuracy (0.77)
pipe.score(x_test, y_test)

0.7708333333333334

Create a search space for your KNN classifier where your “n_neighbors” parameter varies from 1 to 10. (see section 15.3 in the Machine Learning with Python Cookbook).

In [53]:
#create space of candidate values
search_space = [{'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10]}]

Fit a grid search with your pipeline, search space, and 5-fold cross-validation to find the best value for the “n_neighbors” parameter.

In [54]:
#create grid search
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=1).fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [55]:
#best neighbor size(k) (10)
classifier.best_estimator_.get_params()["knn__n_neighbors"]

10

Find the accuracy of the grid search best model on the test set. Note: It is possible that this will not be an improvement over the default model, but likely it will be.

In [56]:
#accuracy of grid search (0.79)
classifier.best_score_

0.7947322212467076

Now, repeat steps 6 and 7 with the same pipeline, but expand your search space to include logistic regression and random forest models with the hyperparameter values in section 12.3 of the Machine Learning with Python Cookbook.

In [57]:
# create a dictionary with candidate learning algorithms and their hyperparameters
search_space2 = [{
    "knn": [LogisticRegression()],
    "knn__penalty": ['l2'],
    "knn__C": np.logspace(0, 4, 10)
}, {
    "knn": [RandomForestClassifier()],
    "knn__n_estimators": [10, 100, 1000],
    "knn__max_features": [1, 2, 3]
}, {
    "knn": [KNeighborsClassifier()],
    "knn__n_neighbors": range(1, 11),
    "knn__weights": ['uniform', 'distance']
}]

In [58]:
#create gridsearch
gridsearch = GridSearchCV(pipe, search_space2, cv=5, verbose=0)

What are the best model and hyperparameters found in the grid search? Find the accuracy of this model on the test set.

In [59]:
#fit gridsearch
best_model = gridsearch.fit(x_train, y_train)
best_model

In [60]:
best_model.best_params_

{'knn': RandomForestClassifier(max_features=3),
 'knn__max_features': 3,
 'knn__n_estimators': 100}

In [61]:
#another coding method for best model from grid search
best_model.best_estimator_.get_params()['knn']

In [62]:
#accuracy of this model (0.82)
best_model.best_score_

0.8215978928884986

Summarize your results.

After our train/test split, we started by fititng to a min/max scaler. This will make sure that the data is all within a fixed range and contributes equally to the analysis. Then we found out the knn for the data. The best neighbors number is 10 to determine the class of a data point. Our estimator found out that 10 neighbors would give us our most accurate data, without overfitting or over simplfying the data,  which was 79%. Next we tested 3 different models to find out which one would give us the best accuracy. With this, we found that the random forest classifier was the most accurate one, at 82%. This was done using a five fold cross validation. Random forest ends up being the most accurate model with n_estimators of 1000. This was the highest option given and it means that we have increased performance and makes the predictions more stable, and that it won't overfit the model, but this can make the algorithm slow. The max features is two, which is the number of features to consider for the best split. 