In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv("Classification_Loan_Data.csv")
df.head()

Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
0,1,5849,0.0,,360.0,1.0,Male,No,0,Graduate,No,Urban,Y
1,2,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural,N
2,3,3000,0.0,66.0,360.0,1.0,Male,Yes,0,Graduate,Yes,Urban,Y
3,4,2583,2358.0,120.0,360.0,1.0,Male,Yes,0,Not Graduate,No,Urban,Y
4,5,6000,0.0,141.0,360.0,1.0,Male,No,0,Graduate,No,Urban,Y


In [3]:
df.shape

(614, 13)

#### Missing values

In [4]:
df.isna().sum()

ID                            0
Income_of_Applicant           0
Income_of_Joint_Applicant     0
Loan_Amount_Requirement      22
Loan_Amount_Term             14
Credit_History               50
Gender                       13
Is_Married                    3
No_of_Dependents             15
Level_of_Education            0
IS_Self_Employed             32
Area_of_Property              0
Loan_Status                   0
dtype: int64

In [5]:
# Dropping the missing values for column 'Is_Married' as there are only 3 missing values
df.dropna(subset=['Is_Married'], inplace = True)

In [6]:
df.Loan_Status.value_counts()

Y    419
N    192
Name: Loan_Status, dtype: int64

#### Here we see that there is significant imbalance between Class 'Y' and Class 'X'
#### Therefore, we will need to undersample the Majority class to 340 and also oversample the Minority class to 260

In [7]:
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

#### Using undersample ratio of (192/340)

In [8]:
under_sampler = RandomUnderSampler(sampling_strategy = (192/340))

In [9]:
# Segregating into Predictor(X) variables and Target(y) variables
X = df.iloc[:, 0:12]
y = df.iloc[:, 12]
X, y

(      ID  Income_of_Applicant  Income_of_Joint_Applicant  \
 0      1                 5849                        0.0   
 1      2                 4583                     1508.0   
 2      3                 3000                        0.0   
 3      4                 2583                     2358.0   
 4      5                 6000                        0.0   
 ..   ...                  ...                        ...   
 609  610                 2900                        0.0   
 610  611                 4106                        0.0   
 611  612                 8072                      240.0   
 612  613                 7583                        0.0   
 613  614                 4583                        0.0   
 
      Loan_Amount_Requirement  Loan_Amount_Term  Credit_History  Gender  \
 0                        NaN             360.0             1.0    Male   
 1                      128.0             360.0             1.0    Male   
 2                       66.0            

In [10]:
# Value distribution of Target class after Undersampling

X, y = under_sampler.fit_resample(X, y)
print(y.value_counts())
print('---------------')
X.head()

Y    340
N    192
Name: Loan_Status, dtype: int64
---------------


Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property
0,2,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural
1,8,3036,2504.0,158.0,360.0,0.0,Male,Yes,3+,Graduate,No,Semiurban
2,10,12841,10968.0,349.0,360.0,1.0,Male,Yes,1,Graduate,No,Semiurban
3,14,1853,2840.0,114.0,360.0,1.0,Male,No,0,Graduate,No,Rural
4,18,3510,0.0,76.0,360.0,0.0,Female,No,0,Graduate,No,Urban


#### Using oversample ratio of (260/340)

In [11]:
over_sampler = RandomOverSampler(sampling_strategy = (260/340))

In [12]:
# Value distribution of Target class after Oversampling

X, y = over_sampler.fit_resample(X, y)
print(X.shape)
print('-----')
print(y.shape)
print('-----')
print(y.value_counts())
print('-----')
X.head()

(600, 12)
-----
(600,)
-----
Y    340
N    260
Name: Loan_Status, dtype: int64
-----


Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property
0,2,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural
1,8,3036,2504.0,158.0,360.0,0.0,Male,Yes,3+,Graduate,No,Semiurban
2,10,12841,10968.0,349.0,360.0,1.0,Male,Yes,1,Graduate,No,Semiurban
3,14,1853,2840.0,114.0,360.0,1.0,Male,No,0,Graduate,No,Rural
4,18,3510,0.0,76.0,360.0,0.0,Female,No,0,Graduate,No,Urban


In [13]:
X['Loan_Status'] = y
df = X.copy(deep = True)

In [14]:
df.head(15)

Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
0,2,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural,N
1,8,3036,2504.0,158.0,360.0,0.0,Male,Yes,3+,Graduate,No,Semiurban,N
2,10,12841,10968.0,349.0,360.0,1.0,Male,Yes,1,Graduate,No,Semiurban,N
3,14,1853,2840.0,114.0,360.0,1.0,Male,No,0,Graduate,No,Rural,N
4,18,3510,0.0,76.0,360.0,0.0,Female,No,0,Graduate,No,Urban,N
5,19,4887,0.0,133.0,360.0,1.0,Male,Yes,0,Not Graduate,No,Rural,N
6,21,7660,0.0,104.0,360.0,0.0,Male,Yes,0,Not Graduate,No,Urban,N
7,23,2600,1911.0,116.0,360.0,0.0,Male,Yes,0,Not Graduate,No,Semiurban,N
8,24,3365,1917.0,112.0,360.0,0.0,,Yes,2,Not Graduate,No,Rural,N
9,25,3717,2925.0,151.0,360.0,,Male,Yes,1,Graduate,,Semiurban,N


## Missing Value Handling
### Methods :-
#### 1. Imputation
####     1. a. Categorical values --> Mode of column
####     1. b. Numerical values --> Mean of column
#### 2. KNN Imputer (Numerical columns)
####     2. a. K=1
####     2. b. K=2
####     2. c. K=3

In [15]:
print(df.isna().sum())
df.head()

ID                            0
Income_of_Applicant           0
Income_of_Joint_Applicant     0
Loan_Amount_Requirement      24
Loan_Amount_Term             16
Credit_History               44
Gender                       13
Is_Married                    0
No_of_Dependents             13
Level_of_Education            0
IS_Self_Employed             34
Area_of_Property              0
Loan_Status                   0
dtype: int64


Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Loan_Amount_Term,Credit_History,Gender,Is_Married,No_of_Dependents,Level_of_Education,IS_Self_Employed,Area_of_Property,Loan_Status
0,2,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural,N
1,8,3036,2504.0,158.0,360.0,0.0,Male,Yes,3+,Graduate,No,Semiurban,N
2,10,12841,10968.0,349.0,360.0,1.0,Male,Yes,1,Graduate,No,Semiurban,N
3,14,1853,2840.0,114.0,360.0,1.0,Male,No,0,Graduate,No,Rural,N
4,18,3510,0.0,76.0,360.0,0.0,Female,No,0,Graduate,No,Urban,N


### Imputation with 'Mode' and 'Mean'

In [16]:
from sklearn.impute import SimpleImputer
si_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean')
si_mode = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

In [17]:
# This will find the mode for each column in the dataframe
df.mode().iloc[0]

ID                               255.0
Income_of_Applicant               2500
Income_of_Joint_Applicant          0.0
Loan_Amount_Requirement          110.0
Loan_Amount_Term                 360.0
Credit_History                     1.0
Gender                            Male
Is_Married                         Yes
No_of_Dependents                     0
Level_of_Education            Graduate
IS_Self_Employed                    No
Area_of_Property             Semiurban
Loan_Status                          Y
Name: 0, dtype: object

In [18]:
si_data = df.copy(deep = True)
cat_cols = ['Loan_Amount_Term', 'Credit_History', 'Gender', 'No_of_Dependents', 'IS_Self_Employed', 'Level_of_Education', 'Area_of_Property']
si_data[cat_cols] = si_mode.fit_transform(si_data[cat_cols])
knn_data = si_data.copy(deep = True)
num_cols = ['Loan_Amount_Requirement']
si_data[num_cols] = si_mean.fit_transform(si_data[num_cols])

In [19]:
print(si_data.shape)
si_data.isna().sum()

(600, 13)


ID                           0
Income_of_Applicant          0
Income_of_Joint_Applicant    0
Loan_Amount_Requirement      0
Loan_Amount_Term             0
Credit_History               0
Gender                       0
Is_Married                   0
No_of_Dependents             0
Level_of_Education           0
IS_Self_Employed             0
Area_of_Property             0
Loan_Status                  0
dtype: int64

### KNN imputation for K = 1,2,3 for all numerical columns

In [20]:
from sklearn.impute import KNNImputer

knn1_data = knn_data.copy(deep = True)
knn2_data = knn_data.copy(deep = True)
knn3_data = knn_data.copy(deep = True)

# for K = 1
knn1_data[num_cols] = KNNImputer(n_neighbors = 1).fit_transform(knn_data[num_cols])
print(knn1_data.shape)

# for K = 2
knn2_data[num_cols] = KNNImputer(n_neighbors = 2).fit_transform(knn_data[num_cols])
print(knn2_data.shape)

# for K = 3
knn3_data[num_cols] = KNNImputer(n_neighbors = 3).fit_transform(knn_data[num_cols])
print(knn3_data.shape)

(600, 13)
(600, 13)
(600, 13)


In [21]:
# Missing values after KNN imputation

print(knn1_data.isna().sum())
print('-----')
print(knn2_data.isna().sum())
print('-----')
print(knn3_data.isna().sum())

ID                           0
Income_of_Applicant          0
Income_of_Joint_Applicant    0
Loan_Amount_Requirement      0
Loan_Amount_Term             0
Credit_History               0
Gender                       0
Is_Married                   0
No_of_Dependents             0
Level_of_Education           0
IS_Self_Employed             0
Area_of_Property             0
Loan_Status                  0
dtype: int64
-----
ID                           0
Income_of_Applicant          0
Income_of_Joint_Applicant    0
Loan_Amount_Requirement      0
Loan_Amount_Term             0
Credit_History               0
Gender                       0
Is_Married                   0
No_of_Dependents             0
Level_of_Education           0
IS_Self_Employed             0
Area_of_Property             0
Loan_Status                  0
dtype: int64
-----
ID                           0
Income_of_Applicant          0
Income_of_Joint_Applicant    0
Loan_Amount_Requirement      0
Loan_Amount_Term             0
C

## For categorical variables
### Method :- One-hot encoding

In [22]:
cat_cols

['Loan_Amount_Term',
 'Credit_History',
 'Gender',
 'No_of_Dependents',
 'IS_Self_Employed',
 'Level_of_Education',
 'Area_of_Property']

In [23]:
si_data = pd.get_dummies(data = si_data, columns = cat_cols)
si_data

  uniques = Index(uniques)


Unnamed: 0,ID,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Is_Married,Loan_Status,Loan_Amount_Term_12.0,Loan_Amount_Term_36.0,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,...,No_of_Dependents_1,No_of_Dependents_2,No_of_Dependents_3+,IS_Self_Employed_No,IS_Self_Employed_Yes,Level_of_Education_Graduate,Level_of_Education_Not Graduate,Area_of_Property_Rural,Area_of_Property_Semiurban,Area_of_Property_Urban
0,2,4583,1508.0,128.000000,Yes,N,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
1,8,3036,2504.0,158.000000,Yes,N,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0
2,10,12841,10968.0,349.000000,Yes,N,0,0,0,0,...,1,0,0,1,0,1,0,0,1,0
3,14,1853,2840.0,114.000000,No,N,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,18,3510,0.0,76.000000,No,N,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,601,416,41667.0,350.000000,No,N,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1
596,582,1836,33837.0,90.000000,No,N,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1
597,525,4680,2087.0,147.366319,No,N,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
598,519,4683,1915.0,185.000000,No,N,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0


In [24]:
knn1_data = pd.get_dummies(data = knn1_data, columns = cat_cols)
knn2_data = pd.get_dummies(data = knn2_data, columns = cat_cols)
knn3_data = pd.get_dummies(data = knn3_data, columns = cat_cols)

  uniques = Index(uniques)
  uniques = Index(uniques)
  uniques = Index(uniques)


In [25]:
# Dropping the ID column as it is of no relevance whatsoever

si_data.drop(['ID'], axis = 1, inplace = True)
knn1_data.drop(['ID'], axis = 1, inplace = True)
knn2_data.drop(['ID'], axis = 1, inplace = True)
knn3_data.drop(['ID'], axis = 1, inplace = True)

In [26]:
# Converting target variable into binary outputs
si_data['Loan_Status'] = si_data['Loan_Status'].apply(lambda x: 0 if x=='N' else 1)
si_data['Loan_Status'].value_counts()

1    340
0    260
Name: Loan_Status, dtype: int64

In [27]:
si_data.head()

Unnamed: 0,Income_of_Applicant,Income_of_Joint_Applicant,Loan_Amount_Requirement,Is_Married,Loan_Status,Loan_Amount_Term_12.0,Loan_Amount_Term_36.0,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,Loan_Amount_Term_120.0,...,No_of_Dependents_1,No_of_Dependents_2,No_of_Dependents_3+,IS_Self_Employed_No,IS_Self_Employed_Yes,Level_of_Education_Graduate,Level_of_Education_Not Graduate,Area_of_Property_Rural,Area_of_Property_Semiurban,Area_of_Property_Urban
0,4583,1508.0,128.0,Yes,0,0,0,0,0,0,...,1,0,0,1,0,1,0,1,0,0
1,3036,2504.0,158.0,Yes,0,0,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0
2,12841,10968.0,349.0,Yes,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,1,0
3,1853,2840.0,114.0,No,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
4,3510,0.0,76.0,No,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


In [28]:
knn1_data['Loan_Status'] = knn1_data['Loan_Status'].apply(lambda x: 0 if x=='N' else 1)
knn2_data['Loan_Status'] = knn2_data['Loan_Status'].apply(lambda x: 0 if x=='N' else 1)
knn3_data['Loan_Status'] = knn3_data['Loan_Status'].apply(lambda x: 0 if x=='N' else 1)
print(knn1_data.head())
print(knn2_data.head())
print(knn3_data.head())

   Income_of_Applicant  Income_of_Joint_Applicant  Loan_Amount_Requirement  \
0                 4583                     1508.0                    128.0   
1                 3036                     2504.0                    158.0   
2                12841                    10968.0                    349.0   
3                 1853                     2840.0                    114.0   
4                 3510                        0.0                     76.0   

  Is_Married  Loan_Status  Loan_Amount_Term_12.0  Loan_Amount_Term_36.0  \
0        Yes            0                      0                      0   
1        Yes            0                      0                      0   
2        Yes            0                      0                      0   
3         No            0                      0                      0   
4         No            0                      0                      0   

   Loan_Amount_Term_60.0  Loan_Amount_Term_84.0  Loan_Amount_Term_120.0  ...  \


## Train and Test Split (80:20)

In [29]:
X_si_data = si_data.drop(['Loan_Status'], axis = 1)
y_si_data = si_data['Loan_Status']
X_train_si_data, X_test_si_data, y_train_si_data, y_test_si_data = train_test_split(X_si_data, y_si_data, test_size=0.2, random_state=9)

In [30]:
X_knn1 = knn1_data.drop(['Loan_Status'], axis = 1)
y_knn1 = knn1_data['Loan_Status']
X_train_knn1, X_test_knn1, y_train_knn1, y_test_knn1 = train_test_split(X_knn1, y_knn1, test_size=0.2, random_state=42)

In [31]:
X_knn2 = knn2_data.drop(['Loan_Status'], axis = 1)
y_knn2 = knn2_data['Loan_Status']
X_train_knn2, X_test_knn2, y_train_knn2, y_test_knn2 = train_test_split(X_knn2, y_knn2, test_size=0.2, random_state=21)

In [32]:
X_knn3 = knn3_data.drop(['Loan_Status'], axis = 1)
y_knn3 = knn3_data['Loan_Status']
X_train_knn3, X_test_knn3, y_train_knn3, y_test_knn3 = train_test_split(X_knn3, y_knn3, test_size=0.2, random_state=66)

## Decision Tree Model

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [34]:
# Dataframe for storing metrics for different models

df_results = pd.DataFrame(columns = ['Model', 'Accuracy Score', 'Overall F1 Score'])
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score


In [35]:
dec_tree = DecisionTreeClassifier()
# Conver 'Is_Married' column to binary class values i.e. 0 or 1
X_train_si_data['Is_Married'] = X_train_si_data['Is_Married'].apply(lambda x: 0 if x=='N' else 1)
X_test_si_data['Is_Married'] = X_test_si_data['Is_Married'].apply(lambda x: 0 if x=='N' else 1)
# X_train_si_data, X_test_si_data, y_train_si_data, y_test_si_data
dec_tree_model = dec_tree.fit(X_train_si_data, y_train_si_data)
y_si_data_pred = dec_tree_model.predict(X_test_si_data)
acc_si_data = accuracy_score(y_test_si_data, y_si_data_pred)
f1_si_data = f1_score(y_test_si_data, y_si_data_pred)
print(classification_report(y_test_si_data, y_si_data_pred))

              precision    recall  f1-score   support

           0       0.84      0.83      0.84        65
           1       0.80      0.82      0.81        55

    accuracy                           0.82       120
   macro avg       0.82      0.82      0.82       120
weighted avg       0.83      0.82      0.83       120



In [36]:
df_results.loc[len(df_results.index)] = ['Decision Tree - Simple imputed', acc_si_data, f1_si_data]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811


In [37]:
# Conver 'Is_Married' column to binary class values i.e. 0 or 1
X_train_knn1['Is_Married'] = X_train_knn1['Is_Married'].apply(lambda x: 0 if x=='N' else 1)
X_test_knn1['Is_Married'] = X_test_knn1['Is_Married'].apply(lambda x: 0 if x=='N' else 1)

dec_tree_model_knn1 = dec_tree.fit(X_train_knn1, y_train_knn1)
y_knn1_pred = dec_tree_model_knn1.predict(X_test_knn1)
acc_knn1 = accuracy_score(y_test_knn1, y_knn1_pred)
f1_knn1 = f1_score(y_test_knn1, y_knn1_pred)
print(classification_report(y_test_knn1, y_knn1_pred))

              precision    recall  f1-score   support

           0       0.70      0.76      0.73        62
           1       0.72      0.66      0.68        58

    accuracy                           0.71       120
   macro avg       0.71      0.71      0.71       120
weighted avg       0.71      0.71      0.71       120



In [38]:
df_results.loc[len(df_results.index)] = ['Decision Tree - KNN (k=1)', acc_knn1, f1_knn1]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685


In [39]:
# Conver 'Is_Married' column to binary class values i.e. 0 or 1
X_train_knn2['Is_Married'] = X_train_knn2['Is_Married'].apply(lambda x: 0 if x=='N' else 1)
X_test_knn2['Is_Married'] = X_test_knn2['Is_Married'].apply(lambda x: 0 if x=='N' else 1)

dec_tree_model_knn2 = dec_tree.fit(X_train_knn2, y_train_knn2)
y_knn2_pred = dec_tree_model_knn2.predict(X_test_knn2)
acc_knn2 = accuracy_score(y_test_knn2, y_knn2_pred)
f1_knn2 = f1_score(y_test_knn2, y_knn2_pred)
print(classification_report(y_test_knn2, y_knn2_pred))

              precision    recall  f1-score   support

           0       0.77      0.61      0.68        61
           1       0.67      0.81      0.73        59

    accuracy                           0.71       120
   macro avg       0.72      0.71      0.71       120
weighted avg       0.72      0.71      0.71       120



In [40]:
df_results.loc[len(df_results.index)] = ['Decision Tree - KNN (k=2)', acc_knn2, f1_knn2]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824


In [41]:
# Conver 'Is_Married' column to binary class values i.e. 0 or 1
X_train_knn3['Is_Married'] = X_train_knn3['Is_Married'].apply(lambda x: 0 if x=='N' else 1)
X_test_knn3['Is_Married'] = X_test_knn3['Is_Married'].apply(lambda x: 0 if x=='N' else 1)

dec_tree_model_knn3 = dec_tree.fit(X_train_knn3, y_train_knn3)
y_knn3_pred = dec_tree_model_knn3.predict(X_test_knn3)
acc_knn3 = accuracy_score(y_test_knn3, y_knn3_pred)
f1_knn3 = f1_score(y_test_knn3, y_knn3_pred)
print(classification_report(y_test_knn3, y_knn3_pred))

              precision    recall  f1-score   support

           0       0.63      0.64      0.63        50
           1       0.74      0.73      0.73        70

    accuracy                           0.69       120
   macro avg       0.68      0.68      0.68       120
weighted avg       0.69      0.69      0.69       120



In [42]:
df_results.loc[len(df_results.index)] = ['Decision Tree - KNN (k=3)', acc_knn3, f1_knn3]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813


## Logistic Regression Model

In [43]:
lr = LogisticRegression(max_iter = 7600)

In [44]:
lr_si = lr.fit(X_train_si_data, y_train_si_data)
y_pred_si = lr_si.predict(X_test_si_data)
acc_lr_si = accuracy_score(y_test_si_data, y_pred_si)
f1_lr_si = f1_score(y_test_si_data, y_pred_si)
print(classification_report(y_test_si_data, y_pred_si))

              precision    recall  f1-score   support

           0       0.94      0.52      0.67        65
           1       0.63      0.96      0.76        55

    accuracy                           0.73       120
   macro avg       0.79      0.74      0.72       120
weighted avg       0.80      0.72      0.71       120



In [45]:
df_results.loc[len(df_results.index)] = ['Logistic Regression - Simple imputed', acc_lr_si, f1_lr_si]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259


In [46]:
lr_knn1 = lr.fit(X_train_knn1, y_train_knn1)
y_pred_knn1 = lr_knn1.predict(X_test_knn1)
acc_lr_knn1 = accuracy_score(y_test_knn1, y_pred_knn1)
f1_lr_knn1 = f1_score(y_test_knn1, y_pred_knn1)
print(classification_report(y_test_knn1, y_pred_knn1))

              precision    recall  f1-score   support

           0       0.96      0.42      0.58        62
           1       0.61      0.98      0.75        58

    accuracy                           0.69       120
   macro avg       0.79      0.70      0.67       120
weighted avg       0.79      0.69      0.67       120



In [47]:
df_results.loc[len(df_results.index)] = ['Logistic Regression - KNN (k=1)', acc_lr_knn1, f1_lr_knn1]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967


In [48]:
lr_knn2 = lr.fit(X_train_knn2, y_train_knn2)
y_pred_knn2 = lr_knn2.predict(X_test_knn2)
acc_lr_knn2 = accuracy_score(y_test_knn2, y_pred_knn2)
f1_lr_knn2 = f1_score(y_test_knn2, y_pred_knn2)
print(classification_report(y_test_knn2, y_pred_knn2))

              precision    recall  f1-score   support

           0       0.81      0.48      0.60        61
           1       0.62      0.88      0.73        59

    accuracy                           0.68       120
   macro avg       0.71      0.68      0.66       120
weighted avg       0.71      0.68      0.66       120



In [49]:
df_results.loc[len(df_results.index)] = ['Logistic Regression - KNN (k=2)', acc_lr_knn2, f1_lr_knn2]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273


In [50]:
lr_knn3 = lr.fit(X_train_knn3, y_train_knn3)
y_pred_knn3 = lr_knn3.predict(X_test_knn3)
acc_lr_knn3 = accuracy_score(y_test_knn3, y_pred_knn3)
f1_lr_knn3 = f1_score(y_test_knn3, y_pred_knn3)
print(classification_report(y_test_knn3, y_pred_knn3))

              precision    recall  f1-score   support

           0       0.81      0.42      0.55        50
           1       0.69      0.93      0.79        70

    accuracy                           0.72       120
   macro avg       0.75      0.67      0.67       120
weighted avg       0.74      0.72      0.69       120



In [51]:
df_results.loc[len(df_results.index)] = ['Logistic Regression - KNN (k=3)', acc_lr_knn3, f1_lr_knn3]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683


## KNN Model

In [52]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5)

In [53]:
knn_si = knn.fit(X_train_si_data, y_train_si_data)
y_pred_knn_si = knn_si.predict(X_test_si_data)
acc_knn_si = accuracy_score(y_test_si_data, y_pred_knn_si)
f1_knn_si = f1_score(y_test_si_data, y_pred_knn_si)
print(classification_report(y_test_si_data, y_pred_knn_si))

              precision    recall  f1-score   support

           0       0.62      0.48      0.54        65
           1       0.51      0.65      0.58        55

    accuracy                           0.56       120
   macro avg       0.57      0.57      0.56       120
weighted avg       0.57      0.56      0.56       120



In [54]:
df_results.loc[len(df_results.index)] = ['KNN k=5 - Simple imputed', acc_knn_si, f1_knn_si]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576


In [55]:
knn_knn1 = knn.fit(X_train_knn1, y_train_knn1)
y_pred_knn_knn1 = knn_knn1.predict(X_test_knn1)
acc_knn_knn1 = accuracy_score(y_test_knn1, y_pred_knn_knn1)
f1_knn_knn1 = f1_score(y_test_knn1, y_pred_knn_knn1)
print(classification_report(y_test_knn1, y_pred_knn_knn1))

              precision    recall  f1-score   support

           0       0.62      0.47      0.53        62
           1       0.55      0.69      0.61        58

    accuracy                           0.57       120
   macro avg       0.58      0.58      0.57       120
weighted avg       0.58      0.57      0.57       120



In [56]:
df_results.loc[len(df_results.index)] = ['KNN k=5 - KNN (k=1)', acc_knn_knn1, f1_knn_knn1]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [57]:
knn_knn2 = knn.fit(X_train_knn2, y_train_knn2)
y_pred_knn_knn2 = knn_knn2.predict(X_test_knn2)
acc_knn_knn2 = accuracy_score(y_test_knn2, y_pred_knn_knn2)
f1_knn_knn2 = f1_score(y_test_knn2, y_pred_knn_knn2)
print(classification_report(y_test_knn2, y_pred_knn_knn2))

              precision    recall  f1-score   support

           0       0.57      0.33      0.42        61
           1       0.52      0.75      0.61        59

    accuracy                           0.53       120
   macro avg       0.54      0.54      0.51       120
weighted avg       0.54      0.53      0.51       120



In [58]:
df_results.loc[len(df_results.index)] = ['KNN k=5 - KNN (k=2)', acc_knn_knn2, f1_knn_knn2]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [59]:
knn_knn3 = knn.fit(X_train_knn3, y_train_knn3)
y_pred_knn_knn3 = knn_knn3.predict(X_test_knn3)
acc_knn_knn3 = accuracy_score(y_test_knn3, y_pred_knn_knn3)
f1_knn_knn3 = f1_score(y_test_knn3, y_pred_knn_knn3)
print(classification_report(y_test_knn3, y_pred_knn_knn3))

              precision    recall  f1-score   support

           0       0.56      0.60      0.58        50
           1       0.70      0.66      0.68        70

    accuracy                           0.63       120
   macro avg       0.63      0.63      0.63       120
weighted avg       0.64      0.63      0.63       120



In [60]:
df_results.loc[len(df_results.index)] = ['KNN k=5 - KNN (k=3)', acc_knn_knn3, f1_knn_knn3]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


## SVM Model

In [61]:
svc = SVC()

In [62]:
svm_si = svc.fit(X_train_si_data, y_train_si_data)
y_pred_svc_si = svm_si.predict(X_test_si_data)
acc_svc_si = accuracy_score(y_test_si_data, y_pred_svc_si)
f1_svc_si = f1_score(y_test_si_data, y_pred_svc_si)
print(classification_report(y_test_si_data, y_pred_svc_si))

              precision    recall  f1-score   support

           0       0.80      0.06      0.11        65
           1       0.47      0.98      0.64        55

    accuracy                           0.48       120
   macro avg       0.63      0.52      0.37       120
weighted avg       0.65      0.48      0.35       120



In [63]:
df_results.loc[len(df_results.index)] = ['SVM - Simple imputed', acc_svc_si, f1_svc_si]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [64]:
svm_knn1 = svc.fit(X_train_knn1, y_train_knn1)
y_pred_svc_knn1 = svm_knn1.predict(X_test_knn1)
acc_svc_knn1 = accuracy_score(y_test_knn1, y_pred_svc_knn1)
f1_svc_knn1 = f1_score(y_test_knn1, y_pred_svc_knn1)
print(classification_report(y_test_knn1, y_pred_svc_knn1))

              precision    recall  f1-score   support

           0       0.80      0.06      0.12        62
           1       0.50      0.98      0.66        58

    accuracy                           0.51       120
   macro avg       0.65      0.52      0.39       120
weighted avg       0.65      0.51      0.38       120



In [65]:
df_results.loc[len(df_results.index)] = ['SVM - KNN (k=1)', acc_svc_knn1, f1_svc_knn1]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [66]:
svm_knn2 = svc.fit(X_train_knn2, y_train_knn2)
y_pred_svc_knn2 = svm_knn2.predict(X_test_knn2)
acc_svc_knn2 = accuracy_score(y_test_knn2, y_pred_svc_knn2)
f1_svc_knn2 = f1_score(y_test_knn2, y_pred_svc_knn2)
print(classification_report(y_test_knn2, y_pred_svc_knn2))

              precision    recall  f1-score   support

           0       0.67      0.03      0.06        61
           1       0.50      0.98      0.66        59

    accuracy                           0.50       120
   macro avg       0.58      0.51      0.36       120
weighted avg       0.58      0.50      0.36       120



In [67]:
df_results.loc[len(df_results.index)] = ['SVM - KNN (k=2)', acc_svc_knn2, f1_svc_knn2]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [68]:
svm_knn3 = svc.fit(X_train_knn3, y_train_knn3)
y_pred_svc_knn3 = svm_knn3.predict(X_test_knn3)
acc_svc_knn3 = accuracy_score(y_test_knn3, y_pred_svc_knn3)
f1_svc_knn3 = f1_score(y_test_knn3, y_pred_svc_knn3)
print(classification_report(y_test_knn3, y_pred_svc_knn3))

              precision    recall  f1-score   support

           0       0.50      0.02      0.04        50
           1       0.58      0.99      0.73        70

    accuracy                           0.58       120
   macro avg       0.54      0.50      0.39       120
weighted avg       0.55      0.58      0.44       120



In [69]:
df_results.loc[len(df_results.index)] = ['SVM - KNN (k=3)', acc_svc_knn3, f1_svc_knn3]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


## Gradient Scores - Visual for :-
### 1. Decision Tree
### 2. Logistic Regression
### 3. KNN Model
### 4. SVM

In [70]:
df_results.style.background_gradient(cmap ="crest").set_properties(**{'font-size': '15px'})

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


## Decision Tree Classifier with Simple Imputed Data has the maximum 'Accuracy Score' as well as the maximum 'Overall F1 Score'.
### Accuracy Score = 82.5%
### Overall F1 Score = 81.08%

### ***For this classification problem, the KNN model would NOT be a good choice.
### ***Decision Tree and Logistic Regression give significantly better results than KNN or SVM

In [71]:
rf = RandomForestClassifier()

## Random Forest Classifier
### Along with Hyper Parameter Tuning

In [76]:
param_grid = {"n_estimators":[10,50,100], "criterion":["gini","entropy"], "max_features":["auto","sqrt","log2"], "random_state":[4], "class_weight":["balanced"]}
rf_model = GridSearchCV(rf, param_grid, cv=5, scoring="f1")
rf_model.fit(X_train_si_data,y_train_si_data)
y_pred_rf = rf_model.predict(X_test_si_data)
print(classification_report(y_test_si_data,y_pred_rf))
print(rf_model.best_params_)

              precision    recall  f1-score   support

           0       0.91      0.75      0.82        65
           1       0.76      0.91      0.83        55

    accuracy                           0.82       120
   macro avg       0.83      0.83      0.82       120
weighted avg       0.84      0.82      0.82       120

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 100, 'random_state': 4}


In [77]:
param_grid = {"n_estimators":[70, 100, 130], "criterion":["gini","entropy"], "max_features":["auto", "sqrt", "log2"], "random_state":[4], "class_weight":["balanced"]}
rf_model = GridSearchCV(rf, param_grid, cv=5, scoring="f1")
rf_model.fit(X_train_si_data,y_train_si_data)
y_pred_rf = rf_model.predict(X_test_si_data)
print(classification_report(y_test_si_data,y_pred_rf))
print(rf_model.best_params_)

              precision    recall  f1-score   support

           0       0.91      0.75      0.82        65
           1       0.76      0.91      0.83        55

    accuracy                           0.82       120
   macro avg       0.83      0.83      0.82       120
weighted avg       0.84      0.82      0.82       120

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 130, 'random_state': 4}


In [78]:
param_grid = {"n_estimators":[100, 125, 150], "criterion":["gini","entropy"], "max_features":["auto", "sqrt", "log2"], "random_state":[4], "class_weight":["balanced"]}
rf_model = GridSearchCV(rf, param_grid, cv=5, scoring="f1")
rf_model.fit(X_train_si_data,y_train_si_data)
y_pred_rf = rf_model.predict(X_test_si_data)
print(classification_report(y_test_si_data,y_pred_rf))
print(rf_model.best_params_)

              precision    recall  f1-score   support

           0       0.91      0.75      0.82        65
           1       0.76      0.91      0.83        55

    accuracy                           0.82       120
   macro avg       0.83      0.83      0.82       120
weighted avg       0.84      0.82      0.82       120

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 125, 'random_state': 4}


In [79]:
param_grid = {"n_estimators":[115, 125, 135], "criterion":["gini","entropy"], "max_features":["auto", "sqrt", "log2"], "random_state":[4], "class_weight":["balanced"]}
rf_model = GridSearchCV(rf, param_grid, cv=5, scoring="f1")
rf_model.fit(X_train_si_data,y_train_si_data)
y_pred_rf = rf_model.predict(X_test_si_data)
print(classification_report(y_test_si_data,y_pred_rf))
print(rf_model.best_params_)


              precision    recall  f1-score   support

           0       0.91      0.75      0.82        65
           1       0.76      0.91      0.83        55

    accuracy                           0.82       120
   macro avg       0.83      0.83      0.82       120
weighted avg       0.84      0.82      0.82       120

{'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 125, 'random_state': 4}


In [80]:
acc_rf_si = accuracy_score(y_test_si_data, y_pred_rf)
f1_rf_si = f1_score(y_test_si_data, y_pred_rf)
acc_rf_si, f1_rf_si

(0.825, 0.8264462809917356)

In [81]:
df_results.loc[len(df_results.index)] = ['Random Forest', acc_rf_si, f1_rf_si]
df_results

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


In [82]:
df_results.style.background_gradient(cmap ="crest").set_properties(**{'font-size': '15px'})

Unnamed: 0,Model,Accuracy Score,Overall F1 Score
0,Decision Tree - Simple imputed,0.825,0.810811
1,Decision Tree - KNN (k=1),0.708333,0.684685
2,Decision Tree - KNN (k=2),0.708333,0.732824
3,Decision Tree - KNN (k=3),0.691667,0.733813
4,Logistic Regression - Simple imputed,0.725,0.76259
5,Logistic Regression - KNN (k=1),0.691667,0.754967
6,Logistic Regression - KNN (k=2),0.675,0.727273
7,Logistic Regression - KNN (k=3),0.716667,0.792683
8,KNN k=5 - Simple imputed,0.558333,0.576
9,KNN k=5 - KNN (k=1),0.575,0.610687


## After implementing Random Forest classifier with hyper parameter tuning and Cross-Validation

### 1. We can see that both Decision Tree and Random Forest give the same accuracy score of 82.5%
### 2. The Overall F1 score is higher for Random Forest at 82.64% so, we would recommend to use Random Forest Classifier.
### 3. Although both Decision Tree and Random Forest models are good alternatives as compared to Logistic Regression, KNN and SVM.