<a href="https://colab.research.google.com/github/bsaha205/Fall_22_PML/blob/main/PML_HW_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploring data

In [41]:
# import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.preprocessing import Normalizer
import warnings
warnings.filterwarnings("ignore")

In [42]:
# data source: https://www.kaggle.com/datasets/vikasukani/loan-eligible-dataset
# note: I have run only Training data
df = pd.read_csv('loan.csv')

In [43]:
# viewing the first few rows of the data
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [44]:
# what's the data size?
df.shape

(614, 13)

In [45]:
# general information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# Missing Value Computation

In [46]:
# which columns have a missing value?
df.isnull().any(axis=0) #axis 0 is rows so those are compressed

Loan_ID              False
Gender                True
Married               True
Dependents            True
Education            False
Self_Employed         True
ApplicantIncome      False
CoapplicantIncome    False
LoanAmount            True
Loan_Amount_Term      True
Credit_History        True
Property_Area        False
Loan_Status          False
dtype: bool

In [47]:
# how many rows have a missing value?
df.isnull().any(axis=1).sum()

134

In [48]:
# what percent of each column is missing?
df_nan = round(100*df.isna().sum()/df.shape[0],2)
index = df_nan.index

pd.DataFrame({ '% NaN': df_nan}, index=index).T

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
% NaN,0.0,2.12,0.49,2.44,0.0,5.21,0.0,0.0,3.58,2.28,8.14,0.0,0.0


In [49]:
# how many rows of each column is missing?
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Gender

We see there are 13 missing values in gender column. We can replace them with the value based on mean ApplicantIncome for each gender like below.

In [50]:
female_mean_income = df.groupby('Gender').mean()['ApplicantIncome']['Female']
male_mean_income = df.groupby('Gender').mean()['ApplicantIncome']['Male']

for i in range(df.shape[0]):
  if pd.isnull(df.at[i,'Gender']):
    replaced_value = 'Male'
    income_diff_with_male = abs(male_mean_income - df.at[i,'ApplicantIncome'])
    income_diff_with_female = abs(female_mean_income - df.at[i,'ApplicantIncome'])
    if(income_diff_with_female < income_diff_with_male):
      replaced_value = 'Female'
    # print("Replacing with", replaced_value)
    df.at[i,'Gender'] = replaced_value
    # df.at(i, 'Gender', replaced_value)

# df.isna().sum()
# df

### Married

We see there are only 3 missing values in Married column. We can replace them with the mode value.

In [51]:
# replace missing Married values with the mode
df.Married.fillna(df.Married.mode()[0], inplace=True)

## Dependents

In [52]:
# replace missing Dependents values with 0
df.Dependents.fillna(0, inplace=True)


## Self_Employed, Credit_History

We can replace these missing values with the value based on mean ApplicantIncome like what we did for gender.

In [53]:
# Self_Employed
no_mean_income = df.groupby('Self_Employed').mean()['ApplicantIncome']['No']
yes_mean_income = df.groupby('Self_Employed').mean()['ApplicantIncome']['Yes']

for i in range(df.shape[0]):
  if pd.isnull(df.at[i,'Self_Employed']):
    replaced_value = 'No'
    income_diff_with_no = abs(no_mean_income - df.at[i,'ApplicantIncome'])
    income_diff_with_yes = abs(yes_mean_income - df.at[i,'ApplicantIncome'])
    if(income_diff_with_yes < income_diff_with_no):
      replaced_value = 'Yes'
    df.at[i,'Self_Employed'] = replaced_value

In [54]:
# Credit_History
zero_mean_income = df.groupby('Credit_History').mean()['ApplicantIncome'][0]
one_mean_income = df.groupby('Credit_History').mean()['ApplicantIncome'][1]

for i in range(df.shape[0]):
  if pd.isnull(df.at[i,'Credit_History']):
    replaced_value = 0
    income_diff_with_zero = abs(zero_mean_income - df.at[i,'ApplicantIncome'])
    income_diff_with_one = abs(one_mean_income - df.at[i,'ApplicantIncome'])
    if(income_diff_with_one < income_diff_with_zero):
      replaced_value = 1
    df.at[i,'Credit_History'] = replaced_value

## LoanAmount, Loan_Amount_Term

We can replace these missing values simply with the mode value.

In [55]:
# replace missing LoanAmount values with the mode
df.LoanAmount.fillna(df.LoanAmount.mode()[0], inplace=True)

In [56]:
# replace missing Loan_Amount_Term values with the mode
df.Loan_Amount_Term.fillna(df.Loan_Amount_Term.mode()[0], inplace=True)

In [57]:
df_nan = round(100*df.isna().sum()/df.shape[0],2)
index = df_nan.index

pd.DataFrame({ '% NaN': df_nan}, index=index).T

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
% NaN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Wow! There is no missing value anymore!

# Make Dataset Ready

Now it is time to make the dataset ready for the model, for example replace all the categorized values with numerical values.

As we don't need Loan_ID for the logistic regression model, we can delete this entire column.

In [58]:
# deleting Loan_ID 
df = df.drop('Loan_ID', axis=1)

In [59]:
# replace male with 0 and female with 1
df['Gender'] = df['Gender'].replace(['Male'], 0)
df['Gender'] = df['Gender'].replace(['Female'], 1)

In [60]:
# replace married with 0 and un-married with 1
df['Married'] = df['Married'].replace(['Yes'], 0)
df['Married'] = df['Married'].replace(['No'], 1)

In [61]:
# replace Dependents of 3+ with 3
df['Dependents'] = df['Dependents'].replace(['3+'], 3)
df['Dependents'] = df['Dependents'].replace(['0'], 0)
df['Dependents'] = df['Dependents'].replace(['1'], 1)
df['Dependents'] = df['Dependents'].replace(['2'], 2)

In [62]:
# replace Education Graduate with 0 and Not Graduate with 1
df['Education'] = df['Education'].replace(['Graduate'], 0)
df['Education'] = df['Education'].replace(['Not Graduate'], 1)

In [63]:
# replace Self_Employed Yes with 0 and No with 1
df['Self_Employed'] = df['Self_Employed'].replace(['Yes'], 0)
df['Self_Employed'] = df['Self_Employed'].replace(['No'], 1)

In [64]:
# replace Property_Area Rural with 0, Semiurban with 1 and Urban with 2
df['Property_Area'] = df['Property_Area'].replace(['Rural'], 0)
df['Property_Area'] = df['Property_Area'].replace(['Semiurban'], 1)
df['Property_Area'] = df['Property_Area'].replace(['Urban'], 2)

In [65]:
# replace Loan_Status N with 0 and Y with 1
df['Loan_Status'] = df['Loan_Status'].replace(['N'], 0)
df['Loan_Status'] = df['Loan_Status'].replace(['Y'], 1)

# Task 1

Decision Tree Classifier Implementation

In [66]:
# independent variables
X = df.drop(['Loan_Status'], axis=1)
# dependent/target variable
y = df.Loan_Status

# divide data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the data between 0 and 1
transformer = Normalizer(norm='max').fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

# get the train data shape
n = X_train.shape[0]
d = X_train.shape[1]
print('n:', n, ' d:', d)

n: 491  d: 11


In [67]:
def get_scores(y_pred, y_test):
    # overall acuracy of the model
    accu_score = accuracy_score(y_pred, y_test)
    # f1 score of the model
    f1_scor = f1_score(y_pred, y_test)
    
    print("accuracy_score: %.4f, f1_score: %.4f" % (accu_score, f1_scor))
    return accu_score, f1_scor

In [73]:
from sklearn import tree

criterion_list = ['gini', 'entropy']
max_depth_list = [2, 4, 8]
min_samples_split_list = [2, 4, 8]
max_features_list = [4, 6, 8, 10]

iter = 1
optimum_criterion = None
optimum_max_depth = None
optimum_min_samples_split = None
optimum_max_features = None
max_accuracy_score = 0
max_f1_score = 0
for criterion in criterion_list:
  for max_depth in max_depth_list:
    for min_samples_split in min_samples_split_list:
      for max_features in max_features_list:
        print('iter:', iter, ', criterion:', criterion, ', max_depth:', max_depth, ', min_samples_split:', min_samples_split, ', max_features:', max_features)
        # Initialize our decision tree object
        classification_tree = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features)

        # Train our decision tree (tree induction + pruning)
        classification_tree = classification_tree.fit(X_train, y_train)

        # Predict
        y_pred = classification_tree.predict(X_test, check_input=True)
        accu_score, f1_scor = get_scores(y_pred, y_test)

        if accu_score > max_accuracy_score:
          max_accuracy_score = accu_score
          max_f1_score = f1_scor
          optimum_criterion = criterion
          optimum_max_depth = max_depth
          optimum_min_samples_split = min_samples_split
          optimum_max_features = max_features

        iter += 1

print('---------------Optimum results and configurations---------------')
print("max_accuracy_score: %.4f, max_f1_score: %.4f" % (max_accuracy_score, max_f1_score))
print('optimum_criterion:', optimum_criterion, ', optimum_max_depth:', optimum_max_depth, ', optimum_min_samples_split:', optimum_min_samples_split, ', optimum_max_features:', optimum_max_features)

iter: 1 , criterion: gini , max_depth: 2 , min_samples_split: 2 , max_features: 4
accuracy_score: 0.7805, f1_score: 0.8525
iter: 2 , criterion: gini , max_depth: 2 , min_samples_split: 2 , max_features: 6
accuracy_score: 0.7317, f1_score: 0.8272
iter: 3 , criterion: gini , max_depth: 2 , min_samples_split: 2 , max_features: 8
accuracy_score: 0.7805, f1_score: 0.8508
iter: 4 , criterion: gini , max_depth: 2 , min_samples_split: 2 , max_features: 10
accuracy_score: 0.7805, f1_score: 0.8508
iter: 5 , criterion: gini , max_depth: 2 , min_samples_split: 4 , max_features: 4
accuracy_score: 0.6667, f1_score: 0.7940
iter: 6 , criterion: gini , max_depth: 2 , min_samples_split: 4 , max_features: 6
accuracy_score: 0.6504, f1_score: 0.7882
iter: 7 , criterion: gini , max_depth: 2 , min_samples_split: 4 , max_features: 8
accuracy_score: 0.7805, f1_score: 0.8508
iter: 8 , criterion: gini , max_depth: 2 , min_samples_split: 4 , max_features: 10
accuracy_score: 0.7805, f1_score: 0.8508
iter: 9 , crit

I played with various values for criterion, max_depth, min_samples_split, max_features for the decision tree for total 72 configurations. All the results are printed above. It seems that criterion=gini, max_depth=4/8, min_samples_split=8 and max_features>=4 work better in most of the cases. Also the max_accuracy_score & max_f1_score have also been printed which are 0.7886 and 0.8539 respectivly for the configuration "optimum_criterion: gini , optimum_max_depth: 4, optimum_min_samples_split: 8 , optimum_max_features: 4"

# Task 2

In [69]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier

# define the model
model = BaggingClassifier(n_estimators=50)

# evaluate the model with accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# evaluate the model with f1-scores
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')
print('F1-score: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.766 (0.041)
F1-score: 0.847 (0.026)


In [70]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# define the model
model = AdaBoostClassifier(n_estimators=50, random_state=0, algorithm='SAMME')

# evaluate the model with accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# evaluate the model with f1-scores
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')
print('F1-score: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.803 (0.028)
F1-score: 0.870 (0.018)


As we see, AdaBoostClassifier is working better than BaggingClassifier. Both classifier have been run with repeated k-folds with 5 repeats and 10 folds. So, it has total 50 accuracy & f1 scores. From those we print mean accuracy & f1 scores and the standard deviation of the scores.

# Task 3

From the above three implemented algoithms, we can see the results are as follows:

DecisionTreeClassifier: accuracy_score: 0.788, f1_score: 0.853.               
BaggingClassifier: accuracy_score: 0.771, f1_score: 0.843          
**AdaBoostClassifier**: accuracy_score: **0.803**, f1_score: **0.870**

AdaBoostClassifier works well in comparison to other two. I use accuracy & F1 scores for the comparison as theese scores are perfectly suitable the dataset that I am using for the classification problem. The dataset has 614 datapoints, 11 features and 2 classes with Y as 68% and N as 32% and in this case accuracy & F1 scores completely make sense to measure the effectiveness of the classification model. If the dataset is really imbalanced like Y as 90% and N as 10% then, accuracy would not work well as the metric for comparison.

In our case, we could also choose false negative score as a metric and could tell the model which has lowest false negative score is the best. Because, in our loan case, if an customer who is eligible to get a loan from the bank and if the model decides him/her as not eligible then the bank will loose a potential customer. But if a non-eligible person gets sometimes a loan (assuming the bank has all kind of identifications of the customer) from the bank, then it will not affect the business of the bank much. As recall_score works along with false negative score, I calculate recall score for of all of these three models described above. Higher recall score means better classifier. The results are given below:





In [82]:
criterion_list = ['gini', 'entropy']
max_depth_list = [2, 4, 8]
min_samples_split_list = [2, 4, 8]
max_features_list = [4, 6, 8, 10]

iter = 1
optimum_criterion = None
optimum_max_depth = None
optimum_min_samples_split = None
optimum_max_features = None
max_recall_score = 0
for criterion in criterion_list:
  for max_depth in max_depth_list:
    for min_samples_split in min_samples_split_list:
      for max_features in max_features_list:
        # print('iter:', iter, ', criterion:', criterion, ', max_depth:', max_depth, ', min_samples_split:', min_samples_split, ', max_features:', max_features)
        # Initialize our decision tree object
        classification_tree = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, max_features=max_features)

        # Train our decision tree (tree induction + pruning)
        classification_tree = classification_tree.fit(X_train, y_train)

        # Predict
        y_pred = classification_tree.predict(X_test, check_input=True)
        recall_scor = recall_score(y_pred, y_test)
        # print("recall_score: %.4f" % recall_scor)

        if recall_scor > max_recall_score:
          max_recall_score = recall_scor
          optimum_criterion = criterion
          optimum_max_depth = max_depth
          optimum_min_samples_split = min_samples_split
          optimum_max_features = max_features

        iter += 1

print('DecisionTreeClassifier:')
print("max_recall_score: %.4f" % max_recall_score)
print('optimum_criterion:', optimum_criterion, ', optimum_max_depth:', optimum_max_depth, ', optimum_min_samples_split:', optimum_min_samples_split, ', optimum_max_features:', optimum_max_features)

DecisionTreeClassifier:
max_recall_score: 0.8043
optimum_criterion: gini , optimum_max_depth: 8 , optimum_min_samples_split: 4 , optimum_max_features: 10


In [80]:
# define the model
model = BaggingClassifier(n_estimators=50)

# evaluate the model with accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
print('BaggingClassifier:')
print('Recall: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

BaggingClassifier:
Recall: 0.900 (0.043)


In [81]:
# define the model
model = AdaBoostClassifier(n_estimators=50, random_state=0, algorithm='SAMME')

# evaluate the model with accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1, error_score='raise')
print('AdaBoostClassifier:')
print('Recall: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

AdaBoostClassifier:
Recall: 0.960 (0.027)


As we can see recall is the highest for AdaBoostClassifier again, we can tell AdaBoostClassifier is the best classifier among these three for this particular dataset and particular problem.