ALL OF THE LIBRARIES THAT I'M GOING TO USE

In [29]:
import pandas as pd
import numpy as np
import pickle 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [30]:
# Load the loan dataset from the project's data folder and preview the first five rows.
dataset_path = './data/full_dataset.csv'
full_set = pd.read_csv(dataset_path)
full_set.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [31]:
full_set.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

Looking at the data types in the dataset, everything looks okay.

In [32]:
full_set.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

Looking at the shape of the whole dataset.

In [33]:
print(full_set.shape)

(614, 13)


Let's drop the `Loan_ID` column: it is only a row identifier and carries no predictive information.

In [34]:
# Reassign instead of mutating in place, and tolerate the column already being
# gone, so this cell can be re-run without raising a KeyError.
full_set = full_set.drop(columns=['Loan_ID'], errors='ignore')
full_set.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Our dependent variable (y), `Loan_Status`, takes the values Y (Yes) and N (No). Let's encode this column numerically, mapping `Y` to `1` and `N` to `0`.

In [35]:
# Encode the target as integers: 'N' -> 0, 'Y' -> 1.
# Guard against re-running the cell: mapping an already-encoded column through
# the same dict would silently turn every label into NaN.
if not full_set['Loan_Status'].isin([0, 1]).all():
    full_set['Loan_Status'] = full_set['Loan_Status'].map({'N':0,'Y':1})
full_set.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


Let's look at the missing values now.

In [36]:
full_set.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

It looks like we have some missing values. To prevent data leakage, I'm going to split the data first and then impute the test data using statistics learned from the training data only.

In [37]:
# Separate the target column (y) from the feature columns (X).
y_full = full_set["Loan_Status"]
X_full = full_set.drop(columns=["Loan_Status"])

In [38]:
X_full

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban


Train-Test split and looking at the shapes.

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X_full, y_full, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# The message is in Turkish: "shape of the training set / shape of the test set".
print(f"Eğitim Kümesinin şekli: {X_train.shape}, \nTest Kümesinin Şekli: {X_test.shape}")

Eğitim Kümesinin şekli: (491, 11), 
Test Kümesinin Şekli: (123, 11)


Since there isn't much data in the training set, I'm going to use cross-validation to get the most out of our data. Then I will use Grid Search to fine-tune a Random Forest model.
But first we have to turn our categorical data into numerical data and fill in the missing values. As preprocessing, the numerical values will be scaled and the categorical data will be converted to numbers by one-hot encoding.

In order to prevent data leakage, all of these preprocessing steps must be fitted within each fold of the cross-validation.
First, I'm going to collect the names of the numerical and categorical columns.


In [39]:
# Names of the numeric feature columns (the target is excluded first).
feature_frame = full_set.drop(columns=['Loan_Status'])
num_features = feature_frame.select_dtypes(include='number').columns
num_features

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [40]:
# Names of the categorical (object-typed) feature columns.
# The target is dropped explicitly, as is done for num_features, so that it can
# never end up in the feature list if it is still stored as 'Y'/'N' strings.
cat_features = full_set.drop(['Loan_Status'], axis = 1).select_dtypes(include = 'object').columns
cat_features

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

With the help of Pipeline, I'm going to impute the missing numerical values with the mean and then scale them; for the categorical values, I'm going to fill the missing values with the string 'missing' and then one-hot encode them.

In [41]:
# Numeric branch: fill NaNs with the column mean, then min-max scale.
num_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
        ('MinMaxScaler', MinMaxScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' is passed so categories not seen during fit do not raise.
cat_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(fill_value='missing', strategy='constant')),
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore', categories='auto', drop=None)),
    ]
)


Creating the ColumnTransformer

In [42]:
# Route each column group through its own sub-pipeline; any column that is
# not listed in either group is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ],
    remainder='drop',
    verbose=False,
    n_jobs=-1,
)

Now I can combine the preprocessing and the Random Forest model.

In [43]:
# Chain preprocessing and the classifier into a single estimator.
# random_state is fixed so that the forest (and therefore the grid-search
# ranking and the reported scores) is reproducible from run to run; 42 matches
# the seed already used for the train/test split.
pipe = Pipeline(steps = [('preprocess', preprocessor),
                         ('RF_model', RandomForestClassifier(class_weight = "balanced",
                                                             n_jobs=-1,
                                                             random_state=42))],
                verbose=False)

Creating a dictionary for the variables to use in GridSearch.

In [49]:
# Hyper-parameter grid for the 'RF_model' pipeline step
# (5 x 6 x 7 = 210 combinations).
parameters_grid = [
    dict(
        RF_model__n_estimators=[10, 20, 30, 40, 50],
        RF_model__max_features=['sqrt', 'log2', 0.125, 0.25, 0.5, 0.75],
        RF_model__max_depth=[2, 3, 4, 5, 6, 7, 8],
    )
]

I'm going to do a 10-fold cross validation and look at the accuracy rates to compare them.

In [50]:
# Exhaustive search over parameters_grid, scored by accuracy with 10-fold CV.
search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

Fitting the training set

In [51]:
# Run the grid search on the training split only.
# Note: `best_model` is the fitted search object itself (its `best_estimator_`
# is read in a later cell), not a bare estimator.
best_model = search.fit(X_train, y_train)

Fitting 10 folds for each of 210 candidates, totalling 2100 fits


In [53]:
best_model.best_estimator_


Pipeline(steps=[('preprocess',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('num',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer()),
                                                                  ('MinMaxScaler',
                                                                   MinMaxScaler())]),
                                                  Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('Imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
           

Making predictions with the best model.

In [54]:
# Predict class labels for the training split and then for the test split.
ytrain_pred, ytest_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

Creating a function to look at the scores.

In [55]:
def _classification_scores(y_true, y_pred):
    """Return accuracy, precision, recall, F1 and AUC for one set of predictions."""
    return {"Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "F1 Score": f1_score(y_true, y_pred),
            # NOTE(review): the AUC is computed from whatever is passed as
            # y_pred. If these are hard class labels rather than probabilities
            # or decision scores, the ROC curve has a single threshold --
            # consider passing predict_proba(...)[:, 1] instead.
            "AUC": roc_auc_score(y_true, y_pred)}


def TrainTestScores(y_train, y_train_pred, y_test, y_test_pred):
    """Compute the same classification metrics for the train and the test split.

    Parameters
    ----------
    y_train, y_test
        True labels of the training and test sets.
    y_train_pred, y_test_pred
        Predictions for the training and test sets.

    Returns
    -------
    dict
        ``{"train_set": {...}, "test_set": {...}}`` where each inner dict has
        the keys "Accuracy", "Precision", "Recall", "F1 Score" and "AUC".
        Both inner dicts share identical keys (the test-set AUC key used to
        carry a stray trailing colon, "AUC:").
    """
    return {"train_set": _classification_scores(y_train, y_train_pred),
            "test_set": _classification_scores(y_test, y_test_pred)}

In [57]:
TrainTestScores(y_train, ytrain_pred , y_test, ytest_pred)

{'train_set': {'Accuracy': 0.8207739307535642,
  'Precision': 0.8067632850241546,
  'Recall': 0.9766081871345029,
  'F1 Score': 0.8835978835978836,
  'AUC': 0.7198477177283253},
 'test_set': {'Accuracy': 0.7804878048780488,
  'Precision': 0.7572815533980582,
  'Recall': 0.975,
  'F1 Score': 0.8524590163934427,
  'AUC:': 0.6968023255813953}}

These scores are good enough to build a web application with a machine learning model running in the background.

In [58]:
# Persist the fitted search object to disk. The context manager closes the
# file even if pickling raises part-way through.
with open("classifier.pkl", mode = "wb") as pickle_out:
    pickle.dump(best_model, pickle_out)

With the code above we have now saved our model, and we can load it again at any time with the code below.

In [59]:
# Reload the saved model. The context manager makes sure the file handle is
# closed (the handle was previously left open).
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)