### Business Case: Predict, based on the given applicant features, whether a loan will be approved or not.

In [10]:
# Importing Library
!pip install preprocess
import pickle   # importing pickle for saving and loading machine learning models
import pandas as pd  # importing pandas for analyzing, cleaning, exploring, and manipulating data
from sklearn.model_selection import train_test_split  # importing train_test_split for spliting the data
from sklearn import preprocessing
from preprocess import *  # importing * for import all functions at once
from imblearn.over_sampling import SMOTE  # importing SMOTE for Balancing the Data



In [11]:
# Load the dataset
df=pd.read_csv("loan_approved.csv")  

In [12]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status (Approved)
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [13]:
# Checking the null values 

df.isnull().sum()  

Loan_ID                    0
Gender                    13
Married                    3
Dependents                15
Education                  0
Self_Employed             32
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                22
Loan_Amount_Term          14
Credit_History            50
Property_Area              0
Loan_Status (Approved)     0
dtype: int64

In [14]:
# Imputing null values 
df['Gender'].value_counts()  # Checking counts of unique values

Gender
Male      489
Female    112
Name: count, dtype: int64

In [15]:
df['Gender'].mode()

0    Male
Name: Gender, dtype: object

In [16]:
# Fill missing Gender values with the column's most frequent value.
# The mode is computed from the data (mode() ignores nulls) instead of being hard-coded,
# so the cell stays correct if the dataset changes.
df.loc[df['Gender'].isnull(),'Gender']=df['Gender'].mode()[0]

In [17]:
df['Gender'].isnull().sum()

0

In [18]:
df['Gender'].unique()  # Checking unique values of Gender Columns

array(['Male', 'Female'], dtype=object)

In [19]:
df['Married'].value_counts()  # Checking counts of unique values

Married
Yes    398
No     213
Name: count, dtype: int64

In [20]:
# Fill missing Married values with the column's most frequent value,
# computed from the data (mode() ignores nulls) rather than hard-coded.
df.loc[df['Married'].isnull(),'Married']=df['Married'].mode()[0]

In [21]:
df['Married'].unique() 

array(['No', 'Yes'], dtype=object)

In [22]:
df['Dependents'].value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [23]:
# Fill missing Dependents values with the column's most frequent value,
# computed from the data (mode() ignores nulls) rather than hard-coded.
df.loc[df['Dependents'].isnull(),'Dependents']=df['Dependents'].mode()[0]

In [24]:
df['Dependents'].unique() 

array(['0', '1', '2', '3+'], dtype=object)

In [25]:
df['Self_Employed'].value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [26]:
# Fill missing Self_Employed values with the column's most frequent value,
# computed from the data (mode() ignores nulls) rather than hard-coded.
df.loc[df['Self_Employed'].isnull(),'Self_Employed']=df['Self_Employed'].mode()[0]

In [27]:
df['Self_Employed'].unique() 

array(['No', 'Yes'], dtype=object)

In [31]:
df.isnull().sum()

Loan_ID                    0
Gender                     0
Married                    0
Dependents                 0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term          14
Credit_History            50
Property_Area              0
Loan_Status (Approved)     0
dtype: int64

In [29]:
# Replace missing LoanAmount entries with the column median
# (median() skips nulls, so it is computed from the observed values only).
loan_amount_median = df['LoanAmount'].median()
df['LoanAmount'] = df['LoanAmount'].fillna(loan_amount_median)

In [30]:
df['LoanAmount'].median()  # Checking median of LoanAmount

128.0

In [32]:
df['Loan_Amount_Term'].value_counts()   # Checking counts of unique values

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [33]:
# Fill missing Loan_Amount_Term values with the column's most frequent value.
# The mode is computed from the data (mode() ignores nulls) instead of hard-coding 360.0.
df.loc[df['Loan_Amount_Term'].isnull(),'Loan_Amount_Term']=df['Loan_Amount_Term'].mode()[0]

In [34]:
df['Loan_Amount_Term'].value_counts() 

Loan_Amount_Term
360.0    526
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [35]:
df['Credit_History'].value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [36]:
# Fill missing Credit_History values with 0.0.
# NOTE(review): unlike the other imputed columns, 0.0 is NOT the most frequent value here
# (the value_counts above show 1.0 -> 475 rows vs 0.0 -> 89 rows). Presumably a deliberately
# conservative choice (unknown history treated as no history) - confirm this is intended.
df.loc[df['Credit_History'].isnull(),'Credit_History']=0.0

In [37]:
df.isnull().sum()   # Checking null values are available or not

Loan_ID                   0
Gender                    0
Married                   0
Dependents                0
Education                 0
Self_Employed             0
ApplicantIncome           0
CoapplicantIncome         0
LoanAmount                0
Loan_Amount_Term          0
Credit_History            0
Property_Area             0
Loan_Status (Approved)    0
dtype: int64

# Loading preprocessor

In [38]:
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

In [39]:
def divide_by_12(x):
    """Return ``x`` divided by 12 (true division; works element-wise on arrays)."""
    divisor = 12
    quotient = x / divisor
    return quotient

In [40]:
custom=FunctionTransformer(divide_by_12)

In [41]:
def same(x):
    """Identity function: hand the argument back unchanged (a no-op transform)."""
    unchanged = x
    return unchanged

In [42]:
no_trans=FunctionTransformer(same)

In [43]:
class ModifiedLabelEncoder(LabelEncoder):
    """LabelEncoder variant that returns its encoded labels as a column vector.

    Both ``fit_transform`` and ``transform`` delegate to ``LabelEncoder`` and then
    reshape the result to ``(-1, 1)``. Any extra positional or keyword arguments
    are accepted but ignored.
    """

    def fit_transform(self, y, *args, **kwargs):
        """Fit the encoder on ``y`` and return the encoded labels with shape ``(-1, 1)``."""
        encoded = super().fit_transform(y)
        return encoded.reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        """Encode ``y`` with the already-fitted encoder; result has shape ``(-1, 1)``."""
        encoded = super().transform(y)
        return encoded.reshape(-1, 1)

In [44]:
# Load a preprocessor object from a pickled file
# WARNING: pickle.load can execute arbitrary code - only load "preprocessor.pkl" from a trusted source.
# NOTE(review): presumably the pickled object refers to divide_by_12, same and
# ModifiedLabelEncoder by name, which is why they are defined in the cells above
# before unpickling - confirm against the notebook that created the pickle.
with open("preprocessor.pkl","rb") as f:
    preprocessor=pickle.load(f)

In [45]:
preprocessor  # Preprocessor or PipeLine

# Splitting the data

In [46]:
# Splitting the data into train and test sets
# - Only `df` is split; the target column stays inside the frame and is separated
#   from the features after preprocessing, so no dummy `y` argument is needed.
# - `stratify` keeps the Y/N ratio of the (imbalanced) target the same in both splits.
# - `random_state` makes the split reproducible on Restart & Run All.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Loan_Status (Approved)'],
)

In [47]:
train['Loan_Status (Approved)'].value_counts()  # Checking count of unique values of Loan_Status (Approved) columns

Loan_Status (Approved)
Y    337
N    154
Name: count, dtype: int64

# Transforming the data

In [48]:
# Transform the training data using the preprocessor object or PipeLine
processed_data=preprocessor.fit_transform(train)

  y = column_or_1d(y, warn=True)


In [49]:
processed_data

array([[1., 0., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 0., 1., 1.],
       [1., 0., 1., ..., 3., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 1.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 2., 1., 1.]])

In [50]:
# Extract the target variable (last column) from the processed data
y_train=processed_data[:,-1] # Last column is target

In [51]:
y_train

array([1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1.,
       0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
       1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0.

In [52]:
# Extract the features (all columns except the last one) from the processed data
x_train=processed_data[:,:-1]

In [53]:
x_train

array([[1., 0., 0., ..., 1., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 1., 3., 1.],
       ...,
       [1., 0., 0., ..., 1., 0., 1.],
       [1., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 2., 1.]])

# Balancing the data

In [55]:
# SMOTE
# Initialize the SMOTE (Synthetic Minority Over-sampling Technique) object.
# A fixed random_state makes the generated synthetic samples (and therefore every
# downstream model and metric) reproducible between runs.
smote=SMOTE(random_state=42)

In [56]:
# Apply SMOTE to the training data to balance the class distribution
x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)

In [57]:
len(y_train)  # Checking length of actual y_train

491

In [58]:
len(y_train_smote)   # Checking length of y_train after apply Smote

674

# Model Building 

In [59]:
from sklearn.svm import SVC  # # assign Support vector classifier
svclassifier = SVC() ## base model with default parameters
svclassifier.fit(x_train_smote,y_train_smote)  ## Fit the SVC to the resampled training data

# Validating the model

### Processing the test data

In [60]:
test_processed=preprocessor.transform(test) 

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [61]:
x_test=test_processed[:,:-1]  # Extract the features (all columns except the last one) from the processed data
y_test=test_processed[:,-1]   # Extract the target variable (last column) from the processed data

In [62]:
# Getting predictions from model
y_pred=svclassifier.predict(x_test)

In [63]:
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 1.])

In [64]:
y_test

array([0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 1., 1.])

# Evaluating the model performance

In [65]:
# Importing the classification_report function from sklearn.metrics
from sklearn.metrics import classification_report
# Printing the classification report comparing the true labels (y_test) and the predicted labels (y_pred)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.60      0.08      0.14        38
         1.0       0.70      0.98      0.82        85

    accuracy                           0.70       123
   macro avg       0.65      0.53      0.48       123
weighted avg       0.67      0.70      0.61       123



# Hyper Parameter Tuning

In [66]:
from itertools import product  # Importing the product function from itertools module

# Parameter grid for the linear kernel.
# 'gamma' is intentionally omitted: it is only used by the 'rbf', 'poly' and 'sigmoid'
# kernels, so listing it here would just double the number of (identical) fits.
param_grid_linear = {
    'C': [0.1, 5, 10, 50, 60, 70],
    'kernel': ['linear'],
}

# Parameter grid for the Radial Basis Function (RBF) kernel.
param_grid_rbf = {
    'C': [0.1, 5, 10, 50, 60, 70],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto'],
}

# Parameter grid for the polynomial kernel ('degree' only applies to this kernel).
param_grid_poly = {
    'C': [0.1, 5, 10, 50, 60, 70],
    'kernel': ['poly'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
}

# Meaning of the gamma options in sklearn.svm.SVC:
#   'scale' : 1 / (n_features * X.var())
#   'auto'  : 1 / n_features

In [None]:
# Importing GridSearchCV from sklearn
from sklearn.model_selection import GridSearchCV

# Fresh, unfitted SVC used as the base estimator for the search
model=SVC()

# Defining the grid search using GridSearchCV
# - Only param_grid_poly is searched in this cell; param_grid_linear and param_grid_rbf are not used here
# - 'refit=True' refits the best parameter combination on all of the data passed to fit()
# - 'verbose=2' controls the verbosity of the grid search process (higher values result in more output)
# - 'scoring='f1'' specifies the scoring metric for evaluating the model's performance during grid search
# - 'cv=5' specifies 5-fold cross-validation for evaluating each combination of hyperparameters
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so validation folds can
# contain synthetic samples and CV scores may be optimistic - consider resampling inside
# each fold (e.g. an imblearn Pipeline); confirm before relying on the CV scores.
grid = GridSearchCV(model,param_grid=param_grid_poly, refit = True, verbose = 2,scoring='f1',cv=5)

# fitting the model for grid search
grid.fit(x_train_smote,y_train_smote)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=2, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ...........C=0.1, degree=2, gamma=auto, kernel=poly; total time=   0.0s
[CV] END ...........C=0.1, degree=2, gamma=auto, kernel=poly; total time=   0.0s
[CV] END ...........C=0.1, degree=2, gamma=auto, kernel=poly; total time=   0.0s
[CV] END ...........C=0.1, degree=2, gamma=auto, kernel=poly; total time=   0.0s
[CV] END ...........C=0.1, degree=2, gamma=auto, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.0s
[CV] END ..........C=0.1, degree=3, gamma=scale

In [None]:
print(grid.best_params_)   # Printing the best parameters found by the grid search

In [None]:
# Show the parameter grid that was actually searched.
# (A bare `param_grid` name is never defined in this notebook and would raise NameError;
# the grid is available as an attribute of the fitted GridSearchCV object.)
grid.param_grid

In [None]:
y_hat=grid.predict(x_test)  # Use the best model from grid search to make predictions on the test set

In [None]:
# Generate a classification report comparing true labels (y_test) with predicted labels (y_hat)
print(classification_report(y_test,y_hat))

In [None]:
# Opening a file named "model.pkl" in write-binary mode
# The 'wb' mode is used for writing binary data to the file
with open("model.pkl","wb") as f:
    pickle.dump(grid,f)      # Using pickle to serialize and save the grid search object to the file

In [None]:
# Conclusion