In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv("../Data/loan.csv")
df

In [None]:
# Finding duplicate values
df[df.duplicated() == True]

In [None]:
df.info()

In [None]:
# Total number of missing values present in each feature.
df.isna().sum()

In [None]:
# Percentage of missing values in each feature towards in the whole data set.
df.isna().sum() / len(df) * 100

In [None]:
df.isna().sum() / df.shape[0] * 100

In [None]:
null_percent_df = pd.DataFrame(df.isna().sum() / df.shape[0] * 100)
null_percent_df

In [None]:
df['Gender'].fillna(df['Gender'].mode()[0], inplace = True)

In [None]:
# This only considers the features which data type has numeric values
df.describe()

In [None]:
v = df['Gender'].mode()[0]
v
# 'Male' value replaced all the null values in Gender feature

In [None]:
# This consider all features
df.describe(include='all')

In [None]:
df['Married'].fillna(df['Married'].mode()[0], inplace = True)

In [None]:
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace = True)

In [None]:
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace = True)

In [None]:
# Mean is preferred because it gives optimal value
# Mode can give the highest value if that's the maximum count
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace = True)

In [None]:
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace = True)

In [None]:
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace = True)

In [None]:
df['Gender'].replace({"Male" : 1, "Female" : 0}, inplace = True)

In [None]:
df['Married'].unique()

In [None]:
df['Married'].replace({"Yes" : 1, "No" : 0}, inplace = True)

In [None]:
df['Dependents'].unique()

In [None]:
df['Education'].unique()

In [None]:
df['Education'].replace({"Graduate" : 1, "Not Graduate" : 0}, inplace = True)

In [None]:
df['Self_Employed'].unique()

In [None]:
df['Self_Employed'].replace({"Yes" : 1, "No" : 0}, inplace = True)

In [None]:
df['Property_Area'].unique()

In [None]:
df['Loan_Status'].replace({"Y" : 1, "N" : 0}, inplace = True)

In [None]:
df.info()

In [None]:
df.isna().sum() / len(df) * 100

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
df['Dependents'] = le.fit_transform(df['Dependents'])

In [None]:
df['Property_Area'] = le.fit_transform(df['Property_Area'])

In [None]:
le.classes_

In [None]:
le.transform(le.classes_)

In [None]:
le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
le_dict

In [None]:
df.info()

In [None]:
import seaborn as sns

In [None]:
# Box plot of LoanAmount to spot outliers.
# The column is passed as x= explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which changes the
# orientation of the plot.
sns.boxplot(x = df['LoanAmount'])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram
# Right Skewed Data
df['LoanAmount'].hist(bins = 50)

In [None]:
# Power Transform
# To remove the skewness and get Normal Distribution
plt.hist(np.log(df['LoanAmount']))

In [None]:
df['log_LoanAmount'] = np.log(df['LoanAmount'])

In [None]:
sns.boxplot(df['log_LoanAmount'])

In [None]:
sns.distplot(df['log_LoanAmount'], kde = True)

In [None]:
df.columns

In [None]:
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

In [None]:
sns.boxplot(df['TotalIncome'])

In [None]:
df['TotalIncome'].hist()

In [None]:
df['log_TotalIncome'] = np.log(df['TotalIncome'])

In [None]:
df.head()

In [None]:
df.drop(columns=['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome'], axis = 1, inplace = True)

In [None]:
df.columns

In [None]:
df = df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 
              'Property_Area', 'log_LoanAmount', 'log_TotalIncome', 'Loan_Status']]

In [None]:
X = df.drop(columns = ['Loan_Status'])
y = df['Loan_Status']
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 35,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()
model = logreg.fit(X_train, y_train)
model

In [None]:
y_pred_logreg = model.predict(X_test)
y_pred_logreg[0 : 10]

In [None]:
y_test[0 : 10]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score

In [None]:
accuracy_score(y_test, y_pred_logreg)

In [None]:
print(classification_report(y_test, y_pred_logreg))

In [None]:
confusion_matrix(y_test, y_pred_logreg)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10, stratify = y)

In [None]:
logreg = LogisticRegression()
model = logreg.fit(X_train, y_train)
y_pred_logreg = model.predict(X_test)
accuracy_score(y_test, y_pred_logreg)

In [None]:
precision_logreg = precision_score(y_test, y_pred_logreg)
precision_logreg

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree_clf = DecisionTreeClassifier()

In [None]:
tree_clf.fit(X_train, y_train)

In [None]:
y_pred_tree_clf = tree_clf.predict(X_test)

In [None]:
y_pred_tree_clf[0 : 10]

In [None]:
y_test[0 : 10]

In [None]:
accuracy_score(y_test, y_pred_tree_clf)

In [None]:
precision_tree = precision_score(y_test, y_pred_tree_clf)
precision_tree

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
hyperparameter = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : np.arange(2, 15),
    'min_samples_split' : np.arange(2, 10),
    'min_samples_leaf' : np.arange(2, 10)
}

In [None]:
%%time
grid_tree_clf = GridSearchCV(tree_clf, hyperparameter, cv = 10, n_jobs = -1)
grid_tree_clf.fit(X_train, y_train)

In [None]:
hyperparameter = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : np.arange(2, 15),
    'min_samples_split' : np.arange(2, 10),
    'min_samples_leaf' : np.arange(2, 10)
}

In [None]:
%%time
grid_tree_clf = RandomizedSearchCV(tree_clf, hyperparameter, cv = 10, n_jobs = -1)
grid_tree_clf.fit(X_train, y_train)

In [None]:
grid_tree_clf.best_params_

In [None]:
y_pred_grid_tree_clf = grid_tree_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_grid_tree_clf)

In [None]:
precision_tree_hyper = precision_score(y_test, y_pred_grid_tree_clf)
precision_tree_hyper

In [None]:
from xgboost import XGBClassifier, XGBRFClassifier

In [None]:
# Exploratory: prints the built-in help text for XGBRFClassifier.
# NOTE(review): nothing else in the visible cells uses XGBRFClassifier;
# consider removing this cell from the final notebook.
help(XGBRFClassifier)

In [None]:
%%time
xgb = XGBClassifier(n_estimators = 120)
xgb.fit(X_train, y_train)

In [None]:
y_pred_xgb = xgb.predict(X_test)
precision_xgb = precision_score(y_test, y_pred_xgb)
precision_xgb

In [None]:
print(f"Logistic Regression: {precision_logreg}")
print(f"Decision Tree: {precision_tree}")
print(f"Decision Tree Hyper Parameter: {precision_tree_hyper}")
print(f"XGboost: {precision_xgb}")

In [None]:
print(f"Logistic Regression: {precision_logreg * 100}")
print(f"Decision Tree: {precision_tree * 100}")
print(f"Decision Tree Hyper Parameter: {precision_tree_hyper * 100}")
print(f"XGboost: {precision_xgb * 100}")

In [None]:
import pickle

In [None]:
# Persist the trained XGBoost classifier to disk.
# The `with` block closes the file handle, which the bare open() call never
# did. pickle.dump returns None, so its result is no longer assigned to
# `model`; previously that rebound `model` to None.
with open('../Model/model.pkl', 'wb') as model_file:
    pickle.dump(xgb, model_file)