In [None]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_score
import pickle
%matplotlib inline

In [None]:
# Load the raw loan-application records; the path is relative to the notebook's directory.
df = pd.read_csv("../data/loan.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
df.info()

In [None]:
# Total number of missing values present in each feature.
df.isna().sum()

In [None]:
# Percentage of missing values in each feature towards in the whole data set.
null_percent_df = pd.DataFrame(df.isna().sum() / df.shape[0] * 100)
null_percent_df

In [None]:
df['Gender'].fillna(df['Gender'].mode()[0], inplace = True)

In [None]:
v = df['Gender'].mode()[0]
v
# 'Male' value replaced all the null values in Gender feature

In [None]:
# Summary statistics for the numeric columns only (describe()'s default).
df.describe()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

In [None]:
df['Married'].fillna(df['Married'].mode()[0], inplace = True)

In [None]:
df['Dependents'].fillna(df['Dependents'].mode()[0], inplace = True)

In [None]:
df['Self_Employed'].fillna(df['Self_Employed'].mode()[0], inplace = True)

In [None]:
# Mean is preferred because it gives optimal value
# Mode can give the highest value if that's the maximum count
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace = True)

In [None]:
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0], inplace = True)

In [None]:
df['Credit_History'].fillna(df['Credit_History'].mode()[0], inplace = True)

In [None]:
df['Gender'].replace({"Male" : 1, "Female" : 0}, inplace = True)

In [None]:
df['Married'].unique()

In [None]:
df['Married'].replace({"Yes" : 1, "No" : 0}, inplace = True)

In [None]:
df['Dependents'].unique()

In [None]:
df['Education'].unique()

In [None]:
df['Education'].replace({"Graduate" : 1, "Not Graduate" : 0}, inplace = True)

In [None]:
df['Self_Employed'].unique()

In [None]:
df['Self_Employed'].replace({"Yes" : 1, "No" : 0}, inplace = True)

In [None]:
df['Property_Area'].unique()

In [None]:
df['Loan_Status'].replace({"Y" : 1, "N" : 0}, inplace = True)

In [None]:
df.info()

In [None]:
df.isna().sum() / len(df) * 100

In [None]:
le = LabelEncoder()

In [None]:
df['Dependents'] = le.fit_transform(df['Dependents'])

In [None]:
df['Property_Area'] = le.fit_transform(df['Property_Area'])

In [None]:
le.classes_

In [None]:
le.transform(le.classes_)

In [None]:
le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
le_dict

In [None]:
# Re-check dtypes and non-null counts after encoding.
df.info()

In [None]:
# Box plot of the raw loan amounts.
sns.boxplot(df['LoanAmount'])

In [None]:
# Histogram of the raw loan amounts with 50 bins.
# The original author notes that the distribution is right-skewed.
df['LoanAmount'].hist(bins = 50)

In [None]:
# Preview of the natural-log transform (np.log), plotted as a histogram.
# The log is applied to reduce the right skew noted for the raw values.
plt.hist(np.log(df['LoanAmount']))

In [None]:
# Store the log-transformed loan amount as a new feature column.
# NOTE(review): np.log gives -inf for 0 and NaN for negative inputs; this assumes
# LoanAmount is strictly positive -- confirm against the data.
df['log_LoanAmount'] = np.log(df['LoanAmount'])

In [None]:
# Box plot of the log-transformed loan amounts.
sns.boxplot(df['log_LoanAmount'])

In [None]:
# Density-scaled histogram of the log loan amount with a KDE overlay.
# sns.distplot is deprecated; histplot with stat="density" and kde=True is the
# replacement for this kind of plot (cut=3 extends the KDE past the data range,
# as distplot did).
sns.histplot(df['log_LoanAmount'], kde = True, stat = "density", kde_kws = dict(cut = 3))

In [None]:
# Columns currently present in the frame.
df.columns

In [None]:
# Combined income of applicant and co-applicant.
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

In [None]:
# Box plot of the combined income.
sns.boxplot(df['TotalIncome'])

In [None]:
# Histogram of the combined income (default bin count).
df['TotalIncome'].hist()

In [None]:
# Natural log of the combined income, stored as a new feature column.
# NOTE(review): np.log gives -inf when TotalIncome is 0; this assumes every row has a
# positive combined income -- confirm against the data.
df['log_TotalIncome'] = np.log(df['TotalIncome'])

In [None]:
# Preview the frame with the engineered columns.
df.head()

In [None]:
df.drop(columns=['Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'TotalIncome'], axis = 1, inplace = True)

In [None]:
df.columns

In [None]:
df = df[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 
              'Property_Area', 'log_LoanAmount', 'log_TotalIncome', 'Loan_Status']]

In [None]:
X = df.drop(columns = ['Loan_Status'])
y = df['Loan_Status']
X.shape

In [None]:
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
%%time
# Baseline model: XGBoost classifier with 120 trees and otherwise default settings,
# fitted on the training split. %%time reports how long the cell takes.
xgbc = XGBClassifier(n_estimators = 120)
xgbc.fit(X_train, y_train)

In [None]:
# Precision of the baseline model on the held-out test split.
y_pred_xgbc = xgbc.predict(X_test)
precision_xgbc = precision_score(y_test, y_pred_xgbc)
precision_xgbc

In [None]:
# Search space sampled by the randomized hyperparameter search.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale] (the second argument
# is a width, not an upper bound), and scipy.stats.randint(low, high) samples integers
# from low up to high - 1.
param_dist = {
    'learning_rate': uniform(0.01, 0.1), # learning rate sampled from [0.01, 0.11]
    'max_depth': randint(1, 10), # max depth sampled from 1..9
    'n_estimators': randint(10, 500), # number of estimators sampled from 10..499
    'gamma': uniform(0.01, 0.1), # gamma sampled from [0.01, 0.11]
    'subsample': uniform(0.1, 0.5), # subsample sampled from [0.1, 0.6]
    'colsample_bytree': uniform(0.1, 0.5) # colsample_bytree sampled from [0.1, 0.6]
}

In [None]:
# Randomized hyperparameter search around the baseline estimator.
# random_state seeds the sampling of param_dist so that a fresh "Restart & Run All"
# evaluates the same 100 candidates and reports the same best parameters.
xgbc_rscv = RandomizedSearchCV(
    estimator = xgbc,
    param_distributions = param_dist,
    n_iter = 100, # number of combinations of hyperparameters to try
    cv = 5, # number of cross-validation folds
    scoring = 'accuracy',
    n_jobs = -1, # number of parallel jobs to run (-1 means use all available cores)
    random_state = 42 # same seed as the train/test split, for reproducible sampling
)

In [None]:
%%time
# Run the randomized search on the training split; %%time reports the wall time.
xgbc_rscv.fit(X_train, y_train)

In [None]:
# Best sampled parameter combination and its mean cross-validated score
# (accuracy, per the search's scoring setting).
print('Best hyperparameters: ', xgbc_rscv.best_params_)
print('Best score: ', xgbc_rscv.best_score_)

In [None]:
best_learning_rate = xgbc_rscv.best_params_['learning_rate']
best_max_depth = xgbc_rscv.best_params_['max_depth']
best_n_estimators = xgbc_rscv.best_params_['n_estimators']
best_gamma = xgbc_rscv.best_params_['gamma']
best_subsample = xgbc_rscv.best_params_['subsample']
best_colsample_bytree = xgbc_rscv.best_params_['colsample_bytree']

In [None]:
xgb_classifier = XGBClassifier(
    learning_rate = best_learning_rate,
    max_depth = best_max_depth,
    n_estimators = best_n_estimators,
    gamma = best_gamma,
    subsample = best_subsample,
    colsample_bytree = best_colsample_bytree
)

In [None]:
xgb_classifier.fit(X_train, y_train)

In [None]:
y_pred = xgb_classifier.predict(X_test)

In [None]:
# accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Accuracy:", accuracy * 100)

In [None]:
# confusion matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", confusion_matrix)

In [None]:
sns.heatmap(confusion_matrix, annot = True, cmap = 'Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Classification Report
print('Classification Report:')
print(classification_report(y_test, y_pred))

In [None]:
# Generate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Convert the report to a DataFrame
df_report = pd.DataFrame(report).transpose()

# Plot the report
sns.heatmap(df_report.drop(['support'], axis=1), annot=True)
plt.show()

In [None]:
# ROC curve and AUC on the test split.
# Probabilities come from xgb_classifier, the same fitted model that produced y_pred
# for the accuracy, confusion-matrix, classification-report and precision cells, so
# every reported metric describes one model (the original used xgbc_rscv here only).
y_proba = xgb_classifier.predict_proba(X_test)[:, 1]  # probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_proba)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

auc = roc_auc_score(y_test, y_proba)
print('AUC:', auc)

In [None]:
# Precision
precision = precision_score(y_test, y_pred)
print('Precision:', precision)

In [None]:
# Persist the fitted search object to disk.
# The `with` block closes the file handle even if pickling fails; the original
# open(...) call was never closed. pickle.dump returns None, so its result is no
# longer bound to a `model` variable.
with open('../models/xgboost.pkl', 'wb') as model_file:
    pickle.dump(xgbc_rscv, model_file)

In [None]:
# Reload the persisted search object from the pickle file.
with open('../models/xgboost.pkl', 'rb') as model_file:
    xgbc_rscv_model = pickle.load(model_file)

# One sample row, given in the encoded / log-transformed form of the training
# features and in the same column order. Add more records to the list to score
# several rows at once.
feature_names = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
                 'log_LoanAmount', 'log_TotalIncome']
feature_values = [1, 1, 0, 0, 0, 360, 1, 0, 4.5, 8.2]
new_data = pd.DataFrame([dict(zip(feature_names, feature_values))])

# Predict the loan status for the sample row.
predictions = xgbc_rscv_model.predict(new_data)

# Show the predicted class label(s).
print(predictions)