In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from pandas_profiling import ProfileReport
import warnings
warnings.simplefilter('ignore')
%matplotlib inline
plt.switch_backend('agg')

  from pandas_profiling import ProfileReport


In [2]:
df = pd.read_csv("training_data.csv")

In [3]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Education'] = df['Education'].fillna(df['Education'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
# I'm applying mode b/c these are categorical values and mode preserves the distribution of the data and does not introduce bias in the central tendency.

In [5]:
df['Dependents'] = df['Dependents'].replace('3+', 3).astype(float) # To make the column readable by python
#Now we're doing label encoding to the categorical items
df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})
df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df.tail(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
604,LP002959,0,1,1.0,1,0,12000,0.0,496.0,360.0,1.0,Semiurban,Y
605,LP002960,1,1,0.0,0,0,2400,3800.0,,180.0,1.0,Urban,N
606,LP002961,1,1,1.0,1,0,3400,2500.0,173.0,360.0,1.0,Semiurban,Y
607,LP002964,1,1,2.0,0,0,3987,1411.0,157.0,360.0,1.0,Rural,Y
608,LP002974,1,1,0.0,1,0,3232,1950.0,108.0,360.0,1.0,Rural,Y
609,LP002978,0,0,0.0,1,0,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,1,1,3.0,1,0,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,1,1,1.0,1,0,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,1,1,2.0,1,0,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,0,0,0.0,1,1,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [6]:
#Now only credit_history is remaining and I'll delete these missing values because this is the value on which my conclusion is based
df = df.dropna(subset=['Credit_History', 'Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term','Property_Area'])
# df = df[df['Credit_History'].notna()]

In [7]:


df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
rows_to_duplicate = df[df['Credit_History'] == 0]
duplicated_rows = rows_to_duplicate.copy()
df = pd.concat([df, duplicated_rows], ignore_index=True)
df.reset_index(drop=True, inplace=True)

In [9]:

ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
# df=pd.read_csv('Cleaned_Updated_LoanData.csv')
numeric_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numeric_imputer = SimpleImputer(strategy='mean')# Impute missing values in numeric columns with mean
df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])
categorical_imputer = SimpleImputer(strategy='most_frequent')# Impute missing values in categorical columns with most frequent value
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])# Standardize numeric columns
encoder = OneHotEncoder(drop='first')
encoded_features = encoder.fit_transform(df[categorical_columns])# One-hot encode categorical columns

In [11]:
# Get the column names for the encoded categorical columns
encoded_columns = encoder.get_feature_names_out(categorical_columns)
# Create a DataFrame with the encoded features and column names
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoded_columns)
# Combine numeric and encoded categorical columns
final_df = pd.concat([df[numeric_columns], encoded_df], axis=1)
final_df.head(10)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_1,Married_1,Dependents_1.0,Dependents_2.0,Dependents_3.0,Education_1,Self_Employed_1,Property_Area_Semiurban,Property_Area_Urban
0,-0.144563,-0.022783,-0.218777,0.28673,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.369914,-0.627575,-0.944376,0.28673,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,-0.429276,0.318115,-0.312403,0.28673,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.057157,-0.627575,-0.066635,0.28673,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-0.025837,1.055255,1.40797,0.28673,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
5,-0.464866,-0.019574,-0.604983,0.28673,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,-0.364789,0.376669,0.13232,0.28673,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
7,-0.226703,-0.015564,0.249352,0.28673,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
8,1.031021,3.771204,2.367634,0.28673,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
9,-0.341442,-0.346836,-0.897563,0.28673,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [12]:
X = final_df  # Features
y = df['Credit_History']  # Target variable

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
param_grid = {
    'hidden_layer_sizes': [(200,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.00097, 0.00099, 0.0001]
}
mlp_classifier = MLPClassifier(random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, scoring='precision', cv=3)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train.values.ravel())

# Get the best estimator
best_mlp_classifier = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(grid_search.best_params_)

# Normalize the testing data using the scaler fitted on training data
scaler = StandardScaler()
X_test_normalized = scaler.fit_transform(X_test)

# Predict on the normalized test set
y_pred = best_mlp_classifier.predict(X_test_normalized)

# Calculate the false positive rate
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100

# Print the false positive rate as a percentage of the whole
print("False Positives Rate: {:.2f}%".format(false_positive_rate))

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


Best Hyperparameters:
{'activation': 'relu', 'alpha': 0.00099, 'hidden_layer_sizes': (200,), 'solver': 'adam'}
False Positives Rate: 60.00%


In [15]:
# Decision Tree Classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
dt_classifier = DecisionTreeClassifier(max_depth=20, min_samples_split=2, min_samples_leaf=1, random_state=42)
# Max_depth to maximize information gain
dt_classifier.fit(X_train, y_train)
dt_classifier.random_state = 42
y_pred = dt_classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Pastel1')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Decision Tree Classifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
classifiers = ['Decision Tree']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]
plt.figure(figsize=(10, 6))
plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)
plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)
plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("Confusion Matrix:")
print(conf_matrix)

False Positive Rate: 23.33%
Accuracy: 0.78
Precision: 0.91
Recall: 0.79
Confusion Matrix:
[[23  7]
 [19 70]]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
logreg_classifier = LogisticRegression(C=0.81, penalty='l2', random_state=42, max_iter=1000)
logreg_classifier.random_state = 42
logreg_classifier.fit(X_train, y_train.values.ravel())
y_pred_proba = logreg_classifier.predict_proba(X_test)
thresholds_range = np.arange(0.70, 0.82, 0.01)

false_positive_rates = []
accuracy_scores = []
precision_scores = []
recall_scores = []

# Iterate through each threshold value
for threshold in thresholds_range:
    # Classify based on the current threshold
    y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

    # Compute confusion matrix, false positive rate, etc.
    conf_matrix = confusion_matrix(y_test, y_pred)
    false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100

    # Compute accuracy, precision, and recall
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Append results to lists
    false_positive_rates.append(false_positive_rate)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Print or store any additional information if needed

# Plot the results
plt.figure(figsize=(10, 6))

plt.subplot(1, 4, 1)
plt.plot(thresholds_range, false_positive_rates, marker='o', linestyle='-')
plt.title('False Positive Rate')
plt.xlabel('Threshold')
plt.ylabel('False Positive Rate')

plt.subplot(1, 4, 2)
plt.plot(thresholds_range, accuracy_scores, marker='o', linestyle='-')
plt.title('Accuracy')
plt.xlabel('Threshold')
plt.ylabel('Accuracy')

plt.subplot(1, 4, 3)
plt.plot(thresholds_range, precision_scores, marker='o', linestyle='-')
plt.title('Precision')
plt.xlabel('Threshold')
plt.ylabel('Precision')

plt.subplot(1, 4, 4)
plt.plot(thresholds_range, recall_scores, marker='o', linestyle='-')
plt.title('Recall')
plt.xlabel('Threshold')
plt.ylabel('Recall')

plt.tight_layout()
plt.show()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
logreg_classifier = LogisticRegression(C=0.81, penalty='l2', random_state=42, max_iter=1000)
logreg_classifier.random_state = 42
logreg_classifier.fit(X_train, y_train.values.ravel())
y_pred_proba = logreg_classifier.predict_proba(X_test)

# Define a new threshold
new_threshold = 0.75

y_pred = (y_pred_proba[:, 1] >= new_threshold).astype(int)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Logistic Regression Classifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Calculate and plot accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

classifiers = ['Logistic Regression']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]

plt.figure(figsize=(10, 6))

plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)

plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)

plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)

plt.tight_layout()
plt.show()

# Print metrics and confusion matrix
print("Best Hyperparameters:")
print("C:", logreg_classifier.C)
print("Penalty:", logreg_classifier.penalty)
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print(conf_matrix)


Best Hyperparameters:
C: 0.81
Penalty: l2
False Positive Rate: 60.00%
Accuracy: 0.61
Precision: 0.77
Recall: 0.67
[[12 18]
 [29 60]]


In [18]:
#Randomforest
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
best_rf_classifier = RandomForestClassifier(random_state=42, n_estimators=250, max_depth=20, min_samples_split=2, min_samples_leaf=1)
best_rf_classifier.random_state = 42
best_rf_classifier.fit(X_train, y_train.values.ravel())# Fit the classifier to the training data
y_pred = best_rf_classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Reds')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Random Forest Classifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
classifiers = ['Random Forest']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]
plt.figure(figsize=(10, 6))
plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)
plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)
plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
print("Best Hyperparameters:")
print("n_estimators:", best_rf_classifier.n_estimators)
print("max_depth:", best_rf_classifier.max_depth)
print("min_samples_split:", best_rf_classifier.min_samples_split)
print("min_samples_leaf:", best_rf_classifier.min_samples_leaf)
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))

Best Hyperparameters:
n_estimators: 250
max_depth: 20
min_samples_split: 2
min_samples_leaf: 1
False Positive Rate: 20.00%
Accuracy: 0.94
Precision: 0.94
Recall: 0.99


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
best_hyperparameters = {'n_neighbors': 7, 'weights': 'distance'}
best_knn_classifier = KNeighborsClassifier(**best_hyperparameters)
best_knn_classifier.fit(X_train, y_train.values.ravel())
y_pred = best_knn_classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='BuGn')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for KNeighborsClassifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
classifiers = ['KNeighbors']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]
plt.figure(figsize=(10, 6))
plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)
plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)
plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
print("Best Hyperparameters:")
print(best_hyperparameters)
print(conf_matrix)
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))

Best Hyperparameters:
{'n_neighbors': 7, 'weights': 'distance'}
[[26  4]
 [12 77]]
False Positive Rate: 13.33%
Accuracy: 0.87
Precision: 0.95
Recall: 0.87


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)
best_hyperparameters = {'C': 22, 'gamma': 'scale', 'kernel': 'rbf'}
best_svm_classifier = SVC(**best_hyperparameters, random_state=42)
best_svm_classifier.fit(X_train, y_train.values.ravel())
y_pred = best_svm_classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for SVM Classifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
classifiers = ['SVM']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]
plt.figure(figsize=(10, 6))
plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)
plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)
plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
print("Best Hyperparameters:")
print(best_hyperparameters)
print(conf_matrix)
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))

Best Hyperparameters:
{'C': 22, 'gamma': 'scale', 'kernel': 'rbf'}
[[12 18]
 [ 8 81]]
False Positive Rate: 60.00%
Accuracy: 0.78
Precision: 0.82
Recall: 0.91


In [21]:
#Naive Bayes classifier
# because of the class-imbalance the value of accuracy achieved is quite-low
nb_classifier = GaussianNB()
nb_classifier.random_state = 42
nb_classifier.fit(X_train, y_train.values.ravel())
y_pred = nb_classifier.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
false_positive_rate = conf_matrix[0, 1] / (conf_matrix[0, 1] + conf_matrix[0, 0]) * 100
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='No Skill')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve for Gaussian Naive Bayes Classifier')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
classifiers = ['Naive Bayes']
accuracy_scores = [accuracy]
precision_scores = [precision]
recall_scores = [recall]
plt.figure(figsize=(10, 6))
plt.subplot(1, 3, 1)
plt.bar(classifiers, accuracy_scores, color='skyblue')
plt.title('Accuracy')
plt.ylim(0, 1)
plt.subplot(1, 3, 2)
plt.bar(classifiers, precision_scores, color='salmon')
plt.title('Precision')
plt.ylim(0, 1)
plt.subplot(1, 3, 3)
plt.bar(classifiers, recall_scores, color='lightgreen')
plt.title('Recall')
plt.ylim(0, 1)
plt.tight_layout()
plt.show()
print(conf_matrix)
print("False Positive Rate: {:.2f}%".format(false_positive_rate))
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))

[[ 1 29]
 [ 8 81]]
False Positive Rate: 96.67%
Accuracy: 0.69
Precision: 0.74
Recall: 0.91
