In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, auc, accuracy_score, precision_recall_curve
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the loan-application records from the CSV file into a DataFrame.
# The location is relative to the notebook's working directory.
file_path = './dataset/loan_application.csv'
loan_application_data = pd.read_csv(filepath_or_buffer=file_path)

# Bare last expression: the notebook renders the frame as the cell output.
loan_application_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Anomaly
0,Male,No,0,Graduate,No,5849,0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
94,Male,No,0,Not Graduate,No,3620,0,25.0,120.0,1.0,Semiurban,Y
95,Male,No,0,Graduate,,6782,0,,360.0,,Urban,N
96,Female,Yes,0,Graduate,No,2484,2302,137.0,360.0,1.0,Semiurban,Y
97,Male,Yes,0,Graduate,No,1977,997,50.0,360.0,1.0,Semiurban,Y


In [3]:
# Missing Value Treatment
# Fill the gaps column by column: numeric columns receive their mean, every
# other column receives its most frequent value (the first mode).
for column in loan_application_data.columns:
    values = loan_application_data[column]
    # Ask pandas whether the column is numeric instead of comparing the dtype
    # against 'object', so that string/category columns are treated as
    # categorical too rather than being sent down the mean() branch.
    if pd.api.types.is_numeric_dtype(values):
        fill_value = values.mean()
    else:
        observed_modes = values.mode()
        if observed_modes.empty:
            # Column has no observed values at all, so there is nothing to
            # impute with; leave it untouched instead of failing on [0].
            continue
        fill_value = observed_modes[0]
    # Assign the filled column back to the frame. Calling
    # `frame[column].fillna(..., inplace=True)` operates on an intermediate
    # Series: pandas 2.x warns about it and under Copy-on-Write the frame
    # itself is never updated.
    loan_application_data[column] = values.fillna(fill_value)

# Show the resulting column dtypes as the cell output.
loan_application_data.dtypes

Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Anomaly               object
dtype: object

In [4]:
# Applying LabelEncoder for converting non-numeric columns to integer codes.
# The work is done on a copy so the imputed frame keeps its original values.
label_encoder = LabelEncoder()
loan_application_copy = loan_application_data.copy()

# For every encoded column, remember the category list the encoder learned.
# The position of a category in the list is the integer code it was given,
# so the codes can still be translated back after the shared encoder has been
# refit on the next column.
label_classes = {}
for column in loan_application_copy.columns:
    # Numeric columns are kept as they are; only the remaining columns
    # (text, category, ...) are replaced by integer codes.
    if pd.api.types.is_numeric_dtype(loan_application_copy[column]):
        continue
    loan_application_copy[column] = label_encoder.fit_transform(
        loan_application_copy[column].astype(str)
    )
    label_classes[column] = list(label_encoder.classes_)

# Show the encoded frame as the cell output.
loan_application_copy

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Anomaly
0,1,0,0,0,0,5849,0,134.255319,360.0,1.000000,2,1
1,1,1,1,0,0,4583,1508,128.000000,360.0,1.000000,0,0
2,1,1,0,0,1,3000,0,66.000000,360.0,1.000000,2,1
3,1,1,0,1,0,2583,2358,120.000000,360.0,1.000000,2,1
4,1,0,0,0,0,6000,0,141.000000,360.0,1.000000,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
94,1,0,0,1,0,3620,0,25.000000,120.0,1.000000,1,1
95,1,0,0,0,0,6782,0,134.255319,360.0,0.835165,2,0
96,0,1,0,0,0,2484,2302,137.000000,360.0,1.000000,1,1
97,1,1,0,0,0,1977,997,50.000000,360.0,1.000000,1,1


In [5]:
# Separate the target from the features ('Anomaly' is taken to be the label
# column) and hold out a quarter of the rows for testing.
y = loan_application_copy['Anomaly']
x = loan_application_copy.drop(columns='Anomaly')
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,  # fixed seed so the split is repeatable
)

# Fit the logistic regression model; max_iter=1000 gives the solver extra
# iterations to converge.
classifier = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Score the held-out rows and give each prediction a readable label.
y_pred = classifier.predict(x_test)
y_predd = list(map(lambda flag: "Anomaly" if flag == 1 else "Not Anomaly", y_pred))

# Reorder the columns so that 'Anomaly' comes first.
cols = ['Anomaly']
cols.extend(col for col in loan_application_copy if col != 'Anomaly')
loan_application_copy = loan_application_copy[cols]

# Show the reordered frame as the cell output.
loan_application_copy

Unnamed: 0,Anomaly,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,0,0,5849,0,134.255319,360.0,1.000000,2
1,0,1,1,1,0,0,4583,1508,128.000000,360.0,1.000000,0
2,1,1,1,0,0,1,3000,0,66.000000,360.0,1.000000,2
3,1,1,1,0,1,0,2583,2358,120.000000,360.0,1.000000,2
4,1,1,0,0,0,0,6000,0,141.000000,360.0,1.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...
94,1,1,0,0,1,0,3620,0,25.000000,120.0,1.000000,1
95,0,1,0,0,0,0,6782,0,134.255319,360.0,0.835165,2
96,1,0,1,0,0,0,2484,2302,137.000000,360.0,1.000000,1
97,1,1,1,0,0,0,1977,997,50.000000,360.0,1.000000,1


In [6]:
# Evaluate the test-set predictions: overall accuracy plus the confusion
# matrix (rows are the true classes, columns the predicted classes).
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Matrix first, accuracy on the following line.
print(cm, f'Accuracy: {accuracy}', sep='\n')

[[ 7  4]
 [ 0 14]]
Accuracy: 0.84


In [7]:
# Calculate probabilities, precision, recall, F1 score, and AUC
# Column 1 of predict_proba is used as the score of the positive class
# (assumes the fitted classes are [0, 1] -- check classifier.classes_).
lr_probs = classifier.predict_proba(x_test)[:, 1]

# The thresholds are bound to a regular name rather than `_`, which IPython
# uses for the previous cell output.
lr_precision, lr_recall, lr_thresholds = precision_recall_curve(y_test, lr_probs)

# F1 is computed from the hard predictions, not from the probabilities.
lr_f1 = f1_score(y_test, y_pred)

# Note: this is the area under the precision-recall curve, not ROC AUC.
lr_auc = auc(lr_recall, lr_precision)

# The model evaluated here is the logistic regression classifier, so the
# report is labelled accordingly.
print(f'Logistic Regression: f1={lr_f1:.3f} auc={lr_auc:.3f}')

Random Forest: f1=0.875 auc=0.836
