In [1]:
!pip install faker
import pandas as pd
import numpy as np
import random
from faker import Faker

fake = Faker()
np.random.seed(42)
random.seed(42)

num_rows = 5000

data = {
    "Applicant_ID": [f"APP{100000 + i}" for i in range(num_rows)],
    "Full_Name": [fake.name() for _ in range(num_rows)],
    "Age": np.random.randint(21, 60, size=num_rows),
    "Gender": np.random.choice(["Male", "Female"], size=num_rows, p=[0.7, 0.3]),
    "Marital_Status": np.random.choice(["Single", "Married", "Divorced", "Widowed"], size=num_rows, p=[0.3, 0.5, 0.1, 0.1]),
    "Dependents": np.random.randint(0, 5, size=num_rows),
    "Education_Level": np.random.choice(["Matric", "Intermediate", "Bachelors", "Masters", "PhD"], size=num_rows, p=[0.1, 0.3, 0.4, 0.15, 0.05]),
    "Employment_Status": np.random.choice(["Employed", "Self-Employed", "Unemployed", "Student"], size=num_rows, p=[0.5, 0.3, 0.15, 0.05]),
    "Occupation": np.random.choice(["Engineer", "Teacher", "Doctor", "Businessperson", "Laborer", "IT Professional", "Other"], size=num_rows),
    "Annual_Income": np.random.randint(200000, 3000000, size=num_rows),
    "Existing_Loans_Count": np.random.randint(0, 5, size=num_rows),
    "Existing_Loan_Amount": np.random.randint(0, 1500000, size=num_rows),
    "Credit_Score": np.random.randint(300, 850, size=num_rows),
    "Loan_Amount_Requested": np.random.randint(50000, 2500000, size=num_rows),
    "Loan_Term_Months": np.random.choice([12, 24, 36, 48, 60, 120, 180], size=num_rows),
    "Purpose_of_Loan": np.random.choice(["Home", "Car", "Business", "Education", "Wedding", "Medical", "Other"], size=num_rows),
    "Collateral_Value": np.random.randint(0, 3000000, size=num_rows),
    "Collateral_Type": np.random.choice(["Property", "Vehicle", "Jewelry", "None"], size=num_rows, p=[0.5, 0.3, 0.15, 0.05]),
}

df = pd.DataFrame(data)
df["Monthly_Income"] = df["Annual_Income"] // 12

def generate_default_label(row):
    risk = 0.05
    if row["Credit_Score"] < 600: risk += 0.15
    if row["Existing_Loans_Count"] > 2: risk += 0.1
    if row["Collateral_Type"] == "None": risk += 0.1
    if row["Loan_Amount_Requested"] > row["Collateral_Value"]: risk += 0.1
    return int(np.random.rand() < risk)

df["Default"] = df.apply(generate_default_label, axis=1)

df.to_excel("Synthetic_Loan_Approval_Data.xlsx", index=False)



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_excel("Synthetic_Loan_Approval_Data.xlsx")

In [4]:
df.head(10)

Unnamed: 0,Applicant_ID,Full_Name,Age,Gender,Marital_Status,Dependents,Education_Level,Employment_Status,Occupation,Annual_Income,Existing_Loans_Count,Existing_Loan_Amount,Credit_Score,Loan_Amount_Requested,Loan_Term_Months,Purpose_of_Loan,Collateral_Value,Collateral_Type,Monthly_Income,Default
0,APP100000,Samantha Ross DVM,59,Male,Married,4,Bachelors,Employed,Businessperson,285240,2,118297,300,2092050,12,Wedding,563188,Vehicle,23770,0
1,APP100001,Gilbert Gould,49,Male,Married,3,Bachelors,Employed,Laborer,808665,4,132876,657,420938,120,Business,2060673,Property,67388,1
2,APP100002,Renee Riddle,35,Male,Married,2,Bachelors,Employed,Teacher,465594,1,78147,417,343413,120,Business,2555499,Property,38799,0
3,APP100003,Amy Hartman,28,Female,Married,3,Bachelors,Student,Other,2305570,1,457760,778,671852,120,Education,2510165,Vehicle,192130,0
4,APP100004,Patrick Johnson,41,Female,Married,1,Masters,Self-Employed,Engineer,2739086,2,834309,758,1825134,48,Education,2683663,Property,228257,0
5,APP100005,Laura Gibson,59,Male,Widowed,1,Bachelors,Unemployed,Laborer,966164,3,1302020,412,2109746,120,Wedding,2760449,Vehicle,80513,0
6,APP100006,Ashley Johnson,39,Male,Married,4,Bachelors,Employed,Other,1607980,3,960160,494,1328906,24,Car,925134,Vehicle,133998,1
7,APP100007,James Fisher,43,Female,Married,2,Matric,Self-Employed,Teacher,1100441,0,1464629,319,1858693,48,Business,2707141,Vehicle,91703,0
8,APP100008,Michele Choi,31,Female,Single,0,Bachelors,Employed,Engineer,2300307,2,315535,585,2101941,24,Education,2147222,Property,191692,0
9,APP100009,Christopher Bryant,31,Male,Single,2,Masters,Employed,Businessperson,1650902,3,13229,757,1587204,120,Car,92541,Vehicle,137575,0


In [5]:
df.isnull().sum()

Applicant_ID               0
Full_Name                  0
Age                        0
Gender                     0
Marital_Status             0
Dependents                 0
Education_Level            0
Employment_Status          0
Occupation                 0
Annual_Income              0
Existing_Loans_Count       0
Existing_Loan_Amount       0
Credit_Score               0
Loan_Amount_Requested      0
Loan_Term_Months           0
Purpose_of_Loan            0
Collateral_Value           0
Collateral_Type          256
Monthly_Income             0
Default                    0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,Age,Dependents,Annual_Income,Existing_Loans_Count,Existing_Loan_Amount,Credit_Score,Loan_Amount_Requested,Loan_Term_Months,Collateral_Value,Monthly_Income,Default
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,40.1946,1.97,1619033.0,2.0116,748802.7,573.933,1264386.0,67.6824,1502526.0,134918.9326,0.2162
std,11.210025,1.413046,809829.2,1.411901,437186.1,157.263564,704421.3,55.540984,857917.3,67485.767751,0.411693
min,21.0,0.0,200116.0,0.0,545.0,300.0,50245.0,12.0,580.0,16676.0,0.0
25%,31.0,1.0,917455.8,1.0,368494.8,439.0,659565.2,24.0,772115.2,76454.25,0.0
50%,40.0,2.0,1632270.0,2.0,755564.0,575.0,1256890.0,48.0,1499304.0,136022.0,0.0
75%,50.0,3.0,2326675.0,3.0,1125991.0,709.0,1872924.0,120.0,2236046.0,193889.25,0.0
max,59.0,4.0,2999931.0,4.0,1499968.0,849.0,2499727.0,180.0,2999748.0,249994.0,1.0


In [7]:
df.drop(["Applicant_ID", "Full_Name"], axis=1, inplace=True)

In [8]:
df.fillna(method="ffill", inplace=True)

In [9]:
df.head(10)

Unnamed: 0,Age,Gender,Marital_Status,Dependents,Education_Level,Employment_Status,Occupation,Annual_Income,Existing_Loans_Count,Existing_Loan_Amount,Credit_Score,Loan_Amount_Requested,Loan_Term_Months,Purpose_of_Loan,Collateral_Value,Collateral_Type,Monthly_Income,Default
0,59,Male,Married,4,Bachelors,Employed,Businessperson,285240,2,118297,300,2092050,12,Wedding,563188,Vehicle,23770,0
1,49,Male,Married,3,Bachelors,Employed,Laborer,808665,4,132876,657,420938,120,Business,2060673,Property,67388,1
2,35,Male,Married,2,Bachelors,Employed,Teacher,465594,1,78147,417,343413,120,Business,2555499,Property,38799,0
3,28,Female,Married,3,Bachelors,Student,Other,2305570,1,457760,778,671852,120,Education,2510165,Vehicle,192130,0
4,41,Female,Married,1,Masters,Self-Employed,Engineer,2739086,2,834309,758,1825134,48,Education,2683663,Property,228257,0
5,59,Male,Widowed,1,Bachelors,Unemployed,Laborer,966164,3,1302020,412,2109746,120,Wedding,2760449,Vehicle,80513,0
6,39,Male,Married,4,Bachelors,Employed,Other,1607980,3,960160,494,1328906,24,Car,925134,Vehicle,133998,1
7,43,Female,Married,2,Matric,Self-Employed,Teacher,1100441,0,1464629,319,1858693,48,Business,2707141,Vehicle,91703,0
8,31,Female,Single,0,Bachelors,Employed,Engineer,2300307,2,315535,585,2101941,24,Education,2147222,Property,191692,0
9,31,Male,Single,2,Masters,Employed,Businessperson,1650902,3,13229,757,1587204,120,Car,92541,Vehicle,137575,0


In [10]:
X = df.drop("Default", axis=1)
y = df["Default"]

In [11]:
y.head(5)

0    0
1    1
2    0
3    0
4    0
Name: Default, dtype: int64

In [12]:
cat_cols = X.select_dtypes(include="object").columns

In [13]:

label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
model = LogisticRegression(
    penalty='l2',        # Regularization type (ridge)
    C=1.0,               # Inverse of regularization strength
    solver='lbfgs',      # Optimization algorithm
    max_iter=1000,       # Max iterations to converge
    class_weight='balanced',  # Adjust for class imbalance
    random_state=42      # Ensures consistent results
)

In [17]:
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [19]:
print(f"\n✅ Model Accuracy: {accuracy * 100:.2f}%\n")
print("📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🔎 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


✅ Model Accuracy: 56.70%

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.56      0.67       784
           1       0.27      0.60      0.38       216

    accuracy                           0.57      1000
   macro avg       0.55      0.58      0.52      1000
weighted avg       0.71      0.57      0.61      1000

🔎 Confusion Matrix:
 [[437 347]
 [ 86 130]]
