In [14]:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


# Seed the legacy global NumPy RNG so every draw in this cell is repeatable.
np.random.seed(42)
n = 100_000  # number of synthetic applicants

# Location label per applicant: P(Urban) = 0.35, P(Rural) = 0.65.
geolocation = np.random.choice(['Urban', 'Rural'], size=n, p=[0.35, 0.65])

# Loan amount and income are log-normal draws (sigma 0.5), clipped to a fixed
# range and truncated to whole numbers.
loan_amount = (
    np.random.lognormal(mean=np.log(1e5), sigma=0.5, size=n)
    .clip(10_000, 500_000)
    .astype(int)
)
income = (
    np.random.lognormal(mean=np.log(27_500), sigma=0.5, size=n)
    .clip(5_000, 200_000)
    .astype(int)
)
loan_to_income = np.round(loan_amount / income, 2)

# Psychometric score: normal(60, 15) limited to the 0-100 scale, 2 decimals.
psych_score = np.random.normal(loc=60, scale=15, size=n).clip(0, 100).round(2)

# Loan purpose and the probability of drawing each one (insertion order is
# the category order handed to the sampler).
purpose_shares = {
    'Medical': 0.29,
    'Wedding': 0.22,
    'Home Renovation': 0.14,
    'Festival': 0.09,
    'Debt Consolidation': 0.08,
    'Education': 0.07,
    'Business': 0.06,
    'Miscellaneous': 0.05,
}
purpose = np.random.choice(
    list(purpose_shares.keys()),
    size=n,
    p=list(purpose_shares.values()),
)


# Location-conditioned behavioural features.
#
# Each feature is drawn from a distribution whose parameters depend on whether
# the applicant is Urban or Rural. NumPy's samplers accept array-valued
# parameters, so every feature is generated with one vectorized call (one
# parameter value per applicant, selected with np.where) instead of a
# Python-level loop that calls the sampler once per row.
# NOTE(review): the distributions are unchanged, but re-run the downstream
# cells and confirm the reported metrics after applying this change.
is_urban = (geolocation == 'Urban')

# Utility payment delay: Poisson(1.5) for Urban, Poisson(3.0) for Rural,
# capped at 30. Delays above 6 raise the stress flag.
utility_delay = np.random.poisson(np.where(is_urban, 1.5, 3.0))
utility_delay = np.clip(utility_delay, 0, 30)
high_utility_stress = (utility_delay > 6).astype(int)

# UPI transaction volume: negative binomial with p = 0.4 and n = 30 (Urban)
# or n = 10 (Rural), capped at 250.
upi_volume = np.random.negative_binomial(np.where(is_urban, 30, 10), 0.4)
upi_volume = np.clip(upi_volume, 0, 250)

# Data usage: normal(26, 10) for Urban, normal(12, 5) for Rural, limited to
# [5, 60] and rounded to 2 decimals.
data_usage = np.random.normal(
    np.where(is_urban, 26, 12),
    np.where(is_urban, 10, 5),
)
data_usage = np.clip(data_usage, 5, 60).round(2)

# Call count: normal with mean 250 (Urban) or 350 (Rural) and sd 150,
# limited to [0, 1000] and truncated to integers.
call_count = np.random.normal(np.where(is_urban, 250, 350), 150)
call_count = np.clip(call_count, 0, 1000).astype(int)

# Online purchases: Poisson(5) for Urban, Poisson(1) for Rural, capped at 15.
online_purchases = np.random.poisson(np.where(is_urban, 5, 1))
online_purchases = np.clip(online_purchases, 0, 15)

# Demographics
# Each mapping pairs a category with the probability of drawing it; insertion
# order is the category order handed to the sampler.
employment_shares = {'Salaried': 0.21, 'Self-employed': 0.57, 'Unemployed': 0.22}
employment_status = np.random.choice(
    list(employment_shares.keys()),
    size=n,
    p=list(employment_shares.values()),
)

education_shares = {
    'None': 0.10,
    'Primary': 0.25,
    'Secondary': 0.35,
    'Graduate': 0.20,
    'Postgraduate': 0.10,
}
education = np.random.choice(
    list(education_shares.keys()),
    size=n,
    p=list(education_shares.values()),
)


# Min-max scale the two count features independently (a fresh scaler is fit
# for each one) and flatten the (n, 1) results back to 1-D arrays.
upi_scaled, online_scaled = (
    MinMaxScaler().fit_transform(counts.reshape(-1, 1)).flatten()
    for counts in (upi_volume, online_purchases)
)

# Composite digital-activity score: 40% UPI volume, 30% online purchases,
# 30% data usage (expressed as a fraction of 60 and bounded to [0, 1]).
data_usage_share = (data_usage / 60).clip(0, 1)
digital_activity = (
    0.4 * upi_scaled + 0.3 * online_scaled + 0.3 * data_usage_share
).round(3)

# Behavioural risk flag: 1 only when all three conditions hold together.
behavioral_risk = np.logical_and.reduce((
    psych_score < 50,
    digital_activity < 0.3,
    employment_status == 'Unemployed',
)).astype(int)


# Default probability = weighted sum of binary risk indicators.
# Each entry is (weight, indicator array); the entries are accumulated in the
# order listed. The weights add up to more than 1, so the total is clipped to
# [0, 1] before it is used as a probability.
# NOTE(review): given the loan and income distributions defined earlier in
# this cell, loan_to_income appears to exceed 0.4 for nearly every row, which
# would make the first term close to a constant offset - confirm the threshold.
risk_terms = [
    (0.25, loan_to_income > 0.4),
    (0.15, psych_score < 50),
    (0.08, purpose == 'Debt Consolidation'),
    (0.05, purpose == 'Education'),
    (0.10, utility_delay > 5),
    (0.08, high_utility_stress),
    (0.10, upi_volume < 10),
    (0.05, data_usage < 10),
    (0.05, online_purchases < 2),
    (0.10, employment_status == 'Unemployed'),
    (0.10, geolocation == 'Rural'),
    (0.10, digital_activity < 0.3),
    (0.15, behavioral_risk),
]
default_prob = sum(
    weight * indicator.astype(float) for weight, indicator in risk_terms
)

# Bernoulli draw per applicant: default when a uniform sample falls below the
# clipped probability.
default = (np.random.rand(n) < default_prob.clip(0, 1)).astype(int)


# Assemble the generated arrays into one table. The insertion order below is
# the column order of the resulting frame.
feature_columns = {
    # Loan and income
    'LoanAmount': loan_amount,
    'MonthlyIncome': income,
    'LoanToIncomeRatio': loan_to_income,
    'PsychometricScore': psych_score,
    'PurposeOfLoan': purpose,
    # Utility payments
    'UtilityPaymentDelay': utility_delay,
    'HighUtilityStress': high_utility_stress,
    # Digital footprint (scaled columns are stored with 3 decimals)
    'UPITransactionVolume': upi_volume,
    'UPI_Scaled': upi_scaled.round(3),
    'DataUsageGB': data_usage,
    'MonthlyCallCount': call_count,
    'OnlinePurchases': online_purchases,
    'OnlinePurchases_Scaled': online_scaled.round(3),
    'DigitalActivityScore': digital_activity,
    # Demographics
    'EmploymentStatus': employment_status,
    'EducationLevel': education,
    'Geolocation': geolocation,
    # Derived flag and target
    'BehavioralRiskFlag': behavioral_risk,
    'LoanDefault': default,
}
df = pd.DataFrame(feature_columns)

print("Dataset generated. Shape:", df.shape)



# Integer-encode the categorical columns in place.
cat_cols = ['PurposeOfLoan', 'EmploymentStatus', 'EducationLevel', 'Geolocation']
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Fit a separate LabelEncoder per column and keep each one, keyed by column
# name. Refitting a single shared encoder would discard the category mapping
# of every column except the last, making it impossible to decode those
# columns or to apply the same encoding to new data.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is still bound to the encoder of the last column in
# cat_cols, as it was when one shared encoder was used.


# Model on a fixed 10,000-row random subset of the generated table.
df_sampled = df.sample(n=10000, random_state=42)
y = df_sampled["LoanDefault"]
X = df_sampled.drop(columns=["LoanDefault"])

# Hold out 20% of the subset for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with 50 trees, each limited to depth 10.
rf = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Score the held-out rows and report the standard classification metrics.
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(" Accuracy:", round(test_accuracy, 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Dataset generated. Shape: (100000, 19)
 Accuracy: 0.6265

Confusion Matrix:
 [[497 478]
 [269 756]]

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.51      0.57       975
           1       0.61      0.74      0.67      1025

    accuracy                           0.63      2000
   macro avg       0.63      0.62      0.62      2000
weighted avg       0.63      0.63      0.62      2000

