In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Importing data from the source directory in Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# data_dir = '/content/drive/MyDrive/JPM_QR/'
# data_file = 'Loan_Data.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the input CSV. `data_dir` is concatenated directly with
# `data_file` when the data is loaded, so it must end with a path separator.
# The default reads from the current working directory; on Google Colab,
# point `data_dir` at the mounted Drive folder instead.
data_dir = './'
data_file = 'Loan_Data.csv'

In [4]:
# Build the CSV path from the directory and file name set in earlier cells,
# then load it into a DataFrame.
csv_path = data_dir + data_file
loan_data = pd.read_csv(csv_path)

In [5]:
# Preprocessing
# Make sure the target column holds integers (e.g. if it was read as boolean)
default_flag = loan_data['default']
loan_data['default'] = default_flag.astype(int)

In [6]:
# Target: the default flag. Features: every remaining column except the
# customer identifier.
y = loan_data['default']
X = loan_data.drop(columns=['customer_id', 'default'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Standardise the features. The scaler is fitted on the training set only and
# then applied to both the training and the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Fit a logistic regression (default hyper-parameters) on the scaled
# training features.
logreg = LogisticRegression()
logreg.fit(X=X_train_scaled, y=y_train)

In [10]:
# Function to predict the probability of default
def predict_default(features):
    """Return the model's probability of default for a single borrower.

    Parameters
    ----------
    features : sequence of numbers
        One borrower's feature values, in the same column order as the data
        the module-level ``scaler`` was fitted on.

    Returns
    -------
    float
        The class-1 (default) probability from ``logreg.predict_proba``.
    """
    row = [list(features)]

    # When the scaler was fitted on a DataFrame it remembers the column names.
    # Passing a bare list would then trigger scikit-learn's
    # "X does not have valid feature names" warning, so rebuild a one-row
    # frame carrying those names.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        # Local import so this function does not rely on the module-level
        # `pd` alias.
        import pandas

        row = pandas.DataFrame(row, columns=feature_names)

    # Scaling the feature values in the same way as the training data
    scaled_features = scaler.transform(row)

    # predict_proba returns [P(class 0), P(class 1)] per row; keep class 1.
    default_probability = logreg.predict_proba(scaled_features)[0][1]
    return default_probability

In [11]:
# Function to calculate the expected loss
def expected_loss(loan_amount_outstanding, pd, recovery_rate=0.1):
    """Return the expected loss on a loan.

    The outstanding amount is first reduced by the recovered fraction
    (``recovery_rate``); the remaining loss given default is then weighted
    by the probability of default ``pd``.
    """
    unrecovered_share = 1 - recovery_rate
    return pd * (loan_amount_outstanding * unrecovered_share)

In [12]:
# Evaluate on the held-out set using ROC AUC.
# Column 1 of predict_proba holds the probability of the positive (default) class.
test_probabilities = logreg.predict_proba(X_test_scaled)
y_pred_proba = test_probabilities[:, 1]
roc_value = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score of the model: {roc_value}")

ROC AUC Score of the model: 0.9999704294341932


In [13]:
# Example usage:
# Example borrower features, given in the same column order as the training
# features X (you should replace these with real values)
borrower_features = [3, 15000, 25000, 70000, 5, 680]

# Predicting the PD. The result is stored as `prob_default` rather than `pd`
# so that the pandas alias imported at the top of the notebook is not
# overwritten (which would break any cell that later calls pd.read_csv etc.).
prob_default = predict_default(borrower_features)

# Calculating the expected loss
loan_amount = borrower_features[1]  # Assuming the loan amount is at index 1
el = expected_loss(loan_amount, prob_default)
print(f"The expected loss for the loan is: ${el:.2f}")

The expected loss for the loan is: $13214.91




In [14]:
# Preview the first five rows of the loan data.
loan_data.head(n=5)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [17]:
# Pull the FICO score column out of the frame as a NumPy array.
fico_column = loan_data['fico_score']
fico = fico_column.to_numpy()

In [18]:
# Display the FICO score array as the cell output.
fico

array([605, 572, 602, ..., 596, 647, 757])

In [19]:
from sklearn.cluster import KMeans
import numpy as np

# Assume fico is a numpy array containing the FICO scores
# Define the number of buckets
k = 10

# Apply K-means clustering to find buckets that minimize MSE.
# The scores are reshaped to a single-feature column because KMeans expects
# 2-D input. n_init is pinned explicitly: scikit-learn changed its default
# (10 -> 'auto'), so leaving it unset makes the boundaries depend on the
# installed version and emits a FutureWarning on some releases.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(fico.reshape(-1, 1))

# Sort the centroids so that neighbouring entries are adjacent buckets.
centroids = np.sort(kmeans.cluster_centers_.flatten())

# Calculate bucket boundaries as the midpoint between centroids
# (k centroids give k - 1 boundaries)
boundaries = (centroids[:-1] + centroids[1:]) / 2

print("Bucket boundaries that minimize MSE:")
print(boundaries)



Bucket boundaries that minimize MSE:
[515.803125   557.74228499 589.35438287 616.17682916 640.25482563
 663.76367822 688.67258767 716.99561396 755.7019132 ]
