In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error

data = pd.read_csv('Task 3 and 4_Loan_Data.csv')

X = data.drop(['customer_id', 'default'], axis=1)
y = data['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_scaled, y_train)

# Predict default probabilities on the test set
default_probs = logreg_model.predict_proba(X_test_scaled)[:, 1]

# Get FICO scores from the test set
fico_scores = X_test['fico_score']

# Calculate the mean squared error for different bucket boundaries
mse_values = []
num_buckets = 5  # You can adjust the number of buckets

for i in range(1, num_buckets):
    bucket_boundaries = np.percentile(fico_scores, np.linspace(0, 100, i + 1))
    bucket_predictions = [default_probs[(fico_scores >= bucket_boundaries[j]) & (fico_scores <= bucket_boundaries[j + 1])].mean() for j in range(i)]
    mse = mean_squared_error(bucket_predictions, y_test[:len(bucket_predictions)])  # Ensure lengths match
    mse_values.append((i, mse))

# Find the optimal number of buckets with minimum MSE
optimal_num_buckets = min(mse_values, key=lambda x: x[1])[0]
print("Optimal Number of Buckets:", optimal_num_buckets)

# Calculate bucket boundaries based on the optimal number of buckets
optimal_bucket_boundaries = np.percentile(fico_scores, np.linspace(0, 100, optimal_num_buckets + 1))

print("Optimal Bucket Boundaries:", optimal_bucket_boundaries)

# Calculate log-likelihood values for different bucket boundaries
log_likelihood_values = []
num_buckets = 5  # You can adjust the number of buckets

for i in range(1, num_buckets):
    bucket_boundaries = np.percentile(fico_scores, np.linspace(0, 100, i + 1))
    bucket_indices = np.digitize(fico_scores, bucket_boundaries) - 1

    bucket_probabilities = [default_probs[bucket_indices == j].mean() for j in range(i)]
    bucket_defaults = [y_test[bucket_indices == j].sum() for j in range(i)]
    bucket_total = [len(y_test[bucket_indices == j]) for j in range(i)]
    bucket_likelihood = sum([(default * np.log(prob) + (total - default) * np.log(1 - prob)) for default, prob, total in zip(bucket_defaults, bucket_probabilities, bucket_total)])

    log_likelihood_values.append((i, bucket_likelihood))

# Find the optimal number of buckets with maximum log-likelihood
optimal_num_buckets = max(log_likelihood_values, key=lambda x: x[1])[0]
print("Optimal Number of Buckets (Log-Likelihood):", optimal_num_buckets)

# Calculate bucket boundaries based on the optimal number of buckets
optimal_bucket_boundaries = np.percentile(fico_scores, np.linspace(0, 100, optimal_num_buckets + 1))

print("Optimal Bucket Boundaries (Log-Likelihood):", optimal_bucket_boundaries)


Optimal Number of Buckets: 1
Optimal Bucket Boundaries: [425. 831.]
Optimal Number of Buckets (Log-Likelihood): 3
Optimal Bucket Boundaries (Log-Likelihood): [425.         613.         664.66666667 831.        ]
