# Lending Club Credit Risk Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve

# Seed NumPy's global random generator so the notebook is repeatable.
np.random.seed(42)

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the bare 'seaborn' name. Use whichever
# of the two is registered so the notebook runs on old and new versions.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

In [None]:
# Read the two CSV snapshots: the raw export and the preprocessed ("golden")
# version of the same data.
raw_df = pd.read_csv('src/data/lending_club_raw_data.csv')
golden_df = pd.read_csv('src/data/lending_club_golden_data.csv')

# Report (rows, columns) for each frame.
print("Raw dataset shape:", raw_df.shape)
print("Golden dataset shape:", golden_df.shape)

# Preview the first five raw rows. This is the cell's last expression, so
# the notebook renders the returned frame.
print("\nRaw data sample:")
raw_df.head(5)

In [None]:
# List the raw schema, then the columns that appear only in the golden frame.
raw_columns = raw_df.columns.tolist()
print("Raw data columns:", raw_columns)

# The result list keeps golden_df's column order; the set is only used for
# the membership test.
known_columns = set(raw_columns)
golden_only = [name for name in golden_df.columns if name not in known_columns]
print("\nGolden data additional features:", golden_only)

In [None]:
# Null count per column, computed once and filtered to columns that
# actually contain missing values.
print("\nMissing values in raw data:")
null_counts = raw_df.isnull().sum()
print(null_counts[null_counts > 0])

# Column dtypes as inferred when the CSV was read.
print("\nData types:")
print(raw_df.dtypes)

# Relative frequency of each loan_status value.
print("\nTarget distribution:")
status_share = raw_df['loan_status'].value_counts(normalize=True)
print(status_share)

In [None]:
# 2x2 overview of the raw data: two histograms on top, a scatter plot and a
# per-grade default-rate bar chart below.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_fico, ax_dti), (ax_income, ax_grade) = axes

# Top-left: histogram of fico_score.
sns.histplot(data=raw_df, x='fico_score', ax=ax_fico)
ax_fico.set_title('FICO Score Distribution')

# Top-right: histogram of dti.
sns.histplot(data=raw_df, x='dti', ax=ax_dti)
ax_dti.set_title('DTI Distribution')

# Bottom-left: loan_amount against annual_income, semi-transparent points.
sns.scatterplot(data=raw_df, x='annual_income', y='loan_amount', alpha=0.5, ax=ax_income)
ax_income.set_title('Loan Amount vs Annual Income')

# Bottom-right: for each grade, the fraction of loans whose status is
# exactly 'Default'.
status_by_grade = raw_df.groupby('grade')['loan_status']
df_default = status_by_grade.apply(lambda statuses: statuses.eq('Default').mean())
sns.barplot(x=df_default.index, y=df_default.values, ax=ax_grade)
ax_grade.set_title('Default Rate by Grade')

plt.tight_layout()
plt.show()

In [None]:
# 2x2 overview of the engineered columns in the golden dataset.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: count plots of the category columns, with the x tick labels
# rotated by 45 degrees.
for ax, column, title in (
    (axes[0, 0], 'fico_category', 'FICO Category Distribution'),
    (axes[0, 1], 'dti_category', 'DTI Category Distribution'),
):
    sns.countplot(data=golden_df, x=column, ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

# Bottom row: histograms of the log-transformed columns.
for ax, column, title in (
    (axes[1, 0], 'log_income', 'Log Income Distribution'),
    (axes[1, 1], 'log_loan_amount', 'Log Loan Amount Distribution'),
):
    sns.histplot(data=golden_df, x=column, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Build the model inputs from the golden dataset: standardized numeric
# columns followed by one-hot encoded categorical columns.
numeric_features = ['log_income', 'log_loan_amount', 'dti', 'fico_score']
categorical_features = [
    'term', 'grade', 'home_ownership', 'purpose', 'fico_category', 'dti_category',
]

# NOTE(review): the scaler is fitted on every row of golden_df, before the
# train/test split performed later, so test-row statistics feed into the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
X_numeric = scaler.fit_transform(golden_df[numeric_features])

# One-hot encode; drop_first=True drops the first level of each encoded column.
X_categorical = pd.get_dummies(golden_df[categorical_features], drop_first=True)

# Numeric block first, dummies second; the feature-importance cell builds
# its feature names in the same order.
X = np.concatenate((X_numeric, X_categorical), axis=1)

# Binary target: 1 where loan_status is exactly 'Default', otherwise 0.
y = golden_df['loan_status'].eq('Default').astype(int)

print(f"Final feature matrix shape: {X.shape}")

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed so the split
# is repeatable. Stratifying on y keeps the share of defaults the same in
# the training and test partitions, so a random draw cannot leave one of
# them with too few positives.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

In [None]:
# Gradient boosting hyperparameters, kept in one place so they are easy to
# inspect or tweak. random_state fixes the estimator's own randomness.
gbm_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'min_samples_split': 200,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbm_params)

# Fit on the training partition only.
model.fit(X_train, y_train)
print("Model training completed")

In [None]:
# Score the held-out rows. Column index 1 of predict_proba is used as the
# score for the positive class (y == 1, i.e. 'Default').
y_pred_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.3f}')

# ROC curve with the diagonal drawn as a dashed black reference line.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr)
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title(f'ROC Curve (AUC = {roc_auc:.3f})')
plt.show()

In [None]:
# Column labels in the same order the feature matrix was assembled:
# numeric features first, then the one-hot columns.
feature_names = [*numeric_features, *X_categorical.columns]

# Pair each label with the fitted model's importance and keep the ten
# largest, in descending order.
importances = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .iloc[:10]
)

# Horizontal bar chart: importance on x, feature label on y.
imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importances, x='importance', y='feature', ax=imp_ax)
imp_ax.set_title('Top 10 Feature Importances')
plt.show()