# 🏦 Loan Default Risk Prediction

This notebook predicts whether a borrower is likely to default on a loan using a Random Forest classifier. We load a Lending Club–style dataset from `loan_data.csv`, clean and preprocess the data, train the model, and evaluate it on a held-out test set.

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings("ignore")

## 📂 Load Dataset

In [None]:
# Read the loan records from the CSV file (path is relative to the kernel's
# working directory) into the frame every later cell works from.
df = pd.read_csv(filepath_or_buffer="loan_data.csv")

# Preview the first five rows as the cell output.
df.head(n=5)

## 🧼 Data Cleaning

In [None]:
# Drop every row that has at least one missing value in any column.
# Rebinding `df` (instead of `inplace=True`) keeps the cell chain-friendly and
# safe to re-run: a second run simply drops zero rows.
rows_before = len(df)
df = df.dropna()

# Report how much data the blanket dropna cost, so silent data loss is visible.
rows_dropped = rows_before - len(df)
print(f"Dropped {rows_dropped} of {rows_before} rows containing missing values")

# Fail here with a clear message rather than later inside the split/fit cells.
if df.empty:
    raise ValueError(
        "No rows left after dropna(); restrict the drop to the columns "
        "actually needed or impute missing values instead."
    )

# Sanity check: every per-column null count should now be zero.
df.isnull().sum()

## 🎯 Encode Target Variable

In [None]:
# Binary label encoding: 0 = loan repaid, 1 = loan defaulted.
status_to_target = {"Fully Paid": 0, "Charged Off": 1}

# Show the raw status distribution (printed, because only the last expression
# of a cell is displayed automatically).
print(df['loan_status'].value_counts())

# Keep only rows whose status has a defined label. Any other status would map
# to NaN, and a NaN target breaks the stratified split and model fitting.
# `assign` returns a new frame, so re-running this cell is idempotent.
has_known_status = df['loan_status'].isin(list(status_to_target))
df = df[has_known_status].assign(
    target=lambda frame: frame['loan_status'].map(status_to_target).astype(int)
)

# Class balance of the encoded target.
df['target'].value_counts()

## 📊 Exploratory Analysis (Optional)

In [None]:
# Bar chart of how many loans fall into each status, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df, x='loan_status', ax=ax)
ax.set_title("Loan Status Distribution")
plt.show()

## 🧠 Feature Selection

In [None]:
# Label vector and feature matrix: the raw status and its encoded form are
# both removed from the inputs so the label cannot leak into the features.
y = df["target"]
X = df.drop(columns=["loan_status", "target"])

# Columns that will be standardised.
numeric_features = [
    "loan_amnt",
    "int_rate",
    "installment",
    "annual_inc",
    "dti",
]

# Every remaining column is treated as categorical (one-hot encoded later).
# NOTE(review): this includes any numeric or high-cardinality columns present
# in the CSV that are not listed above — confirm that is intended.
categorical_features = [
    name for name in X.columns
    if name not in numeric_features
]

## 🔀 Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## ⚙️ Preprocessing Setup

In [None]:
# Standardise the numeric columns.
numeric_step = ("num", StandardScaler(), numeric_features)

# One-hot encode the categorical columns; categories not seen during fit are
# encoded as all zeros instead of raising at predict time.
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## 🤖 Model Training

In [None]:
# Random forest with 100 trees and a fixed seed for repeatable results.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Chain preprocessing and the classifier so the transformers are fitted on the
# training split only and re-applied unchanged at prediction time.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

# Fit on the training split; the fitted pipeline is the cell output.
pipeline.fit(X_train, y_train)

## 📈 Model Evaluation

In [None]:
# Predict on the held-out test split.
y_pred = pipeline.predict(X_test)

# Target encoding used in this notebook: 0 = "Fully Paid", 1 = "Charged Off".
# Passing the labels explicitly pins the row/column order of every metric below.
class_ids = [0, 1]
class_names = ["Fully Paid", "Charged Off"]

print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the report on its own: print(a, b) would insert a space before the
# report and shift its header row out of alignment with the columns.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

# Confusion matrix: rows are actual classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()