<a href="https://colab.research.google.com/github/c90x/loan_prediction/blob/main/loan_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loan Dataset - Model

## Dependencies

In [None]:
from pathlib import Path

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Configure theme

In [None]:
# Switch matplotlib to its "dark_background" style sheet for the figures drawn
# after this cell.
plt.style.use("dark_background")

Download the dataset from Kaggle

In [None]:
import kagglehub

# Fetch the latest version of the dataset; the returned location is treated as
# a directory and the CSV is addressed inside its "loan" sub-folder.
path = Path(kagglehub.dataset_download("ranadeep/credit-risk-dataset")).joinpath(
    "loan", "loan.csv"
)

# Stop here if the expected CSV is not where the download should have put it.
assert path.exists()
assert path.is_file()

print("Downloaded dataset to path:", path)

Load dataset

In [None]:
# Read the downloaded CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(path)

## Preprocessing

Select columns

In [None]:
# Columns kept for modelling; every other column of the raw frame is discarded.
df_select = [
    "loan_status",  # e
    # ...
    "recoveries",
]

# Take an explicit copy of the selection. Later cells add a column to `df` and
# drop one from it; doing that on a frame pandas still tracks as derived from
# the raw data is what triggers SettingWithCopyWarning.
df = df[df_select].copy()

Encode `loan_status` as a binary `loan_target` (1 = good, 0 = bad)

In [None]:
# loan_target:
# 0: bad
# 1: good

# Lower-cased `loan_status` values that count as a good loan.
df_loan_status_good = [
    "current",
    "fully paid",
    "issued",
]


def df_loan_status_map(x):
    """Encode one raw `loan_status` value as the binary `loan_target`.

    Args:
        x: A single `loan_status` cell. Expected to be a string; any other
            value (e.g. NaN or None for a missing status) is treated as bad.

    Returns:
        1 for a good loan, 0 for a bad loan.
    """
    # Missing / non-text statuses have no `.lower()`. Label them as bad instead
    # of raising AttributeError part-way through `Series.map`.
    if not isinstance(x, str):
        return 0

    status = x.lower()

    # Substring match: any status that merely contains "fully paid" counts as
    # good (presumably to catch longer status strings that embed it).
    if "fully paid" in status:
        return 1
    return 1 if status in df_loan_status_good else 0


# `na_action="ignore"` lets missing statuses pass through as NaN instead of
# being handed to the mapper (which expects strings), so the `fillna(0)` is
# what labels them as bad. The final cast keeps the target integer-typed even
# when NaNs had to be filled.
df["loan_target"] = (
    df["loan_status"]
    .map(df_loan_status_map, na_action="ignore")
    .fillna(0)
    .astype("int64")
)

In [None]:
# The raw status is now encoded in `loan_target`; remove it so it is not part
# of the feature matrix built from the remaining columns. Rebinding `df`
# (rather than `inplace=True`) avoids mutating a frame in place across cells.
df = df.drop(columns="loan_status")

## Model

Split data

In [None]:
from sklearn.model_selection import train_test_split

# Define features (every remaining column) and the binary target
X = df.drop(columns=["loan_target"])
y = df["loan_target"]

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# good/bad ratio the same in both splits, so the test metrics are not skewed
# by an unlucky draw of the minority class. The fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99, stratify=y
)

Create and train model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest: 10 trees, each limited to depth 4, with a fixed seed.
# `fit` returns the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=10, max_depth=4, random_state=99).fit(
    X_train, y_train
)

# Class predictions for the held-out rows
clf_y_pred = clf.predict(X_test)

Create a baseline dummy classifier that always predicts the most frequent class

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline estimator configured with the "most_frequent" strategy; it is
# trained on the same split as the random forest so the two are comparable.
dummy_clf = DummyClassifier(random_state=99, strategy="most_frequent").fit(
    X_train, y_train
)

# Baseline predictions for the held-out rows
dummy_y_pred = dummy_clf.predict(X_test)

### Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Generate the classification report for the random forest.
# `labels` pins the row order so "Bad Loans" is always class 0 and
# "Good Loans" class 1, even if one class is missing from the test labels or
# predictions. `zero_division=0` matches the dummy-classifier report and
# avoids undefined-metric warnings when a class is never predicted.
report = classification_report(
    y_test,
    clf_y_pred,
    labels=[0, 1],
    target_names=["Bad Loans", "Good Loans"],
    zero_division=0,
)
print(report)

In [None]:
# Evaluate the dummy classifier.
# `labels` pins the row order so "Bad Loans" is always class 0 and
# "Good Loans" class 1. The baseline only ever predicts one class, so
# `zero_division=0` reports 0 for the other class's undefined precision
# instead of emitting a warning.
dummy_report = classification_report(
    y_test,
    dummy_y_pred,
    labels=[0, 1],
    target_names=["Bad Loans", "Good Loans"],
    zero_division=0,
)
print(dummy_report)