<a href="https://colab.research.google.com/github/c90x/loan_prediction/blob/main/loan_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loan Dataset - Model

## Dependencies

In [1]:
from pathlib import Path

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

Configure theme

In [3]:
# Name of the built-in matplotlib style sheet used for the figures in this notebook
PLOT_STYLE = "dark_background"
plt.style.use(PLOT_STYLE)

**Download** dataset from Kaggle

In [4]:
import kagglehub

# Fetch the latest version of the dataset; dataset_download returns its location on disk
dataset_dir = Path(kagglehub.dataset_download("ranadeep/credit-risk-dataset"))

# The CSV sits in a "loan" sub-folder of the downloaded dataset
path = dataset_dir / "loan" / "loan.csv"

# is_file() is False for a missing path, so it covers the existence check as well
assert path.is_file()

print("Downloaded dataset to path:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloaded dataset to path: /home/cube/.cache/kagglehub/datasets/ranadeep/credit-risk-dataset/versions/3/loan/loan.csv


Load dataset

In [5]:
# Read the whole file in one pass (low_memory=False) so each column's dtype is
# inferred from all rows instead of chunk by chunk. Chunked inference is what
# makes pandas emit a mixed-type DtypeWarning; this costs more memory on load.
# NOTE(review): the recorded output of this cell looks like the tail of such a
# warning. If memory becomes a problem, pass explicit `dtype=`/`usecols=` instead.
df = pd.read_csv(path, low_memory=False)

  df = pd.read_csv(path)


## Preprocessing

Select columns

In [6]:
# Columns kept for modelling; everything else in the raw frame is discarded
df_select = [
    "loan_status",  # raw status text, encoded into the binary target further down
    # ...
    # NOTE(review): `recoveries` is presumably only known after a loan has gone
    # bad -- confirm it is not leaking the target into the features.
    "recoveries",
]

# Label-based selection of exactly the columns listed above
df = df.loc[:, df_select]

Encode `loan_status` as a binary target column, `loan_target` (1 = good loan, 0 = bad loan)

In [7]:
# loan_target:
# 0: bad
# 1: good

# Lower-cased status values that count as a good loan
df_loan_status_good = [
    "current",
    "fully paid",
    "issued",
]


def df_loan_status_map(x):
    """Map a raw ``loan_status`` value to the binary loan target.

    Args:
        x: Raw status value. Expected to be a string; any non-string value
            (for example ``None`` or a float ``NaN`` from a missing cell)
            is treated as a bad loan instead of raising.

    Returns:
        int: ``1`` (good) when the status contains "fully paid" or matches an
        entry of ``df_loan_status_good``; ``0`` (bad) otherwise. Matching is
        case-insensitive and ignores surrounding whitespace.
    """
    # Missing values arrive as float NaN / None and have no .lower()
    if not isinstance(x, str):
        return 0

    status = x.strip().lower()

    # Substring match so that prefixed variants such as
    # "Does not meet the credit policy. Status:Fully Paid" also count as good
    if "fully paid" in status:
        return 1
    return 1 if status in df_loan_status_good else 0


# na_action="ignore" stops map() from calling the mapper on missing statuses,
# so they propagate as NaN, reach fillna(0) and are labelled bad (0).
# The final cast keeps the target as integer labels even when NaNs were filled.
df["loan_target"] = (
    df["loan_status"]
    .map(df_loan_status_map, na_action="ignore")
    .fillna(0)
    .astype("int64")
)

In [8]:
# Remove the raw status text so that only the encoded target remains.
# Reassigning instead of inplace=True avoids mutating the existing frame.
df = df.drop(columns=["loan_status"])

## Model

Split data

In [9]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors
y = df["loan_target"]
X = df.drop(columns=["loan_target"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=99,
    test_size=0.2,
)

Create and train model

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest (10 trees, depth <= 4) with a fixed seed;
# fit() returns the estimator itself, so construction and training are chained
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    random_state=99,
).fit(X_train, y_train)

# Predictions on the held-out test rows
clf_y_pred = clf.predict(X_test)

Create a dummy classifier that always predicts the most frequent class, as a baseline for comparison

In [11]:
from sklearn.dummy import DummyClassifier

# Baseline that always answers with the majority class of the training labels;
# fit() returns the estimator itself, so construction and training are chained
dummy_clf = DummyClassifier(
    random_state=99,
    strategy="most_frequent",
).fit(X_train, y_train)

# Baseline predictions on the held-out test rows
dummy_y_pred = dummy_clf.predict(X_test)

### Classification Report

In [12]:
from sklearn.metrics import classification_report

In [13]:
# Generate the classification report
# Per-class precision / recall / F1 of the random forest on the test set.
# target_names are listed in label order: 0 -> "Bad Loans", 1 -> "Good Loans".
report = classification_report(
    y_true=y_test,
    y_pred=clf_y_pred,
    target_names=["Bad Loans", "Good Loans"],
)
print(report)

              precision    recall  f1-score   support

   Bad Loans       1.00      0.36      0.53     13518
  Good Loans       0.95      1.00      0.97    163958

    accuracy                           0.95    177476
   macro avg       0.98      0.68      0.75    177476
weighted avg       0.95      0.95      0.94    177476



In [14]:
# Evaluate the dummy classifier
# Same report for the majority-class baseline. zero_division=0 reports 0 for
# metrics of a class the baseline never predicts, rather than warning about them.
dummy_report = classification_report(
    y_true=y_test,
    y_pred=dummy_y_pred,
    target_names=["Bad Loans", "Good Loans"],
    zero_division=0,
)
print(dummy_report)

              precision    recall  f1-score   support

   Bad Loans       0.00      0.00      0.00     13518
  Good Loans       0.92      1.00      0.96    163958

    accuracy                           0.92    177476
   macro avg       0.46      0.50      0.48    177476
weighted avg       0.85      0.92      0.89    177476

