In [3]:
# !pip -q install torch torchvision torchaudio scikit-learn==1.5.2 joblib==1.4.2

In [4]:
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import roc_auc_score, f1_score
import joblib, numpy as np

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [5]:
import pandas as pd

# Paths
ACC_PATH = "/kaggle/input/lending-club-accepted-2007-to-2018/accepted_2007_to_2018Q4.csv"
usecols_accepted = [
    "loan_amnt","term","int_rate","installment","grade","sub_grade",
    "emp_length","home_ownership","annual_inc","verification_status",
    "issue_d","purpose","addr_state","dti","delinq_2yrs","fico_range_low","fico_range_high",
    "inq_last_6mths","open_acc","pub_rec","revol_bal","revol_util",
    "total_acc","application_type","loan_status"
]

df = pd.read_csv(ACC_PATH, usecols=usecols_accepted, low_memory=False, nrows=500_000)

# Target mapping
default_labels = ["Charged Off","Default","Late (31-120 days)","Does not meet the credit policy. Status:Charged Off"]
paid_labels = ["Fully Paid","Does not meet the credit policy. Status:Fully Paid"]
df = df[df["loan_status"].isin(default_labels + paid_labels)].copy()
df["target"] = np.where(df["loan_status"].isin(default_labels), 1, 0)

# Derived features
df["fico_mid"] = (df["fico_range_low"] + df["fico_range_high"]) / 2
df["issue_d"] = pd.to_datetime(df["issue_d"], format="%b-%Y")
df["issue_year"] = df["issue_d"].dt.year

selected_cols = [
    "loan_amnt","int_rate","installment","annual_inc","dti","fico_mid",
    "delinq_2yrs","inq_last_6mths","open_acc","pub_rec","revol_bal","revol_util","total_acc",
    "term","grade","emp_length","home_ownership","verification_status","purpose","application_type","addr_state","issue_year"
]

X = df[selected_cols]
y = df["target"]


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

preprocessor = joblib.load("/kaggle/input/processed-object-from-task1-lending-club/pytorch/default/1/preprocessor.joblib")
X_train_prep = preprocessor.transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("Train shape:", X_train_prep.shape)


Train shape: (315316, 107)


In [7]:
class LoanMLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)


In [8]:
X_train_t = torch.tensor(X_train_prep, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).view(-1,1)
X_test_t  = torch.tensor(X_test_prep,  dtype=torch.float32)
y_test_t  = torch.tensor(y_test.values,  dtype=torch.float32).view(-1,1)

train_ds = TensorDataset(X_train_t, y_train_t)
test_ds  = TensorDataset(X_test_t, y_test_t)

train_dl = DataLoader(train_ds, batch_size=512, shuffle=True)
test_dl  = DataLoader(test_ds, batch_size=1024, shuffle=False)


In [9]:
pos_weight = torch.tensor([(len(y_train) - y_train.sum()) / y_train.sum()], dtype=torch.float32)
model = LoanMLP(in_dim=X_train_prep.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(train_dl):.4f}")


Epoch 1/10 - Loss: 0.4567
Epoch 2/10 - Loss: 0.4507
Epoch 3/10 - Loss: 0.4496
Epoch 4/10 - Loss: 0.4485
Epoch 5/10 - Loss: 0.4476
Epoch 6/10 - Loss: 0.4470
Epoch 7/10 - Loss: 0.4464
Epoch 8/10 - Loss: 0.4462
Epoch 9/10 - Loss: 0.4455
Epoch 10/10 - Loss: 0.4453


In [10]:
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test_t.to(device)).cpu().numpy().ravel()
    y_pred = (y_pred_proba >= 0.5).astype(int)

auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)

print(f"AUC: {auc:.4f},  F1: {f1:.4f}")


AUC: 0.7410,  F1: 0.2277


In [13]:
# import numpy as np
# from sklearn.metrics import f1_score

# thresholds = np.linspace(0.1, 0.9, 9)
# for t in thresholds:
#     preds = (y_pred_proba >= t).astype(int)
#     f1 = f1_score(y_test, preds)
#     print(f"Threshold {t:.1f}: F1 = {f1:.3f}")


In [12]:
torch.save(model.state_dict(), "/kaggle/working/dl_best.pt")
print("Model saved")

Model saved
