In [35]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_recall_curve, accuracy_score, roc_auc_score, classification_report
from sklearn.utils import resample
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier


In [36]:
df = pd.read_csv('/content/train_lending_club.csv')
df_test = pd.read_csv('/content/test_lending_club.csv')
print("Train shape:", df.shape)
print("Test shape:", df_test.shape)


Train shape: (236846, 27)
Test shape: (95019, 27)


In [37]:
numeric_cols = df.select_dtypes(include=np.number).columns
z_scores = np.abs(stats.zscore(df[numeric_cols]))
df = df[(z_scores < 3).all(axis=1)]
print("Rows after removing outliers:", len(df))


Rows after removing outliers: 212713


In [38]:
for df_ in [df, df_test]:
    df_["fico_score"] = (df_["fico_range_low"] + df_["fico_range_high"]) / 2
    df_.drop(["fico_range_low", "fico_range_high"], axis=1, inplace=True)

keep_cols = [
    'sub_grade','fico_score','annual_inc','mort_acc','initial_list_status',
    'time_to_earliest_cr_line','emp_length','revol_bal','term','home_ownership',
    'revol_util','dti','int_rate','loan_amnt','verification_status'
]
target = 'loan_status'

df = df[keep_cols + [target]]
df_test = df_test[keep_cols + [target]]

categorical_cols = ["sub_grade","term","home_ownership","verification_status","initial_list_status"]


In [39]:
for df_ in [df, df_test]:
    df_['debt_to_income'] = df_['loan_amnt'] / df_['annual_inc']
    df_['available_revol_credit'] = df_['revol_bal'] / (df_['revol_util'] + 1e-6)

drop_cols = ['loan_amnt','annual_inc','revol_bal','revol_util']
X = df.drop(columns=[target]+drop_cols)
y = df[target]
X_test = df_test.drop(columns=[target]+drop_cols)
y_test = df_test[target]


In [40]:
mask = ~y_test.isna()
X_test = X_test[mask]
y_test = y_test[mask]

for df_ in [X, X_test]:
    df_.replace([np.inf, -np.inf], np.nan, inplace=True)


In [42]:
from sklearn.compose import ColumnTransformer

categorical_cols = [c for c in categorical_cols if c in X.columns]
numerical_cols = [c for c in X.columns if c not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)


In [44]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LGBMClassifier(
        n_estimators=200,
        learning_rate=0.05,
        num_leaves=31,
        class_weight={0:1, 1:5},
        random_state=42
    ))
])

pipeline.fit(X, y)


[LightGBM] [Info] Number of positive: 177280, number of negative: 35433
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1323
[LightGBM] [Info] Number of data points in the train set: 212713, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.961562 -> initscore=3.219525
[LightGBM] [Info] Start training from score 3.219525


In [45]:
probs = pipeline.predict_proba(X_test)[:,1]
precision, recall, thresholds = precision_recall_curve(y_test, probs)
f1_scores = 2*(precision*recall)/(precision+recall+1e-9)
best_thresh = thresholds[np.argmax(f1_scores)]
preds = (probs >= best_thresh).astype(int)

print("Optimal Threshold:", best_thresh)
print("Accuracy:", accuracy_score(y_test, preds))
print("AUC:", roc_auc_score(y_test, probs))
print("\nClassification Report:\n", classification_report(y_test, preds))




Optimal Threshold: 0.8290423693475754
Accuracy: 0.8694328415609943
AUC: 0.8363858709477776

Classification Report:
               precision    recall  f1-score   support

         0.0       0.68      0.33      0.44     14748
         1.0       0.89      0.97      0.93     78859

    accuracy                           0.87     93607
   macro avg       0.78      0.65      0.68     93607
weighted avg       0.85      0.87      0.85     93607



In [47]:
import pickle
with open("loan_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)
