# Model Training - Evaluation

## Imports

In [31]:
# Core
import pandas as pd

# Model & Preprocessing
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

# Metrics
from sklearn.metrics import accuracy_score

# Paths
from src.__00__paths import curated_data_dir, model_dir

## Load Curated Data

In [32]:
# Locate the curated train/test splits inside the curated-data directory.
train_csv = curated_data_dir / "train.csv"
test_csv = curated_data_dir / "test.csv"

# Read both splits into DataFrames.
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1.475067,0,0,1.012245,1.633052,-0.858444,-0.185233,-0.257212,2.034167,0.084996,1.145702,1
1,-1.473548,0,1,1.617979,2.086476,0.192617,0.638384,2.510795,1.213832,2.523836,0.868762,1
2,-0.883825,1,0,-1.517587,-1.352913,-0.508091,0.475981,-0.810813,-0.882579,-1.343153,-1.315984,1
3,-0.294102,1,1,-0.056698,-0.224882,1.243677,0.89359,0.111856,-0.381263,0.150911,0.684136,1
4,1.475067,1,1,-0.733695,-0.446064,0.192617,-0.921848,0.157989,-0.56356,-0.85978,-0.91596,0


In [33]:
# Separate the label column from the feature columns for each split.
target_col = "loan_status"

y_train = train_df[target_col]
x_train = train_df.drop(columns=[target_col])

y_test = test_df[target_col]
x_test = test_df.drop(columns=[target_col])

## Model Configuration

In [34]:
# Candidate estimators and the hyper-parameter grids searched for each one.
# Grid keys are prefixed with the pipeline step they configure: "clf__" for
# the classifier and "smote__" for the SMOTE over-sampler.
models = {
    "LogisticRegression": {
        # n_jobs is intentionally not set: parallelism is handled by
        # GridSearchCV(n_jobs=-1) in the training cell.
        # random_state only matters for the liblinear solver (data shuffling);
        # it is fixed so repeated runs give the same fit.
        "clf": LogisticRegression(max_iter=1000, random_state=42),
        "params": {
            "clf__C": [0.1, 1, 10],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "liblinear"],
            "smote__k_neighbors": [3, 5]
        }
    },
    "SVM": {
        # probability=True runs an internal cross-validated calibration that
        # shuffles the data; random_state makes that step reproducible.
        "clf": SVC(probability=True, random_state=42),
        # A list of grids is used so that gamma is only searched for the RBF
        # kernel. The linear kernel ignores gamma, so crossing the two would
        # fit every linear candidate twice with identical results.
        "params": [
            {
                "clf__C": [0.5, 1, 5],
                "clf__kernel": ["rbf"],
                "clf__gamma": ["scale", "auto"],
                "smote__k_neighbors": [3, 5]
            },
            {
                "clf__C": [0.5, 1, 5],
                "clf__kernel": ["linear"],
                "smote__k_neighbors": [3, 5]
            }
        ]
    }
}


## Cross Validation & Training

In [35]:
RANDOM_STATE = 42

# One stratified, shuffled 5-fold splitter reused for every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Maps model name -> fitted GridSearchCV object.
results = {}

for name, config in models.items():
    print(f"\n===== Training {name} =====")

    clf = config["clf"]

    # GridSearchCV below already runs with n_jobs=-1, so any estimator-level
    # n_jobs is reset. Checking get_params() instead of the model name applies
    # this to every estimator that exposes the parameter and skips those that
    # do not.
    if "n_jobs" in clf.get_params(deep=False):
        clf.set_params(n_jobs=None)

    # SMOTE is a pipeline step, so it is re-fitted together with the
    # classifier on each training fold of the grid search.
    pipe = ImbPipeline(steps=[
        ("smote", SMOTE(random_state=RANDOM_STATE)),
        ("clf", clf)
    ])

    # Exhaustive search over the model's grid, ranked by F1; refit=True
    # retrains the best configuration on the full training set.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=config["params"],
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        refit=True
    )

    gs.fit(x_train, y_train)
    results[name] = gs

    print("Best Params:", gs.best_params_)
    print("Best CV F1:", gs.best_score_)


===== Training LogisticRegression =====
Best Params: {'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'smote__k_neighbors': 5}
Best CV F1: 0.9402408667552248

===== Training SVM =====
Best Params: {'clf__C': 5, 'clf__gamma': 'auto', 'clf__kernel': 'rbf', 'smote__k_neighbors': 3}
Best CV F1: 0.9557809241009922


## Show Accuracy

In [36]:
# Report held-out accuracy for the refitted best pipeline of each model.
for model_name, search in results.items():
    test_accuracy = accuracy_score(y_test, search.predict(x_test))
    print(f"{model_name} Test Accuracy: {test_accuracy * 100:.2f}%")

LogisticRegression Test Accuracy: 91.57%
SVM Test Accuracy: 93.68%


## Save Model

In [37]:
# Persist the refitted best pipeline (SMOTE + classifier) of every model.

# Make sure the output directory exists so joblib.dump does not fail on a
# fresh checkout. NOTE(review): assumes model_dir is a pathlib.Path (it is
# already used with `/` and `.parts` below) -- confirm in src.__00__paths.
model_dir.mkdir(parents=True, exist_ok=True)

for name, gs in results.items():
    model_path = model_dir / f"{name}_model.joblib"
    joblib.dump(gs.best_estimator_, model_path)
    # Show only the last three path components to keep the log short.
    print(f"✔️ Saved {name} model to: {'/'.join(model_path.parts[-3:])}")

✔️ Saved LogisticRegression model to: outputs/models/LogisticRegression_model.joblib
✔️ Saved SVM model to: outputs/models/SVM_model.joblib
