In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib


In [None]:
# Load the training data. The path is relative to the notebook's working directory.
train_csv_path = "dataset/train.csv"
dataset = pd.read_csv(train_csv_path)

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print("Dataset shape:", dataset.shape)
dataset.head()


In [None]:
# Number of missing values in each column.
missing_counts = dataset.isnull().sum()
print(missing_counts)

# Class balance of the target; normalize=True reports fractions instead of raw counts.
print("\nTarget distribution:")
target_share = dataset['loan_paid_back'].value_counts(normalize=True)
print(target_share)


In [None]:
# Drop the `id` column when it exists. errors='ignore' turns this into a no-op
# if the column is already gone, so the cell can be re-run safely.
dataset = dataset.drop('id', axis=1, errors='ignore')


In [None]:
TARGET = 'loan_paid_back'

# Separate the feature matrix from the label column.
X = dataset.drop(columns=[TARGET])
y = dataset[TARGET]

# Columns stored as int64/float64 are treated as numeric; every other column is
# treated as categorical and integer-encoded below.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = [c for c in X.columns if c not in num_cols]

# Fit ONE encoder per categorical column and keep it in `label_encoders`.
# Re-fitting a single shared LabelEncoder on each column overwrites its classes_
# every iteration, so only the last column's mapping would survive and the
# encoding could not be re-applied to new data or inverted afterwards.
label_encoders = {}
le = LabelEncoder()  # keeps the name `le` defined even when there are no categorical columns
for col in cat_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting so each column is encoded from a single type.
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print("Categorical features encoded:", len(cat_cols))


In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the (rows, columns) shape of each part.
for label, part in (("Train:", X_train), ("Test :", X_test)):
    print(label, part.shape)


In [None]:
scaler = StandardScaler()

# Learn the scaling statistics from the training split only, then apply the same
# transformation to the test split.
# The scaler returns plain arrays, so the frames are rebuilt with BOTH the
# original column names and the original row index. Without `index=` the frames
# would get a fresh RangeIndex and no longer line up with y_train / y_test in
# any index-aligned pandas operation (concat, join, boolean masks, ...).
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


In [None]:

from xgboost import XGBClassifier

# Candidate estimators, keyed by display name. The same keys are used to look up
# each model's hyper-parameter grid in `param_grids`.
# RandomForestClassifier and GradientBoostingClassifier are already imported in
# the notebook's first cell, so they are not re-imported here.
models = {
    # `use_label_encoder` is intentionally not passed: the option is deprecated
    # in XGBoost and recent releases no longer use it.
    "XGBoost": XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.1,
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}



In [None]:
# Hyper-parameter search space for each candidate model. The outer keys must
# match the keys of `models`; each inner mapping lists the values to try for
# one estimator parameter.
param_grids = {
    "XGBoost": dict(
        n_estimators=[500, 1000],
        max_depth=[3, 6, 8],
        learning_rate=[0.01, 0.05, 0.1],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        reg_lambda=[1, 3, 5],    # L2 regularization
        reg_alpha=[0, 0.5, 1],   # L1 regularization
    ),
    "Random Forest": dict(
        n_estimators=[100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
    ),
    "Gradient Boosting": dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 5],
    ),
}


In [None]:
# Outputs of the search, filled in by the loop below:
#   best_models : model name -> estimator refitted with the best parameters
#   results     : list of (model name, hold-out test accuracy) tuples
#   cv_scores   : model name -> mean cross-validated accuracy of the best grid point
best_models = {}
results = []
cv_scores = {}

# One shared, seeded 5-fold stratified splitter so every model is tuned on the
# same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Exhaustive search over this model's grid, scored by accuracy; n_jobs=-1
    # runs the fits in parallel on all available cores.
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=cv,
        scoring='accuracy',
        n_jobs=-1
    )
    grid.fit(X_train_scaled, y_train)

    best_models[name] = grid.best_estimator_
    # Keep the cross-validated score as well: it is computed on training folds
    # only, so models can be compared without consulting the hold-out set.
    cv_scores[name] = grid.best_score_

    print("Best Parameters:", grid.best_params_)
    print("Best CV Accuracy:", grid.best_score_)

    # Score the refitted best estimator on the hold-out split.
    y_pred = grid.best_estimator_.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    results.append((name, acc))

    print("Test Accuracy:", acc)


In [None]:
# Tabulate the (model, test accuracy) pairs and display them best-first.
results_df = pd.DataFrame(data=results, columns=['Model', 'Accuracy'])
results_df.sort_values('Accuracy', ascending=False)


In [None]:
# Rank the models by hold-out accuracy and take the top row as the winner.
# NOTE(review): the winner is chosen using the test split, so its test accuracy
# is an optimistic estimate of performance on genuinely new data.
ranked_results = results_df.sort_values(by='Accuracy', ascending=False)
best_model_name = ranked_results.iloc[0]['Model']
best_model = best_models[best_model_name]

print("BEST MODEL:", best_model_name)


In [None]:
# Predict once per split and reuse the result; the test-set predictions feed
# both the accuracy figure and the classification report.
train_pred = best_model.predict(X_train_scaled)
test_pred = best_model.predict(X_test_scaled)

# A large gap between the two accuracies is a sign of over-fitting.
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy : {test_acc:.4f}")

# Per-class precision / recall / F1 on the hold-out split.
print("\nClassification Report:")
print(classification_report(y_test, test_pred))


In [None]:
# Objects to persist, paired with their output file names (written to the
# current working directory, in this order).
artifacts = [
    (best_model, "best_loan_model.pkl"),          # selected, refitted estimator
    (scaler, "scaler.pkl"),                       # fitted StandardScaler
    (X.columns.tolist(), "feature_columns.pkl"),  # feature names, in training order
    (X_train_scaled, "scaled_dataset.pkl"),       # scaled training features
]
for obj, filename in artifacts:
    joblib.dump(obj, filename)

print("All artifacts saved successfully!")


In [None]:
# Example: take 5 unseen samples from test set
new_samples = X_test.iloc[:5]

# scaler.transform returns a bare array. Wrap it back into a DataFrame with the
# training column names so the model, which was fitted on a named DataFrame,
# receives matching feature names instead of an anonymous array.
new_samples_scaled = pd.DataFrame(
    scaler.transform(new_samples),
    columns=new_samples.columns,
    index=new_samples.index,
)

# Hard class labels, plus the predicted probability of the positive class
# (column 1 of predict_proba).
predictions = best_model.predict(new_samples_scaled)
probabilities = best_model.predict_proba(new_samples_scaled)[:, 1]

# Keep the samples' original row labels so each prediction can be traced back
# to its source row.
baseline_df = pd.DataFrame(
    {
        "Prediction": predictions,
        "Probability": probabilities
    },
    index=new_samples.index,
)

baseline_df
