In [14]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb


In [15]:
# Cell 2: Load Dataset (fixed)
data_path = "train.csv"   # direct file path

df = pd.read_csv(data_path)

print(df.head())
print(df.shape)
print(df.info())
print(df.isnull().sum())


   id  annual_income  debt_to_income_ratio  credit_score  loan_amount  \
0   0       29367.99                 0.084           736      2528.42   
1   1       22108.02                 0.166           636      4593.10   
2   2       49566.20                 0.097           694     17005.15   
3   3       46858.25                 0.065           533      4682.48   
4   4       25496.70                 0.053           665     12184.43   

   interest_rate  gender marital_status education_level employment_status  \
0          13.67  Female         Single     High School     Self-employed   
1          12.92    Male        Married        Master's          Employed   
2           9.76    Male         Single     High School          Employed   
3          16.10  Female         Single     High School          Employed   
4          10.21    Male        Married     High School          Employed   

         loan_purpose grade_subgrade  loan_paid_back  
0               Other             C3       

In [16]:
# Cell 3: Preprocessing
if "loan_paid_back" not in df.columns:
    raise ValueError("Target column 'loan_paid_back' not found.")

X = df.drop("loan_paid_back", axis=1)
y = df["loan_paid_back"]

X = pd.get_dummies(X, drop_first=True)

numeric_cols = df.select_dtypes(include=[np.number]).columns
df_clean = df.copy()
for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

if 'id' in df_clean.columns:
    df_clean = df_clean.drop('id', axis=1)

categorical_cols = df_clean.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    df_clean[col] = le.fit_transform(df_clean[col])

X = df_clean.drop('loan_paid_back', axis=1)
y = df_clean['loan_paid_back']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


print("Class distribution in full dataset:")
print(y.value_counts())

print("Class distribution in training set:")
print(y_train.value_counts())

print("Class distribution in test set:")
print(y_test.value_counts())


Class distribution in full dataset:
loan_paid_back
1.0    444583
Name: count, dtype: int64
Class distribution in training set:
loan_paid_back
1.0    355666
Name: count, dtype: int64
Class distribution in test set:
loan_paid_back
1.0    88917
Name: count, dtype: int64


In [17]:
print("Full dataset class distribution:")
print(y.value_counts())

print("Training set class distribution:")
print(y_train.value_counts())

print("Test set class distribution:")
print(y_test.value_counts())


Full dataset class distribution:
loan_paid_back
1.0    444583
Name: count, dtype: int64
Training set class distribution:
loan_paid_back
1.0    355666
Name: count, dtype: int64
Test set class distribution:
loan_paid_back
1.0    88917
Name: count, dtype: int64


In [18]:
# Cell 4: Model Ensemble Development
models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

param_grid = {
    "LogisticRegression": {"C": [0.1, 1, 10]},
    "RandomForest": {"n_estimators": [100, 200], "max_depth": [5, 10]},
    "GradientBoosting": {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1]},
    "XGBoost": {"n_estimators": [100, 200], "learning_rate": [0.05, 0.1]}
}

best_models = {}
for name, model in models.items():
    grid = GridSearchCV(model, param_grid[name], cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    best_models[name] = grid.best_estimator_
    print(f"{name} Best Params: {grid.best_params_}")
    print(f"{name} Best CV Score: {grid.best_score_:.4f}")


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\USER\aline205rp20978\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\USER\aline205rp20978\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\USER\aline205rp20978\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1335, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.float64(1.0)


In [None]:
# Cell 5: Cross-Validation and Best Model Selection
cv_results = {}
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in best_models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=kf, scoring="accuracy")
    cv_results[name] = scores.mean()
    print(f"{name} CV Accuracy: {scores.mean():.4f}")

best_model_name = max(cv_results, key=cv_results.get)
best_model = best_models[best_model_name]
print(f"Best Model: {best_model_name}")


In [None]:
# Cell 6: Model Persistence
joblib.dump(best_model, "best_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(list(df_clean.columns), "columns.pkl")
joblib.dump((X_train_scaled, X_test_scaled, y_train, y_test), "scaled_dataset.pkl")


In [None]:
# Cell 7: Baseline Predictions
y_pred = best_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

baseline_df = pd.DataFrame({
    "Actual": y_test.values,
    "Predicted": y_pred
})
print(baseline_df.head(20))
