<a href="https://colab.research.google.com/github/boiBASH/Sterling-Bank-Data-Science-Assessment./blob/main/Model_Building_Evaluation_with_Dagshub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
%%capture
# Install the DagsHub client and MLflow 2.x (pinned below 3); %%capture silences pip's output.
!pip install -q dagshub 'mlflow>=2,<3'

In [30]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
import mlflow
import dagshub
import getpass
import seaborn as sns
from mlflow.models.signature import infer_signature
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [3]:
# Connect this session to the DagsHub repository with its MLflow integration enabled.
dagshub.init(
    repo_name='Sterling-Bank-Data-Science-Assessment.',
    repo_owner='boiBASH',
    mlflow=True,
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=f7e8c452-2725-4d0a-ac52-b60f5cab9aa7&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b416b72217e363668653d0eb0877f4494f16ffe8a35e6bd4cb5c5a20bb8e03f3




In [4]:
# Load Dataset
# The workbook sits under /content/drive/MyDrive, so Google Drive has to be mounted first.
# NOTE(review): no mount cell is visible in this notebook -- confirm it exists elsewhere.
df = pd.read_excel(
    io='/content/drive/MyDrive/Sterling Bank Assessment/encoded_model_ready_loan_data.xlsx'
)
df

Unnamed: 0,sector,PRODUCT_GROUP_NAME,FACILITY_TYPE,Contract_Amount,CURR_BAL,EQUIV_BALANCE,CONTRACT_MAT_DATE,report_date,PayinAccount_Last_LOD_Date,INTEREST_RATE,...,Default_status,previous_loans_count,previous_loans_amount,running_loans_count,running_loans_amount,previous_loan_default_count,employment_status,age,loan_age_days,customer_tenure_days
0,1,1,3,6.252801e+05,1.217761e+05,1.217761e+05,2020-06-24,2023-05-05,2020-02-07,0,...,1,4,1.165280e+06,0,0.00,1,0,33.0,2591,2101
1,1,1,3,6.252801e+05,1.217761e+05,1.217761e+05,2020-06-24,2023-05-05,2020-02-07,0,...,1,4,1.165280e+06,0,0.00,1,0,33.0,2591,3516
2,1,0,1,1.801259e+04,4.200000e+00,4.200000e+00,2017-08-26,2021-05-31,2019-06-28,0,...,1,2,3.669140e+04,0,0.00,2,4,,3048,3265
3,1,0,1,5.176228e+06,1.975372e+06,1.975372e+06,2018-02-28,2021-05-31,2017-07-19,0,...,1,1,5.176228e+06,0,0.00,1,4,,3042,3327
4,1,0,1,5.176228e+06,1.975372e+06,1.975372e+06,2018-02-28,2021-05-31,2017-07-19,0,...,1,1,5.176228e+06,0,0.00,1,4,,3042,3043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1575,3,0,1,2.452215e+06,2.452215e+06,2.452215e+06,2023-06-01,2023-06-27,2023-06-02,52,...,1,1,2.452215e+06,1,2786392.55,1,0,51.0,1349,1447
1576,3,0,1,2.786393e+06,2.786393e+06,2.786393e+06,2023-09-28,2023-08-23,2023-06-30,52,...,0,1,2.452215e+06,1,2786392.55,1,0,51.0,764,4096
1577,3,0,1,2.786393e+06,2.786393e+06,2.786393e+06,2023-09-28,2023-08-23,2023-06-30,52,...,0,1,2.452215e+06,1,2786392.55,1,0,51.0,764,1447
1578,1,2,2,1.000000e+07,9.745611e+06,9.745611e+06,2023-07-28,2023-08-23,2023-08-21,44,...,1,1,1.000000e+07,0,0.00,1,0,49.0,3372,3372


# **Define train and log function**

In [34]:
def train_and_log_pipeline(pipeline, model_name, model_step="clf", model_type="tree"):
    """Fit ``pipeline``, score it on the hold-out split and log the run to MLflow.

    The function reads the notebook-level globals ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``, so the data-preparation cell must have been
    executed before it is called.

    Parameters
    ----------
    pipeline
        Unfitted estimator pipeline exposing ``fit``, ``predict``,
        ``predict_proba`` and ``named_steps``.
    model_name : str
        Used as the MLflow run name, the model artifact path and the prefix
        of the feature-importance CSV/PNG files written to the working directory.
    model_step : str, default "clf"
        Key in ``pipeline.named_steps`` holding the final estimator.
    model_type : str, default "tree"
        ``"tree"`` reads ``feature_importances_`` from the estimator; any other
        value falls back to permutation importance (F1 scoring) on the test split.

    Returns
    -------
    None
        Everything is reported through MLflow, local artifact files and prints.
    """
    with mlflow.start_run(run_name=model_name):
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_prob = pipeline.predict_proba(X_test)[:, 1]

        # Metrics (binary target encoded as 0/1, positive class = 1)
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1_score": f1_score(y_test, y_pred, zero_division=0),
            "roc_auc": roc_auc_score(y_test, y_prob),
        }
        # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack cannot fail
        # when one class is absent from both y_test and y_pred.
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
        if (tn + fp) > 0:
            metrics["specificity"] = tn / (tn + fp)

        # The logged artifact is the *whole* pipeline, which consumes the raw
        # feature frame. Infer the signature from that raw input rather than from
        # the imputed/scaled array, which only the final estimator ever sees.
        # This also removes the hard requirement on "imputer"/"scaler" step names.
        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(pipeline, model_name, signature=signature)

        # Log metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Feature importance
        model = pipeline.named_steps[model_step]
        # Take names from the frame that was actually scored instead of the global X.
        feature_names = X_test.columns.tolist()

        if model_type == "tree":
            importances = model.feature_importances_
            title = "Random Forest Feature Importances"
        else:  # logistic regression
            # Permute raw inputs through the full pipeline so preprocessing is included.
            r = permutation_importance(
                pipeline, X_test, y_test,
                n_repeats=10, random_state=42,
                scoring="f1"
            )
            importances = r.importances_mean
            title = "Logistic Regression Permutation Importances"

        fi_df = (
            pd.DataFrame({"feature": feature_names, "importance": importances})
            .sort_values("importance", ascending=False)
            .reset_index(drop=True)
        )

        # Log CSV
        fi_csv = f"{model_name}_feature_importances.csv"
        fi_df.to_csv(fi_csv, index=False)
        mlflow.log_artifact(fi_csv)

        # Plot top 20 (reversed so the most important feature is drawn at the top)
        top_features = fi_df.head(20).iloc[::-1]
        plot_path = f"{model_name}_feature_importance_plot.png"
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            ax.barh(top_features["feature"], top_features["importance"])
            ax.set_title(title)
            ax.set_xlabel("Importance")
            fig.tight_layout()
            fig.savefig(plot_path, bbox_inches="tight")
        finally:
            # Always release the figure, even if drawing or saving raised.
            plt.close(fig)
        mlflow.log_artifact(plot_path)

        print(f"✅ Logged feature importance for {model_name}")
        print(f"✅ Model {model_name} logged in MLflow")

# **Data Preparation for Training**

In [35]:
# Columns treated as leakage (per the variable name) and kept out of the feature matrix.
leak_cols = [
    'DAYS_TO_MATURITY',
    'CONTRACT_MAT_DATE',
    'report_date',
    'PayinAccount_Last_LOD_Date',
]
# Only drop the ones that are actually present, so a missing column cannot raise.
leak_cols_in_df = [name for name in leak_cols if name in df.columns]

# Target is Default_status; features are every remaining non-leak column.
y = df['Default_status']
X = df.drop(columns=['Default_status', *leak_cols_in_df])

# Cast integer columns to float64 for MLflow schema safety
int_cols = X.select_dtypes(include=['int']).columns.tolist()
if int_cols:
    X[int_cols] = X[int_cols].astype('float64')

# Train/test split: 25% hold-out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)


# **Pipelines**

In [36]:
def _smote_pipeline(classifier):
    """Return a fresh imputer -> scaler -> SMOTE -> ``classifier`` imblearn pipeline."""
    return ImbPipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('clf', classifier),
    ])


# Random Forest Pipeline
rf_pipeline = _smote_pipeline(
    RandomForestClassifier(class_weight='balanced', random_state=42)
)

# Logistic Regression Pipeline
lr_pipeline = _smote_pipeline(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
)

# **Train and log model**

In [37]:
# Logistic-regression run; a non-"tree" model_type selects permutation importance.
train_and_log_pipeline(lr_pipeline, "LogisticRegression_SMOTE", model_step="clf", model_type="linear")

✅ Logged feature importance for LogisticRegression_SMOTE
✅ Model LogisticRegression_SMOTE logged in MLflow
🏃 View run LogisticRegression_SMOTE at: https://dagshub.com/boiBASH/Sterling-Bank-Data-Science-Assessment..mlflow/#/experiments/0/runs/ae022966fcfe4effbbd4c9568d658f27
🧪 View experiment at: https://dagshub.com/boiBASH/Sterling-Bank-Data-Science-Assessment..mlflow/#/experiments/0


In [40]:
# Random-forest run; "tree" reads the fitted estimator's feature_importances_.
# NOTE(review): the run name says "Optimized", but no hyperparameter search is
# visible in this notebook -- confirm the name is intended.
train_and_log_pipeline(rf_pipeline, "RandomForest_SMOTE_Optimized", model_step="clf", model_type="tree")

✅ Logged feature importance for RandomForest_SMOTE_Optimized
✅ Model RandomForest_SMOTE_Optimized logged in MLflow
🏃 View run RandomForest_SMOTE_Optimized at: https://dagshub.com/boiBASH/Sterling-Bank-Data-Science-Assessment..mlflow/#/experiments/0/runs/ce0b554b8f2547b29d5e26427eaeb827
🧪 View experiment at: https://dagshub.com/boiBASH/Sterling-Bank-Data-Science-Assessment..mlflow/#/experiments/0
