<a href="https://colab.research.google.com/github/boiBASH/Elite-Bank-Project/blob/main/Data_Transformation_and_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install mlflow
!pip install pyngrok
!pip install catboost
!pip install xgboost

In [18]:
import pandas as pd
import numpy as np
import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [4]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI])

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

In [5]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# mlflow will create an experiment if it doesn't exist
mlflow.set_experiment("EliteBank ML")

2025/03/10 02:14:40 INFO mlflow.tracking.fluent: Experiment with name 'EliteBank ML' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/mlruns/1', creation_time=1741572880711, experiment_id='1', last_update_time=1741572880711, lifecycle_stage='active', name='EliteBank ML', tags={}>

In [6]:
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = getpass.getpass()
port=5000
public_url = ngrok.connect(port).public_url
print(f' * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"')

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth
··········
 * ngrok tunnel "https://aced-35-221-183-84.ngrok-free.app" -> "http://127.0.0.1:5000"


In [3]:
df = pd.read_csv("/content/Bank_Marketing_Dataset.csv")

In [8]:
# Select the column types
scale_columns = [
    "age",
    "balance",
    "day",
    "duration"
]

categorical_columns = df.select_dtypes(include = ["object"]).columns.tolist()
categorical_columns.remove("deposit")

In [9]:
# Extract features and labels from dataset
X, y = df.drop(labels = ["deposit"], axis = 1), df["deposit"]

In [10]:
# Encode labels
map_dictionary = {
    "yes": 1,
    "no": 0
}

y = y.apply(lambda x: map_dictionary[x])

In [11]:
# Separate into train and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y)

In [12]:
# Implement data preparation transformer
def get_transformer(categorical_columns, scale_columns, one_hot=False):
    if one_hot:
        transformer = ColumnTransformer(
            transformers=[
                ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
                ("scale", StandardScaler(), scale_columns)
            ],
            remainder="passthrough"
        )
    else:
        transformer = ColumnTransformer(
            transformers=[
                ("ordinal", OrdinalEncoder(), categorical_columns),
                ("scale", StandardScaler(), scale_columns)
            ],
            remainder="passthrough"
        )
    return transformer

In [15]:
# Instantiate model pipelines
log_pipe = Pipeline(
    steps=[
        ("preprocess", get_transformer(categorical_columns, scale_columns, one_hot=True)),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

cat_pipe = Pipeline(
    steps=[
        ("preprocess", get_transformer(categorical_columns, scale_columns, one_hot=False)),
        ("model", CatBoostClassifier(verbose=0, random_seed=42))
    ]
)

extra_pipe = Pipeline(
    steps=[
        ("preprocess", get_transformer(categorical_columns, scale_columns, one_hot=False)),
        ("model", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42))
    ]
)

In [16]:
# Function to train, evaluate, and log models in MLflow
def train_and_log_pipeline(pipeline, model_name):
    with mlflow.start_run(run_name=model_name):

        # Train model
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)
        y_prob = pipeline.predict_proba(X_test)[:, 1]

        # Compute evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)

        # Compute specificity
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else None

        # Log metrics in MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("roc_auc", roc_auc)
        if specificity is not None:
            mlflow.log_metric("specificity", specificity)

        # Log model
        mlflow.sklearn.log_model(pipeline, model_name)

        print(f"✅ Model {model_name} logged successfully in MLflow!")

# Set up MLflow experiment
mlflow.set_experiment("Long-Term Investor Prediction")

2025/03/10 02:22:39 INFO mlflow.tracking.fluent: Experiment with name 'Long-Term Investor Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/mlruns/2', creation_time=1741573359862, experiment_id='2', last_update_time=1741573359862, lifecycle_stage='active', name='Long-Term Investor Prediction', tags={}>

In [19]:
# Train and log all models
train_and_log_pipeline(log_pipe, "Logistic Regression")
train_and_log_pipeline(cat_pipe, "CatBoost")
train_and_log_pipeline(extra_pipe, "XGBoost")



✅ Model Logistic Regression logged successfully in MLflow!


Parameters: { "use_label_encoder" } are not used.



✅ Model CatBoost logged successfully in MLflow!




✅ Model XGBoost logged successfully in MLflow!
