In [1]:
import sys
sys.path.append('../')

In [2]:
# 📦 Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from src.train import train_model, evaluate_model, save_model
from src.utils import load_data

In [3]:
df = load_data('../data/processed/processed_data.csv')
df.head()

Data loaded successfully from ../data/processed/processed_data.csv


Unnamed: 0,Amount,Value,Frequency,AvgAmount,AmountStdDev,Recency,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4,is_high_risk
0,0.643134,-0.03126,0.115536,-0.035832,-0.406095,-1.182989,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,-0.80955,-1.832223,0.115536,-0.035832,-0.406095,-1.182989,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,0.542114,-0.353843,-2.147,-0.189247,-5.996952,1.91118,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
3,1.080223,1.40465,-0.573815,0.433114,0.799973,-0.142576,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,-1.309389,-0.236095,-0.573815,0.433114,0.799973,-0.142576,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [4]:
# 🎯 Prepare features and target
X = df.drop(columns=["is_high_risk"])
y = df["is_high_risk"]

In [5]:
# ✂️ Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

In [6]:
from mlflow.tracking import MlflowClient

models = {
    "Logistic Regression": "logistic",
    "Random Forest": "random_forest",
    "Gradient Boosting": "xgboost"
}

results = []
returned_models = []

best_f1 = -1
best_model_info = {}

for display_name, internal_name in models.items():
    print(f"Training {display_name}...")

    model, run_id = train_model(X_train, y_train, model_name=internal_name)
    metrics = evaluate_model(model, X_test, y_test)

    f1 = metrics["f1_score"]

    results.append({
        "Model": display_name,
        "Accuracy": round(metrics["accuracy"], 3),
        "Precision": round(metrics["precision"], 3),
        "Recall": round(metrics["recall"], 3),
        "F1 Score": round(f1, 3),
        "ROC AUC": round(metrics["roc_auc"], 3)
    })

    returned_models.append(model)

    # Track the best model
    if f1 > best_f1:
        best_f1 = f1
        best_model_info = {
            "display_name": display_name,
            "internal_name": internal_name,
            "run_id": run_id
        }

Training Logistic Regression...




Training Random Forest...




Training Gradient Boosting...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [8]:
import mlflow
# ✅ Register the best model
print(f"\nRegistering best model: {best_model_info['display_name']} (F1 Score: {round(best_f1, 3)})")

client = MlflowClient()
model_uri = f"runs:/{best_model_info['run_id']}/model"
registered_model_name = f"CreditRisk_{best_model_info['internal_name']}_model"

result = mlflow.register_model(model_uri, registered_model_name)

client.transition_model_version_stage(
    name=registered_model_name,
    version=result.version,
    stage="Production",
    archive_existing_versions=True
)

Successfully registered model 'CreditRisk_xgboost_model'.



Registering best model: Gradient Boosting (F1 Score: 0.999)


Created version '1' of model 'CreditRisk_xgboost_model'.
  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1751379712536, current_stage='Production', deployment_job_state=None, description=None, last_updated_timestamp=1751379712636, metrics=[], model_id='m-2ee0c611e3bc4e239638cb470f1ebeb8', name='CreditRisk_xgboost_model', params={'max_depth': '3', 'n_estimators': '100'}, run_id='fc33e47344e44f7295136cdc5ed5077a', run_link=None, source='models:/m-2ee0c611e3bc4e239638cb470f1ebeb8', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [9]:
pd.DataFrame(results).set_index("Model")


Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Logistic Regression,0.994,0.991,0.964,0.977,0.999
Random Forest,0.999,1.0,0.995,0.998,1.0
Gradient Boosting,1.0,1.0,0.999,0.999,1.0


In [10]:
y.value_counts(normalize=True)

is_high_risk
0    0.87276
1    0.12724
Name: proportion, dtype: float64