In [1]:
import pandas as pd
import sys, os
# Resolve the parent of the current working directory (assumed to be the
# project root) so the local `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from src.model_training import CreditRiskModelTrainer




In [2]:
# Load the processed credit dataset that carries the is_high_risk label.
df_target = pd.read_csv("../data/processed/final_credit_data.csv")

In [3]:
# Show the (rows, columns) dimensions of the labelled table.
df_target.shape

(64273, 18)

In [4]:
# Load the processed feature table.
df_processed = pd.read_csv("../data/processed/processed.csv")

In [5]:
# Show the (rows, columns) dimensions of the feature table.
df_processed.shape

(64273, 16)

In [6]:
# Count rows whose CustomerId repeats an earlier row in the same table.
# A count of 0 means one row per customer; any non-zero count means the table
# must be de-duplicated on CustomerId before it is merged on that key.
print(df_processed['CustomerId'].duplicated().sum())  # 0 only if the feature table has one row per customer
print(df_target['CustomerId'].duplicated().sum())  # 0 only if the label table has one row per customer


61008
61008


In [7]:


# Build the train-ready table: one row per CustomerId with its features and
# its is_high_risk label, written to ../data/processed/train_data.csv.

# Load processed features and target (reloaded here so the cell is
# self-contained and can be re-run on its own).
df_processed = pd.read_csv("../data/processed/processed.csv")
df_target = pd.read_csv("../data/processed/final_credit_data.csv")

# -------------------------------
# Keep ONE row per CustomerId
# -------------------------------
# drop_duplicates keeps the first occurrence of each CustomerId, so the
# retained row-level feature values come from that first row only.
df_processed_unique = df_processed.drop_duplicates(subset=["CustomerId"])

df_target_unique = (
    df_target[["CustomerId", "is_high_risk"]]
    .drop_duplicates(subset=["CustomerId"])
)

# -------------------------------
# Merge safely (NO row explosion)
# -------------------------------
# validate="one_to_one" makes pandas raise MergeError if CustomerId is not
# unique on either side, so the "no row explosion" guarantee is enforced
# rather than assumed.
train_data = df_processed_unique.merge(
    df_target_unique,
    on="CustomerId",
    how="left",
    validate="one_to_one",
)

# A left merge leaves NaN in is_high_risk for any customer missing from the
# label table; fail loudly instead of saving unlabeled rows for training.
missing_labels = int(train_data["is_high_risk"].isna().sum())
assert missing_labels == 0, (
    f"{missing_labels} customers have no is_high_risk label after the merge"
)

# -------------------------------
# Save train-ready dataset
# -------------------------------
train_data_path = "../data/processed/train_data.csv"
train_data.to_csv(train_data_path, index=False)

print(f"Train-ready dataset saved to: {train_data_path}")
print("Final shape:", train_data.shape)


Train-ready dataset saved to: ../data/processed/train_data.csv
Final shape: (3265, 17)


In [8]:
# Reload the saved train-ready table from disk; this is the frame later
# passed to the trainer.
train_dataset = pd.read_csv("../data/processed/train_data.csv")

In [9]:
# Class balance of the target. Read it from train_dataset (the frame reloaded
# from train_data.csv and handed to the trainer) so this cell also works when
# the notebook is resumed from the saved file without rebuilding train_data.
print(train_dataset["is_high_risk"].value_counts())


is_high_risk
0    2029
1    1236
Name: count, dtype: int64


In [10]:
# Show the (rows, columns) dimensions of the reloaded training table.
train_dataset.shape

(3265, 17)

In [11]:
# Preview the first rows of the reloaded training table.
train_dataset.head()

Unnamed: 0,CustomerId,CurrencyCode,CountryCode,ProductCategory,ChannelId,Amount,Value,PricingStrategy,Total_Transaction_Amount,Average_Transaction_Amount,Transaction_Count,Std_Dev_Transaction_Amount,Transaction_Hour,Transaction_Day,Transaction_Month,Transaction_Year,is_high_risk
0,CustomerId_4406,-5.5511150000000004e-17,-5.5511150000000004e-17,-0.121582,-0.354644,0.432168,0.265403,-0.298064,-0.399336,-0.295118,-0.332133,0.824943,-2.136606,-0.192479,0.812629,-0.956226,0
1,CustomerId_4683,-5.5511150000000004e-17,-5.5511150000000004e-17,-0.121582,-0.354644,-0.104676,-0.381685,-0.298064,-0.609036,-0.283507,-0.558592,-3.075179,-2.136606,-0.192479,0.812629,-0.956226,1
2,CustomerId_988,-5.5511150000000004e-17,-5.5511150000000004e-17,-0.223512,0.184185,-1.332973,-0.195324,-0.298064,-0.546134,0.341345,-0.512843,0.619439,-1.933964,-0.192479,0.812629,-0.956226,0
3,CustomerId_1432,-5.5511150000000004e-17,-5.5511150000000004e-17,-0.121582,-0.354644,1.505854,1.55958,-0.298064,-0.604765,4.07851,-0.56088,-3.075179,-1.933964,-0.192479,0.812629,-0.956226,1
4,CustomerId_2858,-5.5511150000000004e-17,-5.5511150000000004e-17,-0.223512,0.184185,-1.178362,-0.381685,-0.298064,-0.573586,0.194238,-0.531142,1.782344,-1.933964,-0.192479,0.812629,-0.956226,0


In [13]:
# Wrap the train-ready table in the project's trainer.
trainer = CreditRiskModelTrainer(train_dataset)

# Hyperparameter grids, one per candidate model.
lr_params = {"C": [0.01, 0.1, 1, 10], "solver": ["liblinear"]}
rf_params = {"n_estimators": [100, 200], "max_depth": [5, 10, None]}
gb_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 5],
}

# Fit the candidates one after another, in a fixed order.
for model_name, param_grid in (
    ("logistic_regression", lr_params),
    ("random_forest", rf_params),
    ("gradient_boosting", gb_params),
):
    trainer.train_model(model_name, params=param_grid)

# Print the evaluation results for every trained model.
print(trainer.evaluate_all_models())

# Pull out the best model and its name as exposed by the trainer.
best_model = trainer.best_model
best_model_name = trainer.best_model_name
print(f"Best model: {best_model_name}")

# Register the best model in MLflow.
trainer.register_best_model()

# Promote the registered version to Production (runs unconditionally here).
trainer.promote_best_model_to_production()


logistic_regression trained. ROC-AUC: 0.9309
random_forest trained. ROC-AUC: 0.9709
gradient_boosting trained. ROC-AUC: 0.9669
                     accuracy  precision    recall  f1_score   roc_auc
logistic_regression  0.863706   0.821138  0.817814  0.819473  0.930895
random_forest        0.891271   0.879310  0.825911  0.851775  0.970902
gradient_boosting    0.888208   0.868644  0.829960  0.848861  0.966943
Best model: random_forest


2025/12/16 23:50:14 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/16 23:50:14 INFO mlflow.store.db.utils: Updating database tables
2025/12/16 23:50:14 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/16 23:50:14 INFO alembic.runtime.migration: Will assume non-transactional DDL.
Registered model 'Credit_Risk_Best_Model' already exists. Creating a new version of this model...
Created version '7' of model 'Credit_Risk_Best_Model'.


Best model 'random_forest' registered as 'Credit_Risk_Best_Model'.
Model Credit_Risk_Best_Model version 7 is now Production.


  versions = self.client.get_latest_versions(registered_model_name)
  self.client.transition_model_version_stage(
