In [1]:
import sys
import os
import pandas as pd

# Step 1: Automatically find the project root (folder containing 'src')
current_dir = os.getcwd()
project_root = current_dir

while not os.path.exists(os.path.join(project_root, 'src')):
    parent = os.path.dirname(project_root)
    if parent == project_root:
        raise FileNotFoundError("Could not find 'src' folder in any parent directories!")
    project_root = parent

# Step 2: Add project root to sys.path
if project_root not in sys.path:
    sys.path.append(project_root)

print(f"Project root detected: {project_root}")

# Step 4: Load your CSV file
final_df = r'E:\Credit-Risk-Probability-Model-for-Alternative-Data-wek_4\data\processed\train_data.csv'
try:
    df = pd.read_csv(final_df)
    print(f"Data loaded successfully! Shape: {df.shape}")
except FileNotFoundError:
    print(f"CSV file not found at: {df}")


Project root detected: e:\Credit-Risk-Probability-Model-for-Alternative-Data-wek_4
Data loaded successfully! Shape: (95662, 24)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def preprocess_features(df):
    final_df = df.copy()
    
    # Drop pure ID columns
    df.drop(columns=["TransactionId", "BatchId"], inplace=True)
    
    # Convert datetime to numeric features
    df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])
    df["TransactionHour"] = df["TransactionStartTime"].dt.hour
    df["TransactionDay"] = df["TransactionStartTime"].dt.day
    df["TransactionMonth"] = df["TransactionStartTime"].dt.month
    df["TransactionYear"] = df["TransactionStartTime"].dt.year
    df["TransactionWeekday"] = df["TransactionStartTime"].dt.weekday
    df.drop(columns=["TransactionStartTime"], inplace=True)
    
    # Encode categorical columns
    cat_cols = ["AccountId", "CustomerId", "SubscriptionId", "ProductId",
                "ProductCategory", "ChannelId", "CurrencyCode", "CountryCode",
                "ProviderId", "PricingStrategy"]
    
    le = LabelEncoder()
    for col in cat_cols:
        df[col] = le.fit_transform(df[col])
    
    return df

# Preprocess the data
processed_df = preprocess_features(df)

# Separate features and target
y = processed_df["is_high_risk"]
X = processed_df.drop(columns=["is_high_risk"])

# Split into training/testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from src.train import CreditRiskTrainer
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import mlflow

trainer = CreditRiskTrainer(X, y)

# Add Random Forest with GridSearch
rf_params = {"n_estimators":[100,200], "max_depth":[5,10,None]}
trainer.add_model("RandomForest", RandomForestClassifier(random_state=42), rf_params)

# Add XGBoost without tuning
trainer.add_model("XGBoost", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
# Train and tune all models

trainer.train_and_tune()

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
mlflow.set_tracking_uri(f"file:///{PROJECT_ROOT}/mlruns")

# Create experiment if it doesn't exist
experiment_name = "Credit_Risk_Experiment"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(experiment_name)

mlflow.set_experiment(experiment_name)

# Log experiments to MLflow
trainer.log_experiments()

# Select best model
best_model = trainer.get_best_model()

print(f"Best model is {best_model} with ROC-AUC = {best_model}")


Training model: RandomForest
Metrics for RandomForest: {'accuracy': 0.9975435112109967, 'precision': 0.9858532272325375, 'recall': 0.9933184855233853, 'f1_score': 0.9895717772354116, 'roc_auc': 0.9998572278625377}

Training model: XGBoost




Metrics for XGBoost: {'accuracy': 0.9982752312758062, 'precision': 0.9893805309734514, 'recall': 0.9959910913140312, 'f1_score': 0.9926748057713651, 'roc_auc': 0.9997560239660955}




Logged RandomForest to MLflow with metrics: {'accuracy': 0.9975435112109967, 'precision': 0.9858532272325375, 'recall': 0.9933184855233853, 'f1_score': 0.9895717772354116, 'roc_auc': 0.9998572278625377}
Logged XGBoost to MLflow with metrics: {'accuracy': 0.9982752312758062, 'precision': 0.9893805309734514, 'recall': 0.9959910913140312, 'f1_score': 0.9926748057713651, 'roc_auc': 0.9997560239660955}
Best model is ('RandomForest', RandomForestClassifier(n_estimators=200, random_state=42), 0.9998572278625377) with ROC-AUC = ('RandomForest', RandomForestClassifier(n_estimators=200, random_state=42), 0.9998572278625377)


In [5]:
best_name, best_model, best_metric = trainer.get_best_model()
print(f"Best model is {best_name} with ROC-AUC = {best_metric}")

Best model is RandomForest with ROC-AUC = 0.9998572278625377


In [6]:
trainer.register_best_model(
    metric="roc_auc",
    registered_model_name="CreditRiskModel"
)

Registered model 'CreditRiskModel' already exists. Creating a new version of this model...


Registered 'RandomForest' as 'CreditRiskModel' (metric=0.9999)


Created version '3' of model 'CreditRiskModel'.


In [None]:
import os
import mlflow.sklearn

# Save locally for FastAPI
LOCAL_MODEL_PATH = "models/best_model"
os.makedirs(os.path.dirname(LOCAL_MODEL_PATH), exist_ok=True)  # ensure folder exists
mlflow.sklearn.save_model(best_model, LOCAL_MODEL_PATH)
print(f"Saved best model locally to {LOCAL_MODEL_PATH}")