# Model Training - Credit Card Fraud Detection

In [13]:
from pathlib import Path

import joblib
import mlflow
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [14]:
# Configure MLflow: use the SQLite tracking store (path is relative to the
# working directory) so runs land in the same backend as the UI, then select
# the experiment that the runs below are recorded under.
TRACKING_URI = "sqlite:///../mlflow.db"
EXPERIMENT_NAME = "fraud_detection"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Echo the active configuration so a wrong backend is noticed before training.
active_uri = mlflow.get_tracking_uri()
print("MLflow tracking URI:", active_uri)
active_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
print("MLflow experiment:", active_experiment)

2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/02/13 23:44:24 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/02/13 23:44:24 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/02/13 23:44:24 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/02/13 23:44:24 INFO mlflow.tracking.fluent: Experiment with name 'fraud_detection' does not exist. Creating a new experiment.


MLflow tracking URI: sqlite:///../mlflow.db
MLflow experiment: <Experiment: artifact_location='/Users/adilg/caio/caio-cc-fraud-detection/notebooks/mlruns/1', creation_time=1771022664644, experiment_id='1', last_update_time=1771022664644, lifecycle_stage='active', name='fraud_detection', tags={}>


In [15]:
# Read the preprocessed training split from disk. Features stay a DataFrame;
# the label file is flattened into a 1-D NumPy array.
X_train = pd.read_csv('../data/processed/X_train.csv')
y_train_frame = pd.read_csv('../data/processed/y_train.csv')
y_train = np.ravel(y_train_frame.to_numpy())

# Quick sanity check on the loaded dimensions.
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (1296675, 5)
y_train shape: (1296675,)


In [16]:
# Train the baseline logistic-regression classifier inside a single MLflow run,
# recording its hyperparameters, facts about the training data, and the fitted
# model (also registered as "fraud_detector").
with mlflow.start_run(run_name="logistic_regression_baseline") as run:

    # Hyperparameters, passed verbatim to LogisticRegression below.
    model_params = {
        "max_iter": 1000,
        "random_state": 42,
        "solver": "lbfgs"
    }
    mlflow.log_params(model_params)

    # Describe the training data *before* fitting so the run still carries
    # this context if training fails or is interrupted.
    mlflow.log_params({
        "train_samples": len(X_train),
        "n_features": X_train.shape[1],
        # Mean of the label array; this is the fraud rate assuming labels are
        # 0/1 with 1 = fraud -- TODO confirm against the preprocessing step.
        "fraud_rate": y_train.mean(),
    })

    # Train model
    print("Training model...")
    model = LogisticRegression(**model_params)
    model.fit(X_train, y_train)
    print("Model trained!")

    # Infer the model's input/output schema from a few training rows so the
    # logged model records which columns and dtypes it expects. A small sample
    # is enough: only column names and types matter, not the values.
    signature_sample = X_train.head(5)
    signature = mlflow.models.infer_signature(
        signature_sample,
        model.predict(signature_sample),
    )

    # Log the model as a run artifact and register it in the model registry.
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="fraud_detector",
        signature=signature,
    )

    # Use the run handle from the `with` statement rather than re-reading
    # the global active run.
    print(f"Run ID: {run.info.run_id}")

Training model...




Model trained!


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


Run ID: 18c56c2a7f134b91a0ae395f9c449375


Successfully registered model 'fraud_detector'.
Created version '1' of model 'fraud_detector'.


In [17]:
# Also save the model locally for backwards compatibility with consumers that
# read the pickle file directly instead of going through MLflow.
LOCAL_MODEL_PATH = '../models/fraud_model.pkl'

# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail here after training has finished.
Path(LOCAL_MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, LOCAL_MODEL_PATH)
print(f"Model also saved to {LOCAL_MODEL_PATH}")

Model also saved to ../models/fraud_model.pkl
