# Baseline Model Training - FeatureForge Phase 1

This notebook trains the baseline CTR prediction model with 15-20 baseline features.

## Goals:
- Create baseline features
- Split the data into train/validation/test sets
- Train XGBoost model with class imbalance handling
- Evaluate the model with a comprehensive set of metrics
- **Establish baseline F1-score** (CONTROL group for A/B testing)
- Analyze feature importance

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F

from src.config import Config
from src.utils.logging_utils import setup_logging
from src.utils.spark_utils import create_spark_session
from src.data.loader import CriteoDataLoader
from src.data.splitter import DataSplitter
from src.features.feature_engine import FeatureEngine
from src.models.trainer import XGBoostTrainer
from src.models.evaluator import ModelEvaluator

# Plot defaults for the rest of the notebook: seaborn's "whitegrid" theme and
# a 12x6 (inches) default size for every matplotlib figure created below.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Setup and Load Data

In [None]:
# Load the project configuration from the YAML file one directory above the
# notebook; `config` is indexed like a nested mapping by the cells below.
config = Config('../config/config.yaml')

# Configure logging at INFO level, passing a log file specific to this
# notebook; the returned logger is used for progress messages.
logger = setup_logging(level='INFO', log_file='../logs/baseline_model.log')

logger.info("Starting baseline model training...")

In [None]:
# Build the Spark session from the `spark` section of the project config.
# Only the four settings below are forwarded, in the same keyword form the
# session factory is called with elsewhere.
spark_settings = config['spark']
session_kwargs = {
    key: spark_settings[key]
    for key in ('app_name', 'master', 'executor_memory', 'driver_memory')
}
spark = create_spark_session(**session_kwargs)

print(f"Spark version: {spark.version}")

In [None]:
# Load the dataset, preferring the cached sample when one exists.
import os

loader = CriteoDataLoader(spark, config)

# Work on the sample for faster iteration; it is created from the raw data
# the first time this cell runs without a usable sample on disk.
sample_path = config['data']['sample_path']
print(f"Loading data from: {sample_path}")

# A usable sample is either a single file or a non-empty directory.
# Checking the path type first avoids the NotADirectoryError that
# os.listdir() raises when sample_path points at a file.
sample_available = os.path.isfile(sample_path) or (
    os.path.isdir(sample_path) and bool(os.listdir(sample_path))
)

if sample_available:
    df = loader.load_parquet(sample_path)
else:
    print("Sample not found. Loading raw data and creating sample...")
    raw_path = config['data']['raw_path']
    df = loader.load_raw_data(raw_path)
    df = loader.create_sample(df, config['data']['sample_size'], sample_path)

# NOTE: count() is a Spark action, so this line runs a job over the data.
print(f"Data loaded: {df.count():,} rows")
df.show(5)

## 2. Create Baseline Features

In [None]:
# Build the feature engine from the project configuration
feature_engine = FeatureEngine(config)

# Emit the engine's feature summary (presumably through the logger configured
# earlier — confirm in FeatureEngine.log_feature_summary)
feature_engine.log_feature_summary()

In [None]:
# Create baseline features on the full dataset; the train/val/test split
# happens afterwards, on the feature-augmented frame.
# NOTE(review): is_training=True is applied to every row, including rows that
# later land in the validation and test sets. If the engine fits any
# statistics in training mode (encodings, scalers, frequency tables), this
# would leak information into evaluation — confirm against FeatureEngine.
print("Creating baseline features...")
df_features = feature_engine.create_baseline_features(df, is_training=True)

# Report the resulting schema width and the full column list
print(f"\nFeatures created. Total columns: {len(df_features.columns)}")
print(f"Feature columns: {df_features.columns}")

## 3. Train/Validation/Test Split

In [None]:
# Partition the feature frame into train / validation / test Spark frames
# using the splitter configured from the project settings.
splitter = DataSplitter(config)
train_df, val_df, test_df = splitter.split_data(df_features)

# Row counts per split; each count() is a separate Spark action.
n_train_rows = train_df.count()
n_val_rows = val_df.count()
n_test_rows = test_df.count()

print(f"\nTrain set: {n_train_rows:,} rows")
print(f"Validation set: {n_val_rows:,} rows")
print(f"Test set: {n_test_rows:,} rows")

## 4. Prepare Data for Training (Convert to Pandas)

In [None]:
# Model inputs are every column of the feature frame except the target and
# the original (raw) categorical columns named in the config.
target_col = config['features']['target_col']
categorical_cols = config['features']['categorical_cols']

excluded_cols = {target_col, *categorical_cols}
feature_cols = [name for name in df_features.columns if name not in excluded_cols]

print(f"Number of features: {len(feature_cols)}")
print("\nFeature columns:")

# Preview at most the first 20 names, then summarise how many were omitted
previewed = feature_cols[:20]
for position, name in enumerate(previewed, start=1):
    print(f"  {position}. {name}")

omitted = len(feature_cols) - len(previewed)
if omitted > 0:
    print(f"  ... and {omitted} more")

In [None]:
# Materialise each Spark split as a pandas frame so it can be fed to XGBoost.
print("Converting to Pandas...")

selected_cols = feature_cols + [target_col]


def _collect_split(split_df):
    """Collect one Spark split and return (pandas frame, features, target)."""
    collected = split_df.select(selected_cols).toPandas()
    return collected, collected[feature_cols], collected[target_col]


train_pd, X_train, y_train = _collect_split(train_df)
val_pd, X_val, y_val = _collect_split(val_df)
test_pd, X_test, y_test = _collect_split(test_df)

# Shapes are (rows, feature columns) of each design matrix
print(f"\nTraining data: {X_train.shape}")
print(f"Validation data: {X_val.shape}")
print(f"Test data: {X_test.shape}")

## 5. Train XGBoost Model

In [None]:
# Build the XGBoost trainer from the project configuration
trainer = XGBoostTrainer(config)

# Fit on the training split; the validation split is handed to the trainer as
# well (presumably for eval metrics / early stopping — confirm in
# XGBoostTrainer.train). The returned object is kept in `model`.
print("Training XGBoost model...\n")
model = trainer.train(X_train, y_train, X_val, y_val)

print("\nTraining complete!")

## 6. Make Predictions

In [None]:
# Cut-off used to turn predicted scores into 0/1 labels for both splits
decision_threshold = 0.5

# Validation split: raw scores plus thresholded labels
y_val_proba = trainer.predict(X_val)
y_val_pred = trainer.predict_binary(X_val, threshold=decision_threshold)

print(f"Validation predictions: {len(y_val_pred):,}")

# Test split: same two outputs, same threshold
y_test_proba = trainer.predict(X_test)
y_test_pred = trainer.predict_binary(X_test, threshold=decision_threshold)

print(f"Test predictions: {len(y_test_pred):,}")

## 7. Evaluate Model Performance

In [None]:
# The evaluator is created once here and reused by the later evaluation and
# plotting cells; it is given ../results as its output directory.
evaluator = ModelEvaluator(output_dir='../results')

# Banner, then metrics for the validation split
rule = "=" * 60
print("\n" + rule, "VALIDATION SET EVALUATION", rule, sep="\n")

val_metrics = evaluator.evaluate(
    y_val.values, y_val_pred, y_val_proba, dataset_name="Validation"
)

In [None]:
# Banner, then metrics for the held-out test split; `test_metrics` feeds the
# baseline report, the plots and the saved results further down.
rule = "=" * 60
print("\n" + rule, "TEST SET EVALUATION", rule, sep="\n")

test_metrics = evaluator.evaluate(
    y_test.values, y_test_pred, y_test_proba, dataset_name="Test"
)

## 8. Baseline F1-Score (CONTROL Group)

In [None]:
# The test-set F1 is the headline number: it is the CONTROL value that
# Phase 2 experimental features are compared against.
baseline_f1 = test_metrics['f1']

# The emoji below were mis-decoded (mojibake) in the original strings and are
# restored here; strings without placeholders no longer carry an f-prefix.
print("\n" + "="*60)
print("🎯 BASELINE MODEL - PHASE 1 RESULTS")
print("="*60)
print(f"\n📊 BASELINE F1-SCORE: {baseline_f1:.4f}")
print("\nThis F1-score serves as the CONTROL group for A/B testing.")
print("In Phase 2, experimental features will be compared against this baseline.")
print("\n" + "="*60)

# Supporting metrics from the same test-set evaluation
print("\nOther Key Metrics:")
print(f"  - AUC-ROC: {test_metrics['auc_roc']:.4f}")
print(f"  - AUC-PR:  {test_metrics['auc_pr']:.4f}")
print(f"  - Precision: {test_metrics['precision']:.4f}")
print(f"  - Recall:    {test_metrics['recall']:.4f}")
print("="*60)

## 9. Visualizations

In [None]:
# Confusion matrix for the test split, taken from the metrics computed above;
# save_path names the PNG the evaluator is asked to write.
evaluator.plot_confusion_matrix(
    test_metrics['confusion_matrix'],
    save_path='../results/baseline_confusion_matrix.png'
)

In [None]:
# ROC curve on the test split, built from the true labels and the predicted
# scores; save_path names the PNG the evaluator is asked to write.
evaluator.plot_roc_curve(
    y_test.values,
    y_test_proba,
    save_path='../results/baseline_roc_curve.png'
)

In [None]:
# Precision-recall curve on the test split, built from the true labels and
# the predicted scores; save_path names the PNG the evaluator is asked to write.
evaluator.plot_precision_recall_curve(
    y_test.values,
    y_test_proba,
    save_path='../results/baseline_pr_curve.png'
)

In [None]:
# Distribution of predicted scores on the test split, with the true labels
# passed alongside (presumably to separate the classes — confirm in
# ModelEvaluator); save_path names the PNG the evaluator is asked to write.
evaluator.plot_prediction_distribution(
    y_test.values,
    y_test_proba,
    save_path='../results/baseline_prediction_dist.png'
)

## 10. Feature Importance Analysis

In [None]:
# Ask the trainer for per-feature importance using the 'gain' importance type
importance_df = trainer.get_feature_importance(importance_type='gain')

# NOTE(review): head(20) is the "top 20" only if the frame comes back sorted
# by importance in descending order — confirm in XGBoostTrainer.
print("Top 20 Most Important Features:")
print(importance_df.head(20))

In [None]:
# Plot the 20 highest-ranked features from the importance frame; save_path
# names the PNG the evaluator is asked to write.
evaluator.plot_feature_importance(
    importance_df,
    top_n=20,
    save_path='../results/baseline_feature_importance.png'
)

In [None]:
# Persist the full importance table (all features, no index column) as CSV
importance_csv_path = '../results/baseline_feature_importance.csv'
importance_df.to_csv(importance_csv_path, index=False)
print(f"Feature importance saved to: {importance_csv_path}")

## 11. Save Model and Results

In [None]:
# Persist the trained model under ../models, creating the folder if needed.
import os

model_path = '../models/baseline_xgboost.model'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): if the trainer delegates to XGBoost's own save_model, the
# file extension selects the serialization format — confirm that ".model"
# is still the intended format for the installed XGBoost version.
trainer.save_model(model_path)
print(f"Model saved to: {model_path}")

In [None]:
# Collect the test-set metrics into a JSON document that later phases can
# load as the baseline (control) reference.
import json

# Metrics copied from `test_metrics` as plain floats, in output order;
# F1 comes first and is taken from the already-extracted baseline value.
metric_names = ('auc_roc', 'auc_pr', 'precision', 'recall', 'accuracy', 'log_loss')

baseline_results = {
    'phase': 1,
    'model': 'baseline_xgboost',
    'num_features': len(feature_cols),
    'metrics': {
        'f1': float(baseline_f1),
        **{name: float(test_metrics[name]) for name in metric_names},
    },
}

# Serialise once, then reuse the same text for the file and the echo below
results_json = json.dumps(baseline_results, indent=2)

results_path = '../results/baseline_results.json'
with open(results_path, 'w') as f:
    f.write(results_json)

print(f"Baseline results saved to: {results_path}")
print("\nBaseline results:")
print(results_json)

## 12. Summary

In [None]:
# Closing summary of Phase 1: headline numbers, where the artifacts were
# written, and the planned Phase 2 steps.
# The emoji below were mis-decoded (mojibake) in the original strings and are
# restored here; strings without placeholders no longer carry an f-prefix.
print("\n" + "="*60)
print("✅ PHASE 1 COMPLETE - BASELINE MODEL ESTABLISHED")
print("="*60)

print("\n📊 Key Results:")
print(f"  - Baseline F1-Score: {baseline_f1:.4f}")
print(f"  - Number of Features: {len(feature_cols)}")
print(f"  - Training Samples: {len(X_train):,}")
print(f"  - Test Samples: {len(X_test):,}")

print("\n📁 Outputs:")
print(f"  - Model: {model_path}")
print(f"  - Results: {results_path}")
print("  - Visualizations: ../results/")

print("\n🚀 Next Steps (Phase 2):")
print("  1. Create 70+ experimental features")
print("  2. Compare experimental vs baseline (A/B testing)")
print("  3. Statistical significance testing")
print("  4. Feature selection and optimization")
print("\n" + "="*60)

In [None]:
# Shut down the Spark session created at the top of the notebook. The pandas
# frames collected earlier remain usable; Spark DataFrames tied to this
# session should not be used after this point.
spark.stop()
print("\nSpark session stopped.")