# MLflow Discrepancy Prediction Pipeline

This notebook implements an end-to-end ML pipeline to predict the discrepancy between ground and satellite PM2.5 measurements.

## Overview
- **Target**: `target_diff = PM2.5_ground - PM2.5_satellite * scaling_factor`
- **Models**: Linear Regression, Ridge, Lasso, Random Forest, Gradient Boosting, XGBoost
- **Tracking**: MLflow for experiment tracking and model versioning

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import os
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# MLflow imports
import mlflow
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifacts

# XGBoost (optional)
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Skipping XGBoost model.")

# Seed NumPy's global RNG so repeated runs draw the same random numbers
np.random.seed(42)

# Locations are derived from the current working directory: the dataset is
# expected next to the notebook, everything else two levels up.
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent.parent
DATA_PATH = NOTEBOOK_DIR / "cleaned_aqi_merged_dataset.csv"

MODELS_DIR = PROJECT_ROOT / "models"
MODELS_DIR.mkdir(exist_ok=True)

# File-based MLflow store under the project root
MLFLOW_TRACKING_URI = PROJECT_ROOT / "mlruns"
MLFLOW_TRACKING_URI.mkdir(parents=True, exist_ok=True)
mlflow.set_tracking_uri(str(MLFLOW_TRACKING_URI))
mlflow.set_experiment("PM2.5_Discrepancy_Prediction")

# Echo the resolved locations so a wrong working directory is easy to spot
print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Models directory: {MODELS_DIR}")
print(f"MLflow tracking URI: {MLFLOW_TRACKING_URI}")


## 1. Load Data


In [None]:
# Read the merged dataset from DATA_PATH into a DataFrame
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Quick look at size, column names and the first rows
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

# Per-column null counts; only columns that actually have nulls are shown
print(f"\nMissing values per column:")
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "No missing values found.")

# Summary statistics of the numeric columns
print(f"\nBasic statistics:")
print(df.describe())


## 2. Prepare Data & Create Target Variable


In [None]:
# Identify PM2.5 columns (flexible naming)
def _find_column(columns, *needles):
    """Return the first column whose lower-cased name contains any needle, or None."""
    return next(
        (col for col in columns if any(needle in col.lower() for needle in needles)),
        None,
    )


ground_col = _find_column(df.columns, 'pm25_ground', 'pm2.5_ground')
satellite_col = _find_column(df.columns, 'pm25_satellite', 'pm2.5_satellite')
scaling_factor_col = _find_column(df.columns, 'scaling_factor')

# Fail early with a clear message; otherwise the target computation below
# would die with an opaque KeyError on df[None].
if ground_col is None:
    print("❌ ERROR: No ground PM2.5 column found!")
    raise ValueError("Cannot find ground PM2.5 data")

# If satellite PM2.5 not found, use Aerosol_Index as proxy
if satellite_col is None:
    if 'Aerosol_Index_satellite' in df.columns:
        satellite_col = 'Aerosol_Index_satellite'
        print("⚠️  PM2.5_satellite column not found. Using Aerosol_Index_satellite as proxy.")
    else:
        print("❌ ERROR: No satellite PM2.5 or Aerosol_Index column found!")
        raise ValueError("Cannot find satellite PM2.5 data")

print(f"\nGround PM2.5 column: {ground_col}")
print(f"Satellite PM2.5 column: {satellite_col}")
print(f"Scaling factor column: {scaling_factor_col if scaling_factor_col else 'None (using default=1)'}")

# Create target variable: ground minus (scaled) satellite measurement
if scaling_factor_col:
    df["target_diff"] = df[ground_col] - (df[satellite_col] * df[scaling_factor_col])
elif satellite_col == 'Aerosol_Index_satellite':
    # The proxy is not in PM2.5 units, so it is multiplied by a rough
    # conversion factor (PM2.5 ≈ AOD * 50-100, approximate).
    # NOTE(review): the column is an aerosol *index* while the rule of thumb
    # above is stated for AOD - confirm which quantity the data holds and
    # adjust the factor based on domain knowledge.
    scaling_factor = 50.0  # Approximate conversion factor
    df["target_diff"] = df[ground_col] - (df[satellite_col] * scaling_factor)
    print(f"Using scaling factor {scaling_factor} for Aerosol_Index to PM2.5 conversion")
else:
    df["target_diff"] = df[ground_col] - df[satellite_col]

print(f"\nTarget variable statistics:")
print(df["target_diff"].describe())

# Drop unrealistic values (rows with NaN in a checked column are dropped too,
# because NaN never satisfies the range test)
print("\nFiltering unrealistic values...")
initial_shape = df.shape[0]
df = df[df[ground_col].between(-10, 1000)]
if satellite_col != 'Aerosol_Index_satellite':
    # The PM2.5 range check only makes sense for a real PM2.5 column
    df = df[df[satellite_col].between(-10, 1000)]
df = df[df["target_diff"].notna()]
final_shape = df.shape[0]
print(f"Removed {initial_shape - final_shape} rows with unrealistic values")
print(f"Final dataset shape: {df.shape}")


## 3. Feature Engineering & Preprocessing


In [None]:
# Identify feature columns
# Exclude the target, the date, the columns the target was derived from and
# free-text notes. Location stays in and is one-hot encoded with the other
# categorical columns.
exclude_cols = ['date', 'target_diff', ground_col, satellite_col, 'notes']
if scaling_factor_col:
    exclude_cols.append(scaling_factor_col)

# Separate features and target
feature_cols = [col for col in df.columns if col not in exclude_cols]
X = df[feature_cols].copy()
y = df["target_diff"].copy()

print(f"Feature columns ({len(feature_cols)}):")
print(feature_cols)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")

# Handle missing values
# NOTE(review): the fill values are computed on the full dataset, before the
# train/test split, so test rows influence them. Consider moving imputation
# into the preprocessing pipeline.
print(f"\nMissing values in features:")
missing_counts = X.isnull().sum()
if missing_counts.sum() > 0:
    print(missing_counts[missing_counts > 0])
    # Fill numerical with median, categorical with mode.
    # Assign the result back: `X[col].fillna(..., inplace=True)` works on a
    # column selection and leaves X unchanged under pandas Copy-on-Write.
    for col in numerical_cols:
        if X[col].isnull().any():
            X[col] = X[col].fillna(X[col].median())
    for col in categorical_cols:
        if X[col].isnull().any():
            modes = X[col].mode()
            # mode() is empty when the whole column is null
            X[col] = X[col].fillna(modes.iloc[0] if len(modes) > 0 else 'Unknown')
    print("Missing values filled.")
else:
    print("No missing values found.")

# Create preprocessing pipeline: scale numeric columns, one-hot encode
# categorical ones; any other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

print("\nPreprocessing pipeline created.")


## 4. Train-Test Split


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")

# Learn scaling/encoding from the training rows only, then apply the fitted
# transformer to the held-out rows
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"\nProcessed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")


## 5. Model Training with MLflow Tracking


In [None]:
# Turn on MLflow autologging for scikit-learn estimators
mlflow.sklearn.autolog()

# Candidate regressors, keyed by the name used for their MLflow run
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0, random_state=42),
    Lasso=Lasso(alpha=1.0, random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    GradientBoosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
)

# XGBoost joins the comparison only when its import succeeded
if XGBOOST_AVAILABLE:
    models.update(XGBoost=xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1))

# One metrics dict per trained model is appended here by the training loop
results = []

print("Starting model training with MLflow tracking...\n")


In [None]:
# Train each model
import re


def _regression_metrics(y_true, y_pred):
    """Return (MAE, RMSE, R²) for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )


for model_name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}...")
    print(f"{'='*60}")

    with mlflow.start_run(run_name=model_name):
        # Log model name and split sizes
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))

        # Train model on the already-preprocessed feature matrix
        model.fit(X_train_processed, y_train)

        # Predictions
        y_train_pred = model.predict(X_train_processed)
        y_test_pred = model.predict(X_test_processed)

        # Calculate metrics
        train_mae, train_rmse, train_r2 = _regression_metrics(y_train, y_train_pred)
        test_mae, test_rmse, test_r2 = _regression_metrics(y_test, y_test_pred)
        metrics = {
            'train_mae': train_mae,
            'train_rmse': train_rmse,
            'train_r2': train_r2,
            'test_mae': test_mae,
            'test_rmse': test_rmse,
            'test_r2': test_r2,
        }

        # Log metrics
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log feature importance if available (tree-based models)
        if hasattr(model, 'feature_importances_'):
            importance_df = pd.DataFrame({
                'feature': preprocessor.get_feature_names_out(),
                'importance': model.feature_importances_,
            }).sort_values('importance', ascending=False)

            # Log top 10 features
            top_features = importance_df.head(10)
            for feature, importance in top_features.itertuples(index=False):
                # One-hot feature names can contain characters MLflow rejects
                # in metric keys (e.g. the comma in a location name), so
                # anything outside the allowed set is replaced by "_".
                safe_name = re.sub(r"[^/\w.\- ]", "_", str(feature))
                mlflow.log_metric(f"feature_importance_{safe_name}", float(importance))

        # Create and save plots
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        # Residual plot
        residuals = y_test - y_test_pred
        axes[0].scatter(y_test_pred, residuals, alpha=0.5)
        axes[0].axhline(y=0, color='r', linestyle='--')
        axes[0].set_xlabel('Predicted Values')
        axes[0].set_ylabel('Residuals')
        axes[0].set_title(f'{model_name} - Residual Plot')
        axes[0].grid(True, alpha=0.3)

        # Predicted vs Actual, with the y = x reference line
        axes[1].scatter(y_test, y_test_pred, alpha=0.5)
        min_val = min(y_test.min(), y_test_pred.min())
        max_val = max(y_test.max(), y_test_pred.max())
        axes[1].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
        axes[1].set_xlabel('Actual Values')
        axes[1].set_ylabel('Predicted Values')
        axes[1].set_title(f'{model_name} - Predicted vs Actual')
        axes[1].grid(True, alpha=0.3)

        fig.tight_layout()

        # Save plots next to the notebook and attach them to the run
        plot_path = f"plots_{model_name}.png"
        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
        mlflow.log_artifact(plot_path)
        # Close this specific figure so figures do not pile up across iterations
        plt.close(fig)

        # Save model
        mlflow.sklearn.log_model(model, "model")

        # Store results for the comparison step
        results.append({'model': model_name, **metrics})

        print(f"✓ {model_name} trained successfully")
        print(f"  Test MAE: {test_mae:.4f}")
        print(f"  Test RMSE: {test_rmse:.4f}")
        print(f"  Test R²: {test_r2:.4f}")

print(f"\n{'='*60}")
print("All models trained successfully!")
print(f"{'='*60}")


## 6. Model Comparison & Selection


In [None]:
# Create results DataFrame
# Without any results the sort below would fail with an opaque KeyError,
# so stop with a message that says what to do.
if not results:
    raise RuntimeError("No model results available; run the training cell first.")

# Best (lowest) test RMSE first
results_df = pd.DataFrame(results).sort_values('test_rmse')

print("Model Performance Summary:")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Select best model (lowest RMSE)
best_row = results_df.iloc[0]
best_model_name = best_row['model']
best_model = models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   Test RMSE: {best_row['test_rmse']:.4f}")
print(f"   Test MAE: {best_row['test_mae']:.4f}")
print(f"   Test R²: {best_row['test_r2']:.4f}")

# Save results to CSV
results_csv_path = PROJECT_ROOT / "model_results.csv"
results_df.to_csv(results_csv_path, index=False)
print(f"\n✓ Results saved to: {results_csv_path}")


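## 7. Save Best Model

The code for this step is in the code cell that follows section 8 (it refits the best model as a full pipeline and saves it).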


## 8. Prediction Function (Optional - for making predictions on new data)


In [None]:
# Function to predict discrepancy and corrected PM2.5
def predict_discrepancy(model_pipeline, features_df):
    """
    Return the model's predicted ground-minus-satellite PM2.5 discrepancy.

    Args:
        model_pipeline: Fitted object exposing ``predict`` (preprocessor + model)
        features_df: DataFrame holding the feature columns

    Returns:
        Whatever ``model_pipeline.predict`` returns for ``features_df``
        (an array of predicted discrepancies for a scikit-learn pipeline)
    """
    predicted = model_pipeline.predict(features_df)
    return predicted


def predict_corrected_pm25(model_pipeline, features_df, satellite_value, scaling_factor=1.0):
    """
    Estimate corrected PM2.5 from a satellite reading plus the predicted discrepancy.

    Computes ``satellite_value * scaling_factor + predicted_difference``.

    Args:
        model_pipeline: Trained model pipeline
        features_df: DataFrame with feature columns
        satellite_value: Satellite PM2.5 or AOD value
        scaling_factor: Multiplier applied to the satellite value (default: 1.0)

    Returns:
        Array of corrected PM2.5 values
    """
    # Predict first, then add the scaled satellite reading to the discrepancy
    discrepancy = predict_discrepancy(model_pipeline, features_df)
    satellite_estimate = satellite_value * scaling_factor
    return satellite_estimate + discrepancy


# Example: Load saved model and make predictions
# Uncomment and modify as needed:
# NOTE: the triple-quoted block below is a bare string expression, so nothing
# inside it runs. `best_model_path`, which it refers to, is assigned in the
# cell that saves the final pipeline; that cell must have run before this
# example can be used.
"""
import joblib

# Load the saved model
loaded_model = joblib.load(best_model_path)

# Example: Predict on new data
# new_data = pd.DataFrame({
#     'NO2_satellite': [0.0002],
#     'SO2_satellite': [-0.0004],
#     'CO_satellite': [0.05],
#     'O3_satellite': [0.16],
#     'Aerosol_Index_satellite': [-1.0],
#     'location': ['Anand Vihar, Delhi'],
#     'month': [1],
#     'day_of_week': [2],
#     'season': ['Winter'],
#     # ... add all required features
# })

# predicted_diff = predict_discrepancy(loaded_model, new_data)
# corrected_pm25 = predict_corrected_pm25(loaded_model, new_data, -1.0, scaling_factor=50.0)

# print(f"Predicted discrepancy: {predicted_diff[0]:.2f}")
# print(f"Corrected PM2.5: {corrected_pm25[0]:.2f}")
"""

# Confirmation that the two helper functions above are now defined
print("Prediction functions defined. Uncomment the example code above to use them.")


In [None]:
# Refit the winning model as a single pipeline (preprocessor + model) so the
# saved artifact accepts raw feature DataFrames, then log and save it.
import joblib

# Factories for un-fitted estimators with the same hyperparameters as the
# comparison run (the instances in `models` were fitted on preprocessed arrays)
model_classes = {
    'LinearRegression': LinearRegression,
    'Ridge': lambda: Ridge(alpha=1.0, random_state=42),
    'Lasso': lambda: Lasso(alpha=1.0, random_state=42),
    'RandomForest': lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
}

if XGBOOST_AVAILABLE:
    model_classes['XGBoost'] = lambda: xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Create a fresh model instance
fresh_best_model = model_classes[best_model_name]()

# Create a complete pipeline (preprocessor + model)
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', fresh_best_model)
])

best_model_path = MODELS_DIR / "best_model.pkl"

# Save the best model using MLflow
with mlflow.start_run(run_name=f"{best_model_name}_final"):
    mlflow.log_param("model_name", f"{best_model_name}_final")
    mlflow.log_param("best_model", best_model_name)

    # Fit inside the run: with autologging on, a fit outside any active run
    # makes MLflow open a separate, unnamed run for it.
    # The pipeline is trained on the training split only, so the test metrics
    # below stay an honest hold-out estimate.
    best_pipeline.fit(X_train, y_train)

    # Evaluate on test set
    y_test_pred = best_pipeline.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)

    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)

    # Save the complete pipeline
    mlflow.sklearn.log_model(best_pipeline, "best_model")

    # Also save to local directory
    joblib.dump(best_pipeline, best_model_path)
    print(f"✓ Best model saved to: {best_model_path}")

print(f"\n✓ Final model saved using MLflow")
print(f"  Model: {best_model_name}")
print(f"  Test RMSE: {test_rmse:.4f}")
print(f"  Test MAE: {test_mae:.4f}")
print(f"  Test R²: {test_r2:.4f}")
