# MLflow Discrepancy Prediction Pipeline

This notebook implements an end-to-end ML pipeline to predict the discrepancy between ground and satellite PM2.5 measurements.

## Overview
- **Target**: `target_diff = PM2.5_ground - PM2.5_satellite * scaling_factor`
- **Models**: Linear Regression, Ridge, Lasso, Random Forest, Gradient Boosting, XGBoost
- **Tracking**: MLflow for experiment tracking and model versioning

In [1]:
%pip install matplotlib seaborn mlflow xgboost

Collecting matplotlib
  Downloading matplotlib-3.10.7-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting mlflow
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.1-cp313-cp313-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-12.0.0-cp313-cp313-win_amd64.whl.metadata (9.0 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.2.5-p

In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import os
warnings.filterwarnings('ignore')

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# MLflow imports
import mlflow
import mlflow.sklearn
from mlflow import log_metric, log_param, log_artifacts

# XGBoost (optional)
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Skipping XGBoost model.")

# Set random seeds for reproducibility
np.random.seed(42)

# Set up paths
NOTEBOOK_DIR = Path().resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent.parent
DATA_PATH = NOTEBOOK_DIR / "cleaned_aqi_merged_dataset.csv"
MODELS_DIR = PROJECT_ROOT / "models"
MODELS_DIR.mkdir(exist_ok=True)

# MLflow setup
MLFLOW_TRACKING_URI = PROJECT_ROOT / "mlruns"
os.makedirs(MLFLOW_TRACKING_URI, exist_ok=True)
# Convert to file URI format for Windows compatibility
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI.as_uri())
mlflow.set_experiment("PM2.5_Discrepancy_Prediction")

print(f"Project root: {PROJECT_ROOT}")
print(f"Data path: {DATA_PATH}")
print(f"Models directory: {MODELS_DIR}")
print(f"MLflow tracking URI: {MLFLOW_TRACKING_URI}")


2025/11/21 17:49:42 INFO mlflow.tracking.fluent: Experiment with name 'PM2.5_Discrepancy_Prediction' does not exist. Creating a new experiment.


Project root: C:\Users\robot\Desktop\Delhi-NCR-AQI-Assessment
Data path: C:\Users\robot\Desktop\Delhi-NCR-AQI-Assessment\notebooks\model-training\cleaned_aqi_merged_dataset.csv
Models directory: C:\Users\robot\Desktop\Delhi-NCR-AQI-Assessment\models
MLflow tracking URI: C:\Users\robot\Desktop\Delhi-NCR-AQI-Assessment\mlruns


## 1. Load Data


In [5]:
# Load the dataset
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())

# Check for missing values
print(f"\nMissing values per column:")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values found.")

# Display basic statistics
print(f"\nBasic statistics:")
print(df.describe())


Loading dataset...
Dataset shape: (16936, 34)

Columns: ['date', 'NO2_satellite', 'SO2_satellite', 'CO_satellite', 'O3_satellite', 'Aerosol_Index_satellite', 'location', 'PM2.5_ground', 'PM10_ground', 'NO2_ground', 'SO2_ground', 'CO_ground', 'O3_ground', 'lat', 'lon', 'notes', 'distance_to_major_road', 'total_road_length_m', 'major_road_length_m', 'pct_green', 'pct_industrial', 'pct_residential', 'building_density', 'avg_building_area_m2', 'median_building_area_m2', 'building_count', 'major_road_fraction', 'month', 'day_of_week', 'season', 'NO2_ratio', 'SO2_ratio', 'CO_ratio', 'O3_ratio']

First few rows:
         date  NO2_satellite  SO2_satellite  CO_satellite  O3_satellite  \
0  2020-01-01       0.000191      -0.000433      0.048550      0.164568   
1  2020-01-02       0.000191      -0.000433      0.048550      0.164568   
2  2020-01-03       0.000143       0.000546      0.041712      0.139937   
3  2020-01-04       0.000143      -0.000315      0.044633      0.137429   
4  2020-01-0

## 2. Prepare Data & Create Target Variable


In [6]:
# Identify PM2.5 columns (flexible naming)
ground_col = None
satellite_col = None
scaling_factor_col = None

# Try to find ground PM2.5 column
# PM2.5_ground

for col in df.columns:
    if 'pm2.5_ground' in col.lower():
        ground_col = col
        break

# Use Aerosol_Index_satellite as proxy for satellite PM2.5
# Note: Satellite data does NOT have PM2.5/PM10 - these are ground-exclusive metrics
# Satellite data only has: NO2, SO2, CO, O3, and Aerosol_Index
if 'Aerosol_Index_satellite' in df.columns:
    satellite_col = 'Aerosol_Index_satellite'
    print("‚ÑπÔ∏è  Using Aerosol_Index_satellite as proxy for satellite PM2.5")
    print("   (Satellite data does not have direct PM2.5 measurements - PM2.5/PM10 are ground-exclusive)")
else:
    raise ValueError("‚ùå ERROR: Aerosol_Index_satellite column not found!")

# Check for scaling factor
for col in df.columns:
    if 'scaling_factor' in col.lower():
        scaling_factor_col = col
        break

print(f"\nGround PM2.5 column: {ground_col}")
print(f"Satellite PM2.5 column: {satellite_col}")
print(f"Scaling factor column: {scaling_factor_col if scaling_factor_col else 'None (using default=1)'}")

# Create target variable
if scaling_factor_col:
    df["target_diff"] = df[ground_col] - (df[satellite_col] * df[scaling_factor_col])
else:
    # If using Aerosol_Index, we need a reasonable scaling (AOD typically ranges -2 to 2)
    # For PM2.5, a rough conversion: PM2.5 ‚âà AOD * 50-100 (this is approximate)
    if satellite_col == 'Aerosol_Index_satellite':
        # Use a scaling factor to convert AOD to approximate PM2.5
        # This is a rough estimate - adjust based on domain knowledge
        scaling_factor = 50.0  # Approximate conversion factor
        df["target_diff"] = df[ground_col] - (df[satellite_col] * scaling_factor)
        print(f"Using scaling factor {scaling_factor} for Aerosol_Index to PM2.5 conversion")
    else:
        df["target_diff"] = df[ground_col] - df[satellite_col]

print(f"\nTarget variable statistics:")
print(df["target_diff"].describe())

# Drop unrealistic values
print("\nFiltering unrealistic values...")
initial_shape = df.shape[0]
df = df[(df[ground_col] >= -10) & (df[ground_col] <= 1000)]
if satellite_col != 'Aerosol_Index_satellite':
    df = df[(df[satellite_col] >= -10) & (df[satellite_col] <= 1000)]
df = df[df["target_diff"].notna()]
final_shape = df.shape[0]
print(f"Removed {initial_shape - final_shape} rows with unrealistic values")
print(f"Final dataset shape: {df.shape}")


‚ÑπÔ∏è  Using Aerosol_Index_satellite as proxy for satellite PM2.5
   (Satellite data does not have direct PM2.5 measurements - PM2.5/PM10 are ground-exclusive)

Ground PM2.5 column: PM2.5_ground
Satellite PM2.5 column: Aerosol_Index_satellite
Scaling factor column: None (using default=1)
Using scaling factor 50.0 for Aerosol_Index to PM2.5 conversion

Target variable statistics:
count    16936.000000
mean       106.884137
std         78.310584
min       -153.024503
25%         57.002818
50%         92.474619
75%        138.477370
max        643.586179
Name: target_diff, dtype: float64

Filtering unrealistic values...
Removed 0 rows with unrealistic values
Final dataset shape: (16936, 35)


## 3. Feature Engineering & Preprocessing


In [None]:
# Create a complete pipeline with preprocessor and best model
import joblib

# Get the best model class
model_classes = {
    'LinearRegression': LinearRegression,
    'Ridge': lambda: Ridge(alpha=1.0, random_state=42),
    'Lasso': lambda: Lasso(alpha=1.0, random_state=42),
    'RandomForest': lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
}

if XGBOOST_AVAILABLE:
    model_classes['XGBoost'] = lambda: xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Create a fresh instance of the best model
fresh_best_model = model_classes[best_model_name]()

# Create a complete pipeline (preprocessor + model)
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', fresh_best_model)
])

# Retrain on full training data
print(f"Retraining {best_model_name} on full training set...")
best_pipeline.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = best_pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print(f"\n‚úì Pipeline trained successfully")
print(f"  Test RMSE: {test_rmse:.4f}")
print(f"  Test MAE: {test_mae:.4f}")
print(f"  Test R¬≤: {test_r2:.4f}")

# Save the complete pipeline to disk
best_model_path = MODELS_DIR / "best_model_pipeline.pkl"
joblib.dump(best_pipeline, best_model_path)
print(f"\n‚úì Best model pipeline saved to: {best_model_path}")

# Also save metadata for easy loading
metadata = {
    'model_name': best_model_name,
    'feature_columns': feature_cols,
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'ground_col': ground_col,
    'satellite_col': satellite_col,
    'scaling_factor': 50.0 if satellite_col == 'Aerosol_Index_satellite' else 1.0,
    'test_rmse': float(test_rmse),
    'test_mae': float(test_mae),
    'test_r2': float(test_r2)
}

import json
metadata_path = MODELS_DIR / "model_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"‚úì Model metadata saved to: {metadata_path}")

# Save using MLflow as well
with mlflow.start_run(run_name=f"{best_model_name}_final"):
    mlflow.log_param("model_name", f"{best_model_name}_final")
    mlflow.log_param("best_model", best_model_name)
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)
    mlflow.sklearn.log_model(best_pipeline, "best_model")
    print(f"‚úì Model also saved to MLflow")


## 9. Load and Use Saved Model (Example)


In [None]:
# Example: How to load and use the saved model in your code
import joblib
import json
import pandas as pd
from pathlib import Path

# Paths (works in both notebook and script)
try:
    # In notebook
    MODELS_DIR = PROJECT_ROOT / "models"
except NameError:
    # In standalone script
    PROJECT_ROOT = Path(__file__).parent.parent
    MODELS_DIR = PROJECT_ROOT / "models"
MODEL_PATH = MODELS_DIR / "best_model_pipeline.pkl"
METADATA_PATH = MODELS_DIR / "model_metadata.json"

# Load the model pipeline
print("Loading saved model...")
loaded_pipeline = joblib.load(MODEL_PATH)
print("‚úì Model loaded successfully!")

# Load metadata
with open(METADATA_PATH, 'r') as f:
    metadata = json.load(f)
print(f"\nModel Info:")
print(f"  Model Type: {metadata['model_name']}")
print(f"  Test RMSE: {metadata['test_rmse']:.4f}")
print(f"  Test R¬≤: {metadata['test_r2']:.4f}")

# Example: Make predictions on new data
def predict_discrepancy(features_df):
    """
    Predict discrepancy between ground and satellite PM2.5.
    
    Args:
        features_df: DataFrame with all required feature columns
        
    Returns:
        Array of predicted discrepancies
    """
    return loaded_pipeline.predict(features_df)


def predict_corrected_pm25(features_df, satellite_value):
    """
    Predict corrected PM2.5 using satellite value and predicted discrepancy.
    
    Args:
        features_df: DataFrame with all required feature columns
        satellite_value: Satellite AOD value (Aerosol_Index_satellite)
        
    Returns:
        Array of corrected PM2.5 values
    """
    predicted_diff = predict_discrepancy(features_df)
    scaling_factor = metadata['scaling_factor']
    corrected_pm25 = (satellite_value * scaling_factor) + predicted_diff
    return corrected_pm25


# Example usage with sample data
print("\n" + "="*60)
print("Example: Making predictions on new data")
print("="*60)

# Create sample feature data (you would replace this with your actual data)
sample_data = pd.DataFrame({
    'NO2_satellite': [0.000191],
    'SO2_satellite': [-0.000433],
    'CO_satellite': [0.048550],
    'O3_satellite': [0.164568],
    'location': ['Anand Vihar, Delhi'],
    'PM10_ground': [449.58],
    'NO2_ground': [54.76],
    'SO2_ground': [10.5],
    'CO_ground': [2.1],
    'O3_ground': [60.0],
    'lat': [28.65],
    'lon': [77.31],
    'distance_to_major_road': [150.0],
    'total_road_length_m': [5000.0],
    'major_road_length_m': [1000.0],
    'pct_green': [15.5],
    'pct_industrial': [20.0],
    'pct_residential': [40.0],
    'building_density': [0.8],
    'avg_building_area_m2': [800.0],
    'median_building_area_m2': [350.0],
    'building_count': [500.0],
    'major_road_fraction': [0.19],
    'month': [1],
    'day_of_week': [2],
    'season': ['Winter'],
    'NO2_ratio': [286676.0],
    'SO2_ratio': [-24850.0],
    'CO_ratio': [75.0],
    'O3_ratio': [365.0]
})

# Ensure all required columns are present
required_cols = metadata['feature_columns']
for col in required_cols:
    if col not in sample_data.columns:
        print(f"Warning: Missing column {col}, using default value")
        sample_data[col] = 0  # or use appropriate default

# Reorder columns to match training data
sample_data = sample_data[required_cols]

# Make predictions
satellite_aod = -1.098919  # Example AOD value
predicted_diff = predict_discrepancy(sample_data)
corrected_pm25 = predict_corrected_pm25(sample_data, satellite_aod)

print(f"\nPredicted Discrepancy: {predicted_diff[0]:.2f}")
print(f"Satellite AOD: {satellite_aod}")
print(f"Corrected PM2.5: {corrected_pm25[0]:.2f}")
print(f"\n‚úì Prediction complete!")

print("\n" + "="*60)
print("To use in your own code:")
print("="*60)
print("""
# In your Python script:
import joblib
from pathlib import Path

MODEL_PATH = Path("models/best_model_pipeline.pkl")
model = joblib.load(MODEL_PATH)

# Prepare your data as a DataFrame with all feature columns
# Then simply call:
predictions = model.predict(your_dataframe)
""")


In [8]:
# Identify feature columns
# Exclude target, date, location (we'll encode location), and other non-feature columns
exclude_cols = ['date', 'target_diff', ground_col, satellite_col, 'notes']
if scaling_factor_col:
    exclude_cols.append(scaling_factor_col)

# Separate features and target
feature_cols = [col for col in df.columns if col not in exclude_cols]
X = df[feature_cols].copy()
y = df["target_diff"].copy()

print(f"Feature columns ({len(feature_cols)}):")
print(feature_cols)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")

# Handle missing values
print(f"\nMissing values in features:")
missing_counts = X.isnull().sum()
if missing_counts.sum() > 0:
    print(missing_counts[missing_counts > 0])
    # Fill numerical with median, categorical with mode
    for col in numerical_cols:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)
    for col in categorical_cols:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].mode()[0] if len(X[col].mode()) > 0 else 'Unknown', inplace=True)
    print("Missing values filled.")
else:
    print("No missing values found.")

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

print("\nPreprocessing pipeline created.")


Feature columns (30):
['NO2_satellite', 'SO2_satellite', 'CO_satellite', 'O3_satellite', 'location', 'PM10_ground', 'NO2_ground', 'SO2_ground', 'CO_ground', 'O3_ground', 'lat', 'lon', 'distance_to_major_road', 'total_road_length_m', 'major_road_length_m', 'pct_green', 'pct_industrial', 'pct_residential', 'building_density', 'avg_building_area_m2', 'median_building_area_m2', 'building_count', 'major_road_fraction', 'month', 'day_of_week', 'season', 'NO2_ratio', 'SO2_ratio', 'CO_ratio', 'O3_ratio']

Categorical columns (2): ['location', 'season']
Numerical columns (28): ['NO2_satellite', 'SO2_satellite', 'CO_satellite', 'O3_satellite', 'PM10_ground', 'NO2_ground', 'SO2_ground', 'CO_ground', 'O3_ground', 'lat', 'lon', 'distance_to_major_road', 'total_road_length_m', 'major_road_length_m', 'pct_green', 'pct_industrial', 'pct_residential', 'building_density', 'avg_building_area_m2', 'median_building_area_m2', 'building_count', 'major_road_fraction', 'month', 'day_of_week', 'NO2_ratio', 'SO2

## 4. Train-Test Split


In [9]:
# Train-test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test target shape: {y_test.shape}")

# Fit preprocessor on training data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"\nProcessed training set shape: {X_train_processed.shape}")
print(f"Processed test set shape: {X_test_processed.shape}")


Training set shape: (13548, 30)
Test set shape: (3388, 30)
Training target shape: (13548,)
Test target shape: (3388,)

Processed training set shape: (13548, 38)
Processed test set shape: (3388, 38)


## 5. Model Training with MLflow Tracking


In [10]:
# Enable MLflow autologging
mlflow.sklearn.autolog()

# Define models to train
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=1.0, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

if XGBOOST_AVAILABLE:
    models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Store results
results = []

print("Starting model training with MLflow tracking...\n")


Starting model training with MLflow tracking...



In [12]:
# Train each model
for model_name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}...")
    print(f"{'='*60}")
    
    with mlflow.start_run(run_name=model_name):
        # Log model name
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))
        
        # Train model
        model.fit(X_train_processed, y_train)
        
        # Predictions
        y_train_pred = model.predict(X_train_processed)
        y_test_pred = model.predict(X_test_processed)
        
        # Calculate metrics
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        train_r2 = r2_score(y_train, y_train_pred)
        
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        test_r2 = r2_score(y_test, y_test_pred)
        
        # Log metrics
        mlflow.log_metric("train_mae", train_mae)
        mlflow.log_metric("train_rmse", train_rmse)
        mlflow.log_metric("train_r2", train_r2)
        mlflow.log_metric("test_mae", test_mae)
        mlflow.log_metric("test_rmse", test_rmse)
        mlflow.log_metric("test_r2", test_r2)
        
        # Log feature importance if available
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            feature_names = preprocessor.get_feature_names_out()
            importance_df = pd.DataFrame({
                'feature': feature_names,
                'importance': importances
            }).sort_values('importance', ascending=False)
            
            # Log top 10 features
            top_features = importance_df.head(10)
            for idx, row in top_features.iterrows():
                # Sanitize feature name for MLflow (replace invalid characters)
                # MLflow allows: alphanumerics, underscores, dashes, periods, spaces, and slashes
                feature_name = str(row['feature'])
                # Replace commas and other invalid chars with underscores
                sanitized_name = feature_name.replace(',', '_').replace('(', '_').replace(')', '_')
                sanitized_name = sanitized_name.replace('[', '_').replace(']', '_').replace('{', '_').replace('}', '_')
                sanitized_name = sanitized_name.replace(':', '_').replace(';', '_').replace('=', '_')
                # Replace multiple underscores with single underscore
                while '__' in sanitized_name:
                    sanitized_name = sanitized_name.replace('__', '_')
                mlflow.log_metric(f"feature_importance_{sanitized_name}", row['importance'])
        
        # Create and save plots
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))
        
        # Residual plot
        residuals = y_test - y_test_pred
        axes[0].scatter(y_test_pred, residuals, alpha=0.5)
        axes[0].axhline(y=0, color='r', linestyle='--')
        axes[0].set_xlabel('Predicted Values')
        axes[0].set_ylabel('Residuals')
        axes[0].set_title(f'{model_name} - Residual Plot')
        axes[0].grid(True, alpha=0.3)
        
        # Predicted vs Actual
        axes[1].scatter(y_test, y_test_pred, alpha=0.5)
        min_val = min(y_test.min(), y_test_pred.min())
        max_val = max(y_test.max(), y_test_pred.max())
        axes[1].plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)
        axes[1].set_xlabel('Actual Values')
        axes[1].set_ylabel('Predicted Values')
        axes[1].set_title(f'{model_name} - Predicted vs Actual')
        axes[1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        
        # Save plots
        plot_path = f"plots_{model_name}.png"
        plt.savefig(plot_path, dpi=150, bbox_inches='tight')
        mlflow.log_artifact(plot_path)
        plt.close()
        
        # Save model
        mlflow.sklearn.log_model(model, "model")
        
        # Store results
        results.append({
            'model': model_name,
            'train_mae': train_mae,
            'train_rmse': train_rmse,
            'train_r2': train_r2,
            'test_mae': test_mae,
            'test_rmse': test_rmse,
            'test_r2': test_r2
        })
        
        print(f"‚úì {model_name} trained successfully")
        print(f"  Test MAE: {test_mae:.4f}")
        print(f"  Test RMSE: {test_rmse:.4f}")
        print(f"  Test R¬≤: {test_r2:.4f}")

print(f"\n{'='*60}")
print("All models trained successfully!")
print(f"{'='*60}")





Training LinearRegression...




‚úì LinearRegression trained successfully
  Test MAE: 44.6432
  Test RMSE: 57.9994
  Test R¬≤: 0.4885

Training Ridge...




‚úì Ridge trained successfully
  Test MAE: 44.6425
  Test RMSE: 57.9994
  Test R¬≤: 0.4885

Training Lasso...




‚úì Lasso trained successfully
  Test MAE: 44.5995
  Test RMSE: 58.2992
  Test R¬≤: 0.4832

Training RandomForest...




‚úì RandomForest trained successfully
  Test MAE: 29.1926
  Test RMSE: 39.3684
  Test R¬≤: 0.7643

Training GradientBoosting...




‚úì GradientBoosting trained successfully
  Test MAE: 34.3096
  Test RMSE: 45.2439
  Test R¬≤: 0.6887

Training XGBoost...




‚úì XGBoost trained successfully
  Test MAE: 30.3293
  Test RMSE: 40.2996
  Test R¬≤: 0.7530

All models trained successfully!


## 6. Model Comparison & Selection


In [13]:
# Create results DataFrame
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('test_rmse')

print("Model Performance Summary:")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Select best model (lowest RMSE)
best_model_name = results_df.iloc[0]['model']
best_model = models[best_model_name]

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Test RMSE: {results_df.iloc[0]['test_rmse']:.4f}")
print(f"   Test MAE: {results_df.iloc[0]['test_mae']:.4f}")
print(f"   Test R¬≤: {results_df.iloc[0]['test_r2']:.4f}")

# Save results to CSV
results_csv_path = PROJECT_ROOT / "model_results.csv"
results_df.to_csv(results_csv_path, index=False)
print(f"\n‚úì Results saved to: {results_csv_path}")


Model Performance Summary:
           model  train_mae  train_rmse  train_r2  test_mae  test_rmse  test_r2
    RandomForest  11.026185   14.973852  0.962759 29.192581  39.368394 0.764320
    RandomForest  11.026185   14.973852  0.962759 29.192581  39.368394 0.764320
         XGBoost  17.777633   23.455856  0.908619 30.329279  40.299589 0.753039
GradientBoosting  33.976017   43.797367  0.681397 34.309580  45.243949 0.688722
           Ridge  43.955502   56.693876  0.466141 44.642529  57.999395 0.488467
           Ridge  43.955502   56.693876  0.466141 44.642529  57.999395 0.488467
LinearRegression  43.956353   56.693872  0.466141 44.643166  57.999447 0.488466
LinearRegression  43.956353   56.693872  0.466141 44.643166  57.999447 0.488466
           Lasso  43.876452   57.074207  0.458954 44.599535  58.299171 0.483165
           Lasso  43.876452   57.074207  0.458954 44.599535  58.299171 0.483165

üèÜ Best Model: RandomForest
   Test RMSE: 39.3684
   Test MAE: 29.1926
   Test R¬≤: 0.7643

## 7. Save Best Model


In [None]:
# Save all trained models as complete pipelines (preprocessor + model)
import joblib
import json

# Get the model classes
model_classes = {
    'LinearRegression': LinearRegression,
    'Ridge': lambda: Ridge(alpha=1.0, random_state=42),
    'Lasso': lambda: Lasso(alpha=1.0, random_state=42),
    'RandomForest': lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
}

if XGBOOST_AVAILABLE:
    model_classes['XGBoost'] = lambda: xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Get all model names that were trained
all_model_names = list(model_classes.keys())

print(f"Saving {len(all_model_names)} models as complete pipelines...")
print("="*60)

# Store all saved models info
saved_models_info = {}
all_models_metadata = {
    'feature_columns': feature_cols,
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'ground_col': ground_col,
    'satellite_col': satellite_col,
    'scaling_factor': 50.0 if satellite_col == 'Aerosol_Index_satellite' else 1.0,
    'models': {}
}

# Save each model as a complete pipeline
for model_name in all_model_names:
    print(f"\nProcessing {model_name}...")
    
    # Create a fresh instance of the model
    fresh_model = model_classes[model_name]()
    
    # Create a complete pipeline (preprocessor + model)
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', fresh_model)
    ])
    
    # Retrain on full training data
    print(f"  Retraining {model_name} on full training set...")
    model_pipeline.fit(X_train, y_train)
    
    # Evaluate on test set
    y_test_pred = model_pipeline.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Save the pipeline to disk using joblib
    model_filename = f"{model_name.lower()}_pipeline.pkl"
    model_path = MODELS_DIR / model_filename
    joblib.dump(model_pipeline, model_path)
    
    # Store model info
    saved_models_info[model_name] = {
        'filename': model_filename,
        'path': str(model_path),
        'test_rmse': float(test_rmse),
        'test_mae': float(test_mae),
        'test_r2': float(test_r2)
    }
    
    all_models_metadata['models'][model_name] = {
        'test_rmse': float(test_rmse),
        'test_mae': float(test_mae),
        'test_r2': float(test_r2),
        'filename': model_filename
    }
    
    print(f"  ‚úì {model_name} saved to: {model_path}")
    print(f"    Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}, Test R¬≤: {test_r2:.4f}")

# Save comprehensive metadata for all models
metadata_path = MODELS_DIR / "all_models_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(all_models_metadata, f, indent=2)
print(f"\n‚úì All models metadata saved to: {metadata_path}")

# Also save a summary with best model info
summary = {
    'best_model': best_model_name,
    'best_model_info': saved_models_info[best_model_name],
    'all_models': saved_models_info,
    'feature_columns': feature_cols,
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'ground_col': ground_col,
    'satellite_col': satellite_col,
    'scaling_factor': 50.0 if satellite_col == 'Aerosol_Index_satellite' else 1.0
}

summary_path = MODELS_DIR / "models_summary.json"
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)
print(f"‚úì Models summary saved to: {summary_path}")

# Create a symlink/copy of the best model for convenience
best_model_path = MODELS_DIR / "best_model_pipeline.pkl"
best_model_source = MODELS_DIR / saved_models_info[best_model_name]['filename']
import shutil
shutil.copy(best_model_source, best_model_path)
print(f"‚úì Best model ({best_model_name}) also saved as: {best_model_path}")

print(f"\n{'='*60}")
print("All models saved successfully!")
print(f"{'='*60}")
print(f"\nSaved models:")
for model_name, info in saved_models_info.items():
    marker = " ‚≠ê BEST" if model_name == best_model_name else ""
    print(f"  - {model_name}: {info['filename']} (RMSE: {info['test_rmse']:.4f}){marker}")

print(f"\nTo load a model:")
print(f"  import joblib")
print(f"  model = joblib.load('models/{{model_name}}_pipeline.pkl')")
print(f"  predictions = model.predict(your_dataframe)")
print(f"{'='*60}")


NameError: name 'LinearRegression' is not defined

## 8. Prediction Function (Optional - for making predictions on new data)


In [None]:
# Function to predict discrepancy and corrected PM2.5
def predict_discrepancy(model_pipeline, features_df):
    """
    Predict the discrepancy between ground and satellite PM2.5.
    
    Args:
        model_pipeline: Trained model pipeline (preprocessor + model)
        features_df: DataFrame with feature columns
        
    Returns:
        Array of predicted discrepancies
    """
    return model_pipeline.predict(features_df)


def predict_corrected_pm25(model_pipeline, features_df, satellite_value, scaling_factor=1.0):
    """
    Predict corrected PM2.5 using satellite value and predicted discrepancy.
    
    Formula: corrected_pm25 = satellite_value * scaling_factor + predicted_difference
    
    Args:
        model_pipeline: Trained model pipeline
        features_df: DataFrame with feature columns
        satellite_value: Satellite PM2.5 or AOD value
        scaling_factor: Scaling factor for satellite value (default: 1.0)
        
    Returns:
        Array of corrected PM2.5 values
    """
    predicted_diff = predict_discrepancy(model_pipeline, features_df)
    corrected_pm25 = (satellite_value * scaling_factor) + predicted_diff
    return corrected_pm25


# Example: Load saved model and make predictions
# Uncomment and modify as needed:
"""
import joblib

# Load the saved model
loaded_model = joblib.load(best_model_path)

# Example: Predict on new data
# new_data = pd.DataFrame({
#     'NO2_satellite': [0.0002],
#     'SO2_satellite': [-0.0004],
#     'CO_satellite': [0.05],
#     'O3_satellite': [0.16],
#     'Aerosol_Index_satellite': [-1.0],
#     'location': ['Anand Vihar, Delhi'],
#     'month': [1],
#     'day_of_week': [2],
#     'season': ['Winter'],
#     # ... add all required features
# })

# predicted_diff = predict_discrepancy(loaded_model, new_data)
# corrected_pm25 = predict_corrected_pm25(loaded_model, new_data, -1.0, scaling_factor=50.0)

# print(f"Predicted discrepancy: {predicted_diff[0]:.2f}")
# print(f"Corrected PM2.5: {corrected_pm25[0]:.2f}")
"""

print("Prediction functions defined. Uncomment the example code above to use them.")


In [None]:
# Create a fresh instance of the best model (since previous one was trained on processed data)
model_classes = {
    'LinearRegression': LinearRegression,
    'Ridge': lambda: Ridge(alpha=1.0, random_state=42),
    'Lasso': lambda: Lasso(alpha=1.0, random_state=42),
    'RandomForest': lambda: RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'GradientBoosting': lambda: GradientBoostingRegressor(n_estimators=100, random_state=42),
}

if XGBOOST_AVAILABLE:
    model_classes['XGBoost'] = lambda: xgb.XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Create a fresh model instance
fresh_best_model = model_classes[best_model_name]()

# Create a complete pipeline (preprocessor + model)
best_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', fresh_best_model)
])

# Retrain on full training data
best_pipeline.fit(X_train, y_train)

# Save the best model using MLflow
with mlflow.start_run(run_name=f"{best_model_name}_final"):
    mlflow.log_param("model_name", f"{best_model_name}_final")
    mlflow.log_param("best_model", best_model_name)
    
    # Evaluate on test set
    y_test_pred = best_pipeline.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)
    
    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("test_r2", test_r2)
    
    # Save the complete pipeline
    mlflow.sklearn.log_model(best_pipeline, "best_model")
    
    # Also save to local directory
    import joblib
    best_model_path = MODELS_DIR / "best_model.pkl"
    joblib.dump(best_pipeline, best_model_path)
    print(f"‚úì Best model saved to: {best_model_path}")

print(f"\n‚úì Final model saved using MLflow")
print(f"  Model: {best_model_name}")
print(f"  Test RMSE: {test_rmse:.4f}")
print(f"  Test MAE: {test_mae:.4f}")
print(f"  Test R¬≤: {test_r2:.4f}")
