In [1]:
!pip install -U -q numpy scikit-learn pandas xgboost lightgbm category_encoders matplotlib seaborn cloudpickle shap optuna

In [2]:
# REPRODUCIBILITY: import the basics, then seed the global RNGs before any modelling code runs
import numpy as np
import random
import os
from spatio_temporal import *

# Directory that is passed as `model_dir` to the pipeline stages below
# (it is searched for feature_generator_<N>.pkl files).
DIR = "model_spatiotemporal"

# Fix the hash seed variable and seed both global RNGs (Python's and NumPy's).
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only reaches child processes — confirm.
os.environ.update(PYTHONHASHSEED='0')
random.seed(42)
np.random.seed(42)

print("✅ All random seeds set for reproducibility")

✅ All random seeds set for reproducibility


In [3]:
import pandas as pd
# Read the train and test tables from the working directory (train first, then test)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
import pandas as pd

# The data has no year column, so a placeholder year is used for the timestamps
year = 2023

# Order the training rows chronologically:
#   1. build a temporary 'datetime' column from day_of_year (+ hour as an offset),
#      moved to the placeholder year; unparsable days become NaT (errors='coerce')
#   2. sort by it
#   3. drop the helper column again
#   4. renumber the index 0..n-1 (marked CRUCIAL by the author — presumably so that
#      index labels agree with the positional indices used by the CV code; confirm)
train_df = (
    train_df
    .assign(
        datetime=lambda frame: (
            pd.to_datetime(frame['day_of_year'], format='%j', errors='coerce')
            + pd.to_timedelta(frame['hour'], unit='h')
        ).apply(lambda stamp: stamp.replace(year=year) if pd.notnull(stamp) else stamp)
    )
    .sort_values(by='datetime')
    .drop(columns='datetime')
    .reset_index(drop=True)
)

In [5]:
# Compare the train and test distributions with the project's analyzer.
# `temporal_stats` is reused later as the `test_distribution` of the feature builder.
print("\n: Analyzing distribution shifts...")
analyzer = SpatioTemporalDistributionAnalyzer()
spatial_stats, temporal_stats = analyzer.analyze(train_df, test_df)

# List only the temporal features whose KS p-value is below 0.05
print("Significant shifts detected in:")
for feature, stats in temporal_stats.items():
    if not stats['ks_pvalue'] < 0.05:
        continue
    print(f"  - {feature} (Wasserstein distance: {stats['wasserstein_distance']:.3f})")


: Analyzing distribution shifts...
Analyzing distribution differences...

Spatial Distribution:
  Latitude KS: 0.3457 (p=1.52e-214)
  Longitude KS: 0.2957 (p=3.77e-156)

Temporal Distribution Shifts:
  hour: Wasserstein=3.3287 (significant, p=1.29e-252)
  day_of_week: Wasserstein=1.8408 (significant, p=4.61e-321)
  month: Wasserstein=6.1064 (significant, p=0.00e+00)
  day_of_year: Wasserstein=175.2113 (significant, p=0.00e+00)
Significant shifts detected in:
  - hour (Wasserstein distance: 3.329)
  - day_of_week (Wasserstein distance: 1.841)
  - month (Wasserstein distance: 6.106)
  - day_of_year (Wasserstein distance: 175.211)


In [None]:
# Training side: the target is `pollution_value`; `id` is not a feature
y_train = train_df["pollution_value"].copy()
X_train = train_df.drop(columns=["id", "pollution_value"])

# Test side: keep the ids for the submission file, drop them from the features
test_ids = test_df["id"].copy()
X_test = test_df.drop(columns="id").copy()

Loaded and transformed with feature_generator_1.pkl successfully.
Loaded and transformed with feature_generator_2.pkl successfully.
Loaded and transformed with feature_generator_3.pkl successfully.


In [None]:
# ==============================================================================
# PHASE 2: ENHANCED CV PIPELINE WITH FULL FEATURE ENGINEERING
# ==============================================================================

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import optuna
from tabularaml.generate.features import FeatureGenerator
from copy import deepcopy
import warnings
import os
import re
warnings.filterwarnings('ignore')

def competition_metric(rmse: float) -> float:
    """Convert an RMSE into the competition score ``exp(-RMSE / 100)``.

    An RMSE of 0 maps to a score of 1; the score decays towards 0 as the
    RMSE grows.
    """
    scaled_error = rmse / 100
    return np.exp(-scaled_error)

def load_and_apply_feature_generators(X_train_fold, X_val_fold, X_test_fold, model_dir):
    """
    Load the saved feature generators from ``model_dir`` and apply them in order.

    Files named exactly ``feature_generator_<N>.pkl`` are applied in ascending
    ``N``. Each generator is re-fitted on the train fold (``fit_transform``) and
    then used to ``transform`` the validation and test folds.

    A generator is applied to all frames or to none of them: if loading, fitting
    or transforming fails for any frame, that generator is skipped and the frames
    from before the attempt are kept, so train/val/test never end up with
    different feature sets.

    Args:
        X_train_fold: Training features; every generator is fitted on this frame.
        X_val_fold: Validation features, or ``None`` when there is no validation
            fold (then ``None`` is returned in its place).
        X_test_fold: Test features.
        model_dir: Directory containing the saved feature generators.

    Returns:
        Tuple ``(X_train_fold, X_val_fold, X_test_fold)`` after every generator
        that could be applied. The inputs are returned unchanged when
        ``model_dir`` does not exist or holds no generator files.
    """
    if not os.path.isdir(model_dir):
        return X_train_fold, X_val_fold, X_test_fold

    # fullmatch: names such as "feature_generator_1.pkl.bak" must not be picked up
    fg_files = [f for f in os.listdir(model_dir) if re.fullmatch(r"feature_generator_\d+\.pkl", f)]
    # Numeric sort so that generator 10 runs after generator 2
    fg_files.sort(key=lambda name: int(re.search(r"\d+", name).group()))

    for fg_file in fg_files:
        try:
            # NOTE(review): the generators come from .pkl files (presumably unpickled
            # by FeatureGenerator.load) — only point model_dir at trusted files.
            fg = FeatureGenerator.load(os.path.join(model_dir, fg_file))
            # Work on temporaries: nothing is committed until all frames succeeded
            new_train = fg.fit_transform(X_train_fold)
            new_val = fg.transform(X_val_fold) if X_val_fold is not None else None
            new_test = fg.transform(X_test_fold)
        except Exception as e:
            # Best effort: skip this generator for *all* frames and carry on
            print(f"Failed to apply {fg_file} (skipped for train/val/test): {e}")
            continue
        X_train_fold, X_val_fold, X_test_fold = new_train, new_val, new_test

    return X_train_fold, X_val_fold, X_test_fold

def enhanced_cv_pipeline(X_train, y_train, X_test, temporal_stats, model_dir="model_spatiotemporal"):
    """
    Enhanced CV pipeline with full feature engineering inside CV loops.
    Order: AdvancedSpatioTemporalFeatures FIRST, then AFE pickles.

    For every fold produced by SpatioTemporalCV the feature builders are fitted on
    the train part only, NaNs are filled with train-fold column means, features are
    standardised, and an XGBoost regressor with fixed hyperparameters is trained
    with per-sample weights (domain-adaptation weights, January rows boosted x2.5,
    clipped to [0.1, 15.0]).

    Args:
        X_train: Training features; must contain 'latitude', 'longitude', 'hour',
            'month', 'day_of_week' and 'day_of_year' (all are read below).
        y_train: Training target, indexed positionally with .iloc.
        X_test: Test features; used to configure the CV splitter and the domain
            adapter, and transformed in every fold.
        temporal_stats: Passed to AdvancedSpatioTemporalFeatures as `test_distribution`.
        model_dir: Directory searched for feature_generator_<N>.pkl files.

    Returns:
        dict with keys 'cv_scores' (per-fold RMSE list), 'overall_cv_score' (their
        mean), 'january_cv_score' (mean RMSE over January validation rows, or the
        overall score when no fold had any), 'sample_weights', 'feature_engineering'
        (the feature builder fitted in the LAST fold) and 'competition_score'.
    """
    print("=== ENHANCED CV PIPELINE ===")
    
    # Setup CV splitter with optimized parameters
    # NOTE(review): presumably builds folds whose validation part resembles the
    # test set in space/time — the splitter is defined in spatio_temporal; confirm.
    cv = SpatioTemporalCV(
        n_splits=5,
        test_spatial_coords=X_test[['latitude', 'longitude']].values,
        test_temporal_features=X_test[['hour', 'month', 'day_of_week', 'day_of_year']],
        spatial_weight=0.3,  # Emphasize temporal matching for January scarcity
        random_state=42
    )
    
    # Domain adaptation for January weighting
    domain_adapter = TemporalDomainAdaptation()
    domain_adapter.fit(X_train, X_test)
    sample_weights = domain_adapter.get_weights()
    
    # Boost January samples more aggressively
    # (the weights object returned by the adapter is modified in place here)
    january_mask = X_train['month'] == 1
    january_boost = 2.5  # Strong boost for January samples
    sample_weights[january_mask] *= january_boost
    sample_weights = np.clip(sample_weights, 0.1, 15.0)  # Clip to reasonable range
    
    print(f"January samples: {january_mask.sum()}, Average January weight: {sample_weights[january_mask].mean():.2f}")
    
    cv_scores = []       # RMSE of every fold
    january_scores = []  # RMSE on January validation rows, only for folds that have some
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        # The "/5" is hard-coded and mirrors n_splits=5 above
        print(f"Fold {fold + 1}/5...")
        
        # Split data
        X_train_fold = X_train.iloc[train_idx].copy()
        X_val_fold = X_train.iloc[val_idx].copy()
        X_test_fold = X_test.copy()
        y_train_fold = y_train.iloc[train_idx]
        y_val_fold = y_train.iloc[val_idx]
        
        # STEP 1: Apply AdvancedSpatioTemporalFeatures FIRST
        fe_full = AdvancedSpatioTemporalFeatures(
            row_only=False,  
            n_spatial_clusters=20,
            n_temporal_clusters=10,
            january_bridge_features=True,
            test_distribution=temporal_stats,
            use_distribution_matching=True
        )
        
        # Fitted on the train fold only; val/test are transformed with that fit
        X_train_enhanced = fe_full.fit_transform(X_train_fold, y_train_fold)
        X_val_enhanced = fe_full.transform(X_val_fold)
        X_test_enhanced = fe_full.transform(X_test_fold)
        
        # STEP 2: Apply AFE feature generators SECOND
        X_train_enhanced, X_val_enhanced, X_test_enhanced = load_and_apply_feature_generators(
            X_train_enhanced, X_val_enhanced, X_test_enhanced, model_dir
        )
        # NOTE(review): X_test_enhanced is not used again in this function.
        
        # Handle NaNs using train fold statistics only
        train_means = X_train_enhanced.mean()
        X_train_enhanced = X_train_enhanced.fillna(train_means)
        X_val_enhanced = X_val_enhanced.fillna(train_means)
        
        # Scale using train fold statistics
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_enhanced)
        X_val_scaled = scaler.transform(X_val_enhanced)
        
        # Get weights for this fold
        weights_train = sample_weights[train_idx]
        
        # Train XGBoost with early stopping in constructor
        model = xgb.XGBRegressor(
            n_estimators=2000,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=1.0,
            reg_lambda=1.0,
            min_child_weight=3,
            early_stopping_rounds=50,
            random_state=42
        )
        
        # NOTE(review): the validation fold drives early stopping here and is also
        # the fold that is scored below, so the reported RMSE may be optimistic.
        model.fit(
            X_train_scaled, y_train_fold,
            sample_weight=weights_train,
            eval_set=[(X_val_scaled, y_val_fold)],
            verbose=False
        )
        
        # Evaluate
        val_pred = model.predict(X_val_scaled)
        fold_rmse = np.sqrt(mean_squared_error(y_val_fold, val_pred))
        cv_scores.append(fold_rmse)
        
        # Track January performance if any January samples in validation
        january_val_mask = X_train.iloc[val_idx]['month'] == 1
        if january_val_mask.sum() > 0:
            january_rmse = np.sqrt(mean_squared_error(
                y_val_fold[january_val_mask], 
                val_pred[january_val_mask]
            ))
            january_scores.append(january_rmse)
    
    # Results
    overall_cv_score = np.mean(cv_scores)
    # Falls back to the overall score when no fold contained January rows
    january_cv_score = np.mean(january_scores) if january_scores else overall_cv_score
    
    print(f"Overall CV RMSE: {overall_cv_score:.4f}")
    print(f"January CV RMSE: {january_cv_score:.4f}")
    print(f"Competition Score: {competition_metric(overall_cv_score):.6f}")
    
    return {
        'cv_scores': cv_scores,
        'overall_cv_score': overall_cv_score,
        'january_cv_score': january_cv_score,
        'sample_weights': sample_weights,
        'feature_engineering': fe_full,  # builder fitted in the last fold
        'competition_score': competition_metric(overall_cv_score)
    }

In [None]:
# ==============================================================================
# PHASE 3: ADVANCED HYPERPARAMETER OPTIMIZATION WITH OPTUNA
# ==============================================================================

def hyperparameter_optimization(X_train, y_train, X_test, temporal_stats, model_dir="model_spatiotemporal", 
                               n_trials=100, timeout=7200, study_path="optuna_study_spatiotemporal.db"):
    """
    Advanced hyperparameter optimization with January-specific parameters.
    Optimizes competition score: exp(-RMSE/100) (maximize)

    Every trial runs the full 5-fold pipeline (feature builder fitted per fold,
    saved feature generators, mean imputation, scaling, weighted XGBoost) and is
    scored on ``combined_rmse = 0.4 * overall RMSE + 0.6 * January RMSE``. The
    objective returns the negated competition score of ``combined_rmse`` and the
    study minimizes it. The study is persisted in SQLite and resumed if it exists.

    Args:
        X_train: Training features (must contain the spatial/temporal columns read below).
        y_train: Training target.
        X_test: Test features; used for the CV splitter and the domain adapter.
        temporal_stats: Passed to AdvancedSpatioTemporalFeatures as `test_distribution`.
        model_dir: Directory searched for feature_generator_<N>.pkl files.
        n_trials: Number of trials to run in this call (in addition to stored ones).
        timeout: Time limit in seconds handed to ``study.optimize``.
        study_path: Path to SQLite database for persistent storage (default: "optuna_study_spatiotemporal.db")

    Returns:
        Tuple ``(best_params, best_combined_rmse, study)``; ``best_combined_rmse``
        is the 0.4/0.6 blended RMSE of the best trial, not the plain overall RMSE.
    """
    print("=== HYPERPARAMETER OPTIMIZATION ===")

    # Setup CV
    cv = SpatioTemporalCV(
        n_splits=5,
        test_spatial_coords=X_test[['latitude', 'longitude']].values,
        test_temporal_features=X_test[['hour', 'month', 'day_of_week', 'day_of_year']],
        spatial_weight=0.3,
        random_state=42
    )

    # Domain adaptation base weights (January boost and clipping are applied per trial)
    domain_adapter = TemporalDomainAdaptation()
    domain_adapter.fit(X_train, X_test)
    base_sample_weights = domain_adapter.get_weights()

    def objective(trial):
        """Run one 5-fold evaluation for the sampled parameters; return -score."""
        # Suggest hyperparameters including January-specific ones
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 2500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'subsample': trial.suggest_float('subsample', 0.40, 0.95),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.40, 0.95),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 8.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 8.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 12),

            # January-specific parameters
            'january_weight_boost': trial.suggest_float('january_weight_boost', 1.5, 4.0),
            'max_weight_clip': trial.suggest_float('max_weight_clip', 10.0, 20.0),

            # Let Optuna suggest clustering parameters
            'n_spatial_clusters': trial.suggest_int('n_spatial_clusters', 15, 35),
            'n_temporal_clusters': trial.suggest_int('n_temporal_clusters', 8, 18),
        }

        # Cross-validation
        scores = []
        january_scores = []
        january_val_samples = 0  # January rows seen in validation, summed over folds

        for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            # Prepare fold data
            X_train_fold = X_train.iloc[train_idx].copy()
            X_val_fold = X_train.iloc[val_idx].copy()
            X_test_fold = X_test.copy()
            y_train_fold = y_train.iloc[train_idx]
            y_val_fold = y_train.iloc[val_idx]

            # STEP 1: Apply AdvancedSpatioTemporalFeatures FIRST
            fe_trial = AdvancedSpatioTemporalFeatures(
                row_only=False,
                n_spatial_clusters=params['n_spatial_clusters'],
                n_temporal_clusters=params['n_temporal_clusters'],
                january_bridge_features=True,
                test_distribution=temporal_stats,
                use_distribution_matching=True
            )

            X_train_enhanced = fe_trial.fit_transform(X_train_fold, y_train_fold)
            X_val_enhanced = fe_trial.transform(X_val_fold)
            X_test_enhanced = fe_trial.transform(X_test_fold)

            # STEP 2: Apply AFE pickles SECOND
            X_train_enhanced, X_val_enhanced, X_test_enhanced = load_and_apply_feature_generators(
                X_train_enhanced, X_val_enhanced, X_test_enhanced, model_dir
            )

            # Handle NaNs with train-fold means only
            train_means = X_train_enhanced.mean()
            X_train_enhanced = X_train_enhanced.fillna(train_means)
            X_val_enhanced = X_val_enhanced.fillna(train_means)

            # Scale with train-fold statistics
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_enhanced)
            X_val_scaled = scaler.transform(X_val_enhanced)

            # Prepare weights with trial-specific January boosting
            weights_optimized = base_sample_weights[train_idx].copy()
            january_train_mask = X_train.iloc[train_idx]['month'] == 1
            weights_optimized[january_train_mask] *= params['january_weight_boost']
            weights_optimized = np.clip(weights_optimized, 0.1, params['max_weight_clip'])

            # Train model
            model = xgb.XGBRegressor(
                n_estimators=params['n_estimators'],
                learning_rate=params['learning_rate'],
                max_depth=params['max_depth'],
                subsample=params['subsample'],
                colsample_bytree=params['colsample_bytree'],
                reg_alpha=params['reg_alpha'],
                reg_lambda=params['reg_lambda'],
                min_child_weight=params['min_child_weight'],
                early_stopping_rounds=50,
                random_state=42
            )

            # NOTE(review): the validation fold is used for early stopping and for
            # scoring, so trial scores may be optimistic.
            model.fit(
                X_train_scaled, y_train_fold, 
                sample_weight=weights_optimized,
                eval_set=[(X_val_scaled, y_val_fold)],
                verbose=False
            )

            # Evaluate
            val_pred = model.predict(X_val_scaled)
            fold_rmse = np.sqrt(mean_squared_error(y_val_fold, val_pred))
            scores.append(fold_rmse)

            # Track January performance
            january_val_mask = X_train.iloc[val_idx]['month'] == 1
            # Plain int: user attributes are persisted by the storage backend
            n_january_val = int(january_val_mask.sum())
            january_val_samples += n_january_val
            if n_january_val > 0:
                january_rmse = np.sqrt(mean_squared_error(
                    y_val_fold[january_val_mask], 
                    val_pred[january_val_mask]
                ))
                january_scores.append(january_rmse)

        # Calculate score components
        overall_rmse = np.mean(scores)
        # Falls back to the overall RMSE when no fold had January validation rows
        january_rmse = np.mean(january_scores) if january_scores else overall_rmse

        # Combined RMSE weighted towards January
        combined_rmse = 0.4 * overall_rmse + 0.6 * january_rmse

        # Convert to competition scores
        overall_competition_score = competition_metric(overall_rmse)
        january_competition_score = competition_metric(january_rmse)
        combined_competition_score = competition_metric(combined_rmse)

        # Store detailed results in trial user attributes for logging
        trial.set_user_attr("overall_rmse", overall_rmse)
        trial.set_user_attr("january_rmse", january_rmse)
        trial.set_user_attr("combined_rmse", combined_rmse)
        trial.set_user_attr("overall_comp_score", overall_competition_score)
        trial.set_user_attr("january_comp_score", january_competition_score)
        trial.set_user_attr("combined_comp_score", combined_competition_score)
        trial.set_user_attr("january_samples_in_val", january_val_samples)

        # Print trial results
        print(f"Trial {trial.number:3d}: Overall RMSE={overall_rmse:.4f} (Score={overall_competition_score:.6f}), "
              f"Jan RMSE={january_rmse:.4f} (Score={january_competition_score:.6f}), "
              f"Combined RMSE={combined_rmse:.4f} (Score={combined_competition_score:.6f})")

        # Return negative competition score for minimization
        return -combined_competition_score

    # Setup persistent storage
    study_name = "spatiotemporal_competition_optimization"
    storage_url = f"sqlite:///{study_path}"

    # Calculate startup trials (10% of total)
    n_startup_trials = max(1, int(n_trials * 0.1))

    # TPE sampler shared by new and resumed studies
    sampler = optuna.samplers.TPESampler(
        multivariate=True,
        n_startup_trials=n_startup_trials,
        seed=42
    )

    # Use load_if_exists=True for proper resume capability
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction='minimize',  # Minimize negative competition score = maximize competition score
        sampler=sampler,
        load_if_exists=True  # This enables resume functionality
    )

    # Report what is already stored; only COMPLETE trials count as "completed"
    stored_trials = study.trials
    existing_trials = sum(t.state == optuna.trial.TrialState.COMPLETE for t in stored_trials)

    if stored_trials:
        print(f"📁 Resumed existing study with {existing_trials} completed trials. Will run another {n_trials} trials")
    else:
        print(f"🆕 Created new study, will run {n_trials} trials")

    print(f"⚙️  Using TPE sampler: multivariate=True, startup_trials={n_startup_trials}")
    print(f"💾 Storage: {storage_url}")

    # Always runs n_trials more trials (or until the timeout), even when resuming
    print("🚀 Starting optimization...")
    study.optimize(objective, n_trials=n_trials, timeout=timeout)

    # Calculate final metrics
    # NOTE(review): best_value/best_trial are read unconditionally; presumably
    # Optuna raises if no trial has completed (e.g. timeout hit first) — confirm.
    best_negative_score = study.best_value
    best_competition_score = -best_negative_score

    # Get detailed results from best trial
    best_trial = study.best_trial
    best_overall_rmse = best_trial.user_attrs["overall_rmse"]
    best_january_rmse = best_trial.user_attrs["january_rmse"]
    best_combined_rmse = best_trial.user_attrs["combined_rmse"]

    print(f"\n{'='*60}")
    print(f"🏆 OPTIMIZATION COMPLETE ({len(study.trials)} total trials)")
    print(f"{'='*60}")
    print(f"📊 Best Results (Trial #{best_trial.number}):")
    print(f"   Overall RMSE: {best_overall_rmse:.4f} → Score: {competition_metric(best_overall_rmse):.6f}")
    print(f"   January RMSE: {best_january_rmse:.4f} → Score: {competition_metric(best_january_rmse):.6f}")
    print(f"   Combined RMSE: {best_combined_rmse:.4f} → Score: {best_competition_score:.6f}")
    print(f"\n🎯 Best Parameters:")
    for key, value in study.best_params.items():
        print(f"   {key}: {value}")

    # Study statistics (objective values are negated scores, hence the sign flip)
    completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if len(completed_trials) > 1:
        scores = [-t.value for t in completed_trials]
        print(f"\n📈 Study Statistics:")
        print(f"   Best Score: {max(scores):.6f}")
        print(f"   Mean Score: {np.mean(scores):.6f}")
        print(f"   Std Score: {np.std(scores):.6f}")
        print(f"   Improvement: {max(scores) - min(scores):.6f}")

    return study.best_params, best_combined_rmse, study

In [None]:
# ==============================================================================
# PHASE 4: ENSEMBLE STRATEGY AND FINAL MODEL
# ==============================================================================

def create_ensemble_variants(best_params, X_train, y_train, X_test, temporal_stats, model_dir="model_spatiotemporal"):
    """
    Derive four parameter sets from the tuned parameters for ensemble diversity.

    Variants (returned in this order):
        conservative     regularisation x1.3, learning rate x0.8, January boost x0.8
        aggressive       regularisation x0.7, learning rate x1.2 (capped at 0.12),
                         January boost x1.1
        january_focused  January boost x1.4, max_depth - 1 (at least 4),
                         min_child_weight + 1
        optimized        the tuned parameters as they are

    Every variant carries 'january_weight_boost', 'max_weight_clip',
    'n_spatial_clusters' and 'n_temporal_clusters' next to the model parameters.
    Only `best_params` is read; the remaining arguments are unused here.

    Returns:
        dict mapping variant name -> parameter dict.
    """
    print("=== CREATING ENSEMBLE VARIANTS ===")

    non_model_keys = ('january_weight_boost', 'max_weight_clip', 'n_spatial_clusters', 'n_temporal_clusters')

    # Model-only view of the tuned parameters
    tuned = {}
    for key, value in best_params.items():
        if key not in non_model_keys:
            tuned[key] = value

    tuned_boost = best_params['january_weight_boost']

    # Higher regularisation, slower learning, milder January emphasis
    conservative = dict(
        tuned,
        reg_alpha=tuned['reg_alpha'] * 1.3,
        reg_lambda=tuned['reg_lambda'] * 1.3,
        learning_rate=tuned['learning_rate'] * 0.8,
        january_weight_boost=tuned_boost * 0.8,
    )

    # Lower regularisation, faster (but capped) learning, stronger January emphasis
    aggressive = dict(
        tuned,
        reg_alpha=tuned['reg_alpha'] * 0.7,
        reg_lambda=tuned['reg_lambda'] * 0.7,
        learning_rate=min(0.12, tuned['learning_rate'] * 1.2),
        january_weight_boost=tuned_boost * 1.1,
    )

    # Strongest January emphasis with slightly simpler trees
    january_focused = dict(
        tuned,
        january_weight_boost=tuned_boost * 1.4,
        max_depth=max(4, best_params['max_depth'] - 1),
        min_child_weight=best_params['min_child_weight'] + 1,
    )

    ensemble_variants = {
        'conservative': conservative,
        'aggressive': aggressive,
        'january_focused': january_focused,
        'optimized': tuned,
    }

    # Re-attach the non-model parameters; a variant-specific January boost wins
    for variant_params in ensemble_variants.values():
        variant_params.setdefault('january_weight_boost', tuned_boost)
        variant_params['max_weight_clip'] = best_params['max_weight_clip']
        variant_params['n_spatial_clusters'] = best_params['n_spatial_clusters']
        variant_params['n_temporal_clusters'] = best_params['n_temporal_clusters']

    return ensemble_variants

def train_final_ensemble(ensemble_variants, X_train, y_train, X_test, temporal_stats, model_dir="model_spatiotemporal"):
    """
    Fit one XGBoost model per variant on the full training data and predict the test set.

    Per variant: AdvancedSpatioTemporalFeatures first, then the saved feature
    generators, mean imputation with training means, standard scaling, and a
    weighted fit (domain-adaptation weights, January rows multiplied by the
    variant's boost, clipped to [0.1, max_weight_clip]). No early stopping is
    used because there is no validation set at this stage.

    Args:
        ensemble_variants: Mapping variant name -> parameter dict (model parameters
            plus 'january_weight_boost', 'max_weight_clip', 'n_spatial_clusters',
            'n_temporal_clusters').
        X_train, y_train: Full training features and target.
        X_test: Test features.
        temporal_stats: Passed to the feature builder as `test_distribution`.
        model_dir: Directory searched for feature_generator_<N>.pkl files.

    Returns:
        dict mapping variant name -> test predictions.
    """
    print("=== TRAINING FINAL ENSEMBLE ===")

    # Base weights are computed once and copied for every variant
    adapter = TemporalDomainAdaptation()
    adapter.fit(X_train, X_test)
    base_weights = adapter.get_weights()

    pipeline_only_keys = ('january_weight_boost', 'max_weight_clip', 'n_spatial_clusters', 'n_temporal_clusters')
    predictions_by_variant = {}

    for variant_name, params in ensemble_variants.items():
        print(f"Training {variant_name} variant...")

        # Work on copies of the full frames
        train_frame = X_train.copy()
        test_frame = X_test.copy()

        # STEP 1: project feature builder, fitted on all training rows
        feature_builder = AdvancedSpatioTemporalFeatures(
            row_only=False,
            n_spatial_clusters=params['n_spatial_clusters'],
            n_temporal_clusters=params['n_temporal_clusters'],
            january_bridge_features=True,
            test_distribution=temporal_stats,
            use_distribution_matching=True
        )
        train_features = feature_builder.fit_transform(train_frame, y_train)
        test_features = feature_builder.transform(test_frame)

        # STEP 2: saved feature generators; an empty slice stands in for the validation frame
        train_features, _, test_features = load_and_apply_feature_generators(
            train_features, train_features.iloc[:0], test_features, model_dir
        )

        # Impute with training means, then standardise with training statistics
        fill_values = train_features.mean()
        train_features = train_features.fillna(fill_values)
        test_features = test_features.fillna(fill_values)

        scaler = StandardScaler()
        train_matrix = scaler.fit_transform(train_features)
        test_matrix = scaler.transform(test_features)

        # Variant-specific sample weights
        row_weights = base_weights.copy()
        is_january = X_train['month'] == 1
        row_weights[is_january] *= params['january_weight_boost']
        row_weights = np.clip(row_weights, 0.1, params['max_weight_clip'])

        # Keep only what XGBRegressor understands; early stopping needs a validation set
        xgb_kwargs = {}
        for key, value in params.items():
            if key in pipeline_only_keys or key == 'early_stopping_rounds':
                continue
            xgb_kwargs[key] = value

        regressor = xgb.XGBRegressor(**xgb_kwargs, random_state=42)
        regressor.fit(train_matrix, y_train, sample_weight=row_weights)

        predictions_by_variant[variant_name] = regressor.predict(test_matrix)

    return predictions_by_variant

def create_final_submission(ensemble_predictions, X_test, test_ids):
    """
    Blend the variant predictions into the final submission frame.

    Steps: weighted sum of the variants (conservative 0.2, aggressive 0.2,
    january_focused 0.35, optimized 0.25), a 5% uplift for January rows with
    latitude above 45, then clamping to the range [0, 200].

    Args:
        ensemble_predictions: Mapping variant name -> test predictions; every
            name must be one of the four variants above.
        X_test: Test features; 'month' is read, and 'latitude' when January rows exist.
        test_ids: Identifiers written to the 'id' column.

    Returns:
        Tuple ``(submission, final_predictions)``: the DataFrame with columns
        'id' and 'pollution_value', and the blended prediction array.
    """
    print("=== CREATING FINAL SUBMISSION ===")

    blend_weights = dict(
        conservative=0.2,
        aggressive=0.2,
        january_focused=0.35,  # the January-focused model gets the largest share
        optimized=0.25,
    )

    # Accumulate into a float64 buffer, in the iteration order of the input dict
    final_predictions = np.zeros(len(X_test))
    for variant_name in ensemble_predictions:
        share = blend_weights[variant_name]
        final_predictions += share * ensemble_predictions[variant_name]

    # January-specific post-processing: +5% for high-latitude January rows
    january_test_mask = X_test['month'] == 1
    january_present = january_test_mask.sum() > 0
    if january_present:
        high_latitude_january = january_test_mask & (X_test['latitude'] > 45)
        if high_latitude_january.sum() > 0:
            final_predictions[high_latitude_january] *= 1.05

    # Floor at 0, then cap at 200
    final_predictions = np.minimum(np.maximum(final_predictions, 0), 200)

    submission = pd.DataFrame({
        'id': test_ids,
        'pollution_value': final_predictions
    })

    print(f"Final predictions: Range=[{final_predictions.min():.2f}, {final_predictions.max():.2f}], Mean={final_predictions.mean():.2f}")

    # January-specific stats
    if january_present:
        january_preds = final_predictions[january_test_mask]
        print(f"January: Mean={january_preds.mean():.2f}, Std={january_preds.std():.2f}")

    return submission, final_predictions

In [None]:
# ==============================================================================
# COMPLETE PIPELINE EXECUTION
# ==============================================================================

print("🚀 STARTING COMPLETE COMPETITION PIPELINE")
print("=" * 60)

# Keyword arguments shared by the CV, tuning, variant and training stages
shared_inputs = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'temporal_stats': temporal_stats,
    'model_dir': DIR,
}

# Step 1: baseline cross-validation with fixed hyperparameters
print("\n📊 STEP 1: Enhanced CV Pipeline")
cv_results = enhanced_cv_pipeline(**shared_inputs)

print(f"✅ CV Results: RMSE={cv_results['overall_cv_score']:.4f}, Competition Score={cv_results['competition_score']:.6f}")

# Step 2: Optuna search (up to 1000 trials or three hours, whichever ends first)
print("\n🎯 STEP 2: Hyperparameter Optimization")
best_params, best_rmse, study = hyperparameter_optimization(
    n_trials=1000,
    timeout=3*3600,
    **shared_inputs
)

# best_rmse is the combined (0.4 overall / 0.6 January) RMSE of the best trial
best_competition_score = competition_metric(best_rmse)
print(f"✅ Optimization complete: Competition Score={best_competition_score:.6f}")

# Step 3: derive the ensemble parameter sets from the tuned parameters
print("\n🎭 STEP 3: Creating Ensemble Variants")
ensemble_variants = create_ensemble_variants(best_params=best_params, **shared_inputs)

print(f"✅ Created {len(ensemble_variants)} ensemble variants")

# Step 4: fit every variant on the full training data
print("\n🏆 STEP 4: Training Final Ensemble")
ensemble_predictions = train_final_ensemble(ensemble_variants=ensemble_variants, **shared_inputs)

# Step 5: blend the variants and build the submission frame
print("\n📋 STEP 5: Creating Final Submission")
final_submission, final_predictions = create_final_submission(
    ensemble_predictions=ensemble_predictions,
    X_test=X_test,
    test_ids=test_ids
)

# Save submission with competition score in filename
filename = f"submission_enhanced_{best_competition_score:.6f}_rmse_{best_rmse:.4f}.csv"
final_submission.to_csv(filename, index=False)

print("\n" + "=" * 60)
print("🎉 PIPELINE COMPLETE!")
print("=" * 60)
print(f"🏅 Best Competition Score: {best_competition_score:.6f}")
print(f"📈 Estimated RMSE: {best_rmse:.4f}")
# Note: this is the January RMSE of the Step 1 baseline CV, not of the tuned model
print(f"📊 January Performance: {cv_results['january_cv_score']:.4f}")
print(f"💾 Submission saved: {filename}")
print(f"🗓️ January test samples: {(X_test['month'] == 1).sum()}")

# Save Optuna study for analysis
trials_filename = f"optuna_trials_{best_competition_score:.6f}.csv"
study.trials_dataframe().to_csv(trials_filename, index=False)
print(f"💾 Optuna trials saved: {trials_filename}")

print("\n✨ Ready for submission! Expected strong performance on January-heavy test data! ✨")

In [None]:
# ==============================================================================
# ADVERSARIAL VALIDATION & SUBMISSION VARIANTS (OPTIONAL)
# ==============================================================================

def adversarial_validation_analysis(X_train, X_test, final_predictions):
    """
    Identify train samples most similar to test distribution for robustness check.

    A random forest is trained to tell training rows (label 0) from test rows
    (label 1) using the spatial and temporal columns. Each training row is scored
    with an out-of-fold probability of being a test row, i.e. by a model that did
    not see that row during fitting. Scoring rows with the very model that was
    trained on them would report near-zero probabilities for almost all training
    rows and make the "most test-like" ranking meaningless.

    Args:
        X_train: Training features with 'latitude', 'longitude', 'hour',
            'day_of_week', 'month' and 'day_of_year'.
        X_test: Test features with the same columns.
        final_predictions: Test predictions, summarised for January vs. all rows.

    Returns:
        Tuple ``(test_like_mask, train_test_probs)``: a boolean array marking the
        training rows above the 90th percentile of test-likeness, and the
        out-of-fold probability per training row.
    """
    # Local imports: only this optional analysis needs them
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedKFold, cross_val_predict

    print("=== ADVERSARIAL VALIDATION ANALYSIS ===")

    # Combine train and test for adversarial validation
    feature_cols = ['latitude', 'longitude', 'hour', 'day_of_week', 'month', 'day_of_year']
    X_combined = pd.concat([X_train[feature_cols], X_test[feature_cols]], ignore_index=True)

    # Create target: 0 for train, 1 for test
    y_adversarial = np.concatenate([
        np.zeros(len(X_train)),
        np.ones(len(X_test))
    ])

    # Out-of-fold probabilities; stratified so every fold contains both classes.
    # The fold count is reduced when one of the two sets has fewer than 5 rows.
    adversarial_model = RandomForestClassifier(n_estimators=100, random_state=42)
    n_folds = min(5, len(X_train), len(X_test))
    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof_probs = cross_val_predict(
        adversarial_model, X_combined, y_adversarial, cv=folds, method='predict_proba'
    )

    # Training rows come first in X_combined; column 1 is the probability of class 1 (test)
    train_test_probs = oof_probs[:len(X_train), 1]

    # Identify most test-like training samples
    test_like_threshold = np.percentile(train_test_probs, 90)  # Top 10% most test-like
    test_like_mask = train_test_probs > test_like_threshold

    print(f"Found {test_like_mask.sum()} test-like training samples (top 10%)")
    print(f"January samples in test-like: {X_train[test_like_mask]['month'].eq(1).sum()}")
    print(f"Test-like sample score threshold: {test_like_threshold:.3f}")

    # Analyze prediction consistency
    january_test_preds = final_predictions[X_test['month'] == 1]
    all_test_preds = final_predictions

    print(f"\nPrediction Analysis:")
    print(f"January test predictions - Mean: {january_test_preds.mean():.2f}, Std: {january_test_preds.std():.2f}")
    print(f"All test predictions - Mean: {all_test_preds.mean():.2f}, Std: {all_test_preds.std():.2f}")

    return test_like_mask, train_test_probs

def create_submission_variants(ensemble_predictions, X_test, test_ids):
    """
    Build and save four alternative submissions from the variant predictions.

    Variants: 'conservative' (plain mean of all models), 'january_heavy'
    (0.15 / 0.15 / 0.50 / 0.20 blend favouring the January-focused model),
    'median' (element-wise median) and 'single_optimized' (the 'optimized'
    model alone). Predictions are floored at 0. Each variant is written to
    ``submission_variant_<name>.csv`` in the working directory.

    Args:
        ensemble_predictions: Mapping with the keys 'conservative', 'aggressive',
            'january_focused' and 'optimized' -> test predictions.
        X_test: Test features; 'month' is read for the January statistics.
        test_ids: Identifiers written to the 'id' column.

    Returns:
        dict mapping variant name -> submission DataFrame.
    """
    print("=== CREATING SUBMISSION VARIANTS ===")

    def _as_submission(raw_predictions):
        # Floor at zero and pair the values with the test ids
        return pd.DataFrame({
            'id': test_ids,
            'pollution_value': np.maximum(raw_predictions, 0)
        })

    all_models = list(ensemble_predictions.values())

    # Blend that leans heavily on the January-focused model
    january_blend = (
        0.15 * ensemble_predictions['conservative'] +
        0.15 * ensemble_predictions['aggressive'] +
        0.50 * ensemble_predictions['january_focused'] +
        0.20 * ensemble_predictions['optimized']
    )

    submissions = {
        'conservative': _as_submission(np.mean(all_models, axis=0)),        # equal weights
        'january_heavy': _as_submission(january_blend),
        'median': _as_submission(np.median(all_models, axis=0)),            # robust to outliers
        'single_optimized': _as_submission(ensemble_predictions['optimized']),
    }

    # Save every variant and print a one-line summary for it
    january_mask = X_test['month'] == 1
    for name, submission in submissions.items():
        filename = f"submission_variant_{name}.csv"
        submission.to_csv(filename, index=False)

        preds = submission['pollution_value'].values
        january_preds = preds[january_mask]

        print(f"{name:15s}: Mean={preds.mean():.2f}, Jan_Mean={january_preds.mean():.2f}, "
              f"Range=[{preds.min():.1f}, {preds.max():.1f}] -> {filename}")

    return submissions

# Optional diagnostics: they need the outputs of the main pipeline cell
if 'final_predictions' not in locals() or 'X_train' not in locals():
    print("⏭️  Skipping adversarial validation - run main pipeline first")
else:
    test_like_mask, adversarial_probs = adversarial_validation_analysis(
        X_train, X_test, final_predictions
    )

    # The submission variants additionally need the per-variant predictions
    if 'ensemble_predictions' in locals():
        submission_variants = create_submission_variants(
            ensemble_predictions, X_test, test_ids
        )

        print(f"\n✅ Created {len(submission_variants)} submission variants")
        print("📋 Files saved:")
        for variant in submission_variants:
            print(f"  - submission_variant_{variant}.csv")

# Closing guidance, printed whether or not the diagnostics ran
print("\n".join([
    "\n🎯 RECOMMENDATION:",
    "1. Try 'january_heavy' variant if LB shows January struggles",
    "2. Use 'conservative' variant for safety",
    "3. Compare 'median' vs main ensemble for robustness",
    "4. Monitor 'single_optimized' to verify ensemble value",
]))