In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

from spatio_temporal import (
    AdvancedSpatioTemporalFeatures, 
    SpatioTemporalCV,
    TemporalDomainAdaptation,
    SpatioTemporalDistributionAnalyzer
)

In [4]:
print("Step 1: Loading data...")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Quick data quality check
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Missing values: {train_df.isnull().sum().sum()}")


Step 1: Loading data...
Train shape: (7649, 8)
Test shape: (2739, 7)
Missing values: 26


In [5]:
print("\nStep 2: Analyzing distribution shifts...")
analyzer = SpatioTemporalDistributionAnalyzer()
spatial_stats, temporal_stats = analyzer.analyze(train_df, test_df)

# Quick summary
print("Significant shifts detected in:")
for feature, stats in temporal_stats.items():
    if stats['ks_pvalue'] < 0.05:
        print(f"  - {feature} (Wasserstein distance: {stats['wasserstein_distance']:.3f})")



Step 2: Analyzing distribution shifts...
Analyzing distribution differences...

Spatial Distribution:
  Latitude KS: 0.3457 (p=1.52e-214)
  Longitude KS: 0.2957 (p=3.77e-156)

Temporal Distribution Shifts:
  hour: Wasserstein=3.3287 (significant, p=1.29e-252)
  day_of_week: Wasserstein=1.8408 (significant, p=4.61e-321)
  month: Wasserstein=6.1064 (significant, p=0.00e+00)
  day_of_year: Wasserstein=175.2113 (significant, p=0.00e+00)
Significant shifts detected in:
  - hour (Wasserstein distance: 3.329)
  - day_of_week (Wasserstein distance: 1.841)
  - month (Wasserstein distance: 6.106)
  - day_of_year (Wasserstein distance: 175.211)


In [6]:
print("\nStep 3: Preparing features...")
X_train = train_df.drop('pollution_value', axis=1)
y_train = train_df['pollution_value']
X_test = test_df.copy()



Step 3: Preparing features...


In [7]:
print("\nStep 4: Engineering features for validation...")

# Initialize with row_only=True to prevent leakage
fe_validator = AdvancedSpatioTemporalFeatures(
    row_only=True,  # IMPORTANT: Prevents leakage during CV
    n_spatial_clusters=20,
    n_temporal_clusters=10,
    use_distribution_matching=True,
    test_distribution=temporal_stats
)

# Transform for validation
X_train_val = fe_validator.fit_transform(X_train, y_train)
print(f"Features for validation: {X_train_val.shape[1]}")



Step 4: Engineering features for validation...
Features for validation: 44


In [9]:
print("\nStep 5: Setting up spatio-temporal aware CV...")

cv = SpatioTemporalCV(
    n_splits=5,
    test_spatial_coords=test_df[['latitude', 'longitude']].values,
    test_temporal_features=test_df[['hour', 'month', 'day_of_week', 'day_of_year']],
    spatial_weight=0.5,  # Balance between spatial and temporal matching
    random_state=42
)



Step 5: Setting up spatio-temporal aware CV...


In [10]:
print("\nStep 6: Model validation with distribution-aware CV...")

import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Define models to test
models = {
    'lightgbm': lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        verbose=-1
    ),
    'xgboost': xgb.XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42
    ),
    'random_forest': RandomForestRegressor(
        n_estimators=300,
        max_depth=15,
        random_state=42,
        n_jobs=-1
    )
}

# Validate each model
cv_results = {}

for model_name, model in models.items():
    print(f"\nValidating {model_name}...")
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X_train_val, y_train)):
        # Split data
        X_tr, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Scale features
        scaler = StandardScaler()
        X_tr_scaled = scaler.fit_transform(X_tr)
        X_val_scaled = scaler.transform(X_val)
        
        # Train model
        if model_name in ['lightgbm', 'xgboost']:
            model.fit(
                X_tr_scaled, y_tr,
                eval_set=[(X_val_scaled, y_val)],
                early_stopping_rounds=50,
                verbose=False
            )
        else:
            model.fit(X_tr_scaled, y_tr)
        
        # Evaluate
        y_pred = model.predict(X_val_scaled)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        fold_scores.append(rmse)
        
        print(f"  Fold {fold + 1}: RMSE = {rmse:.4f}")
    
    avg_score = np.mean(fold_scores)
    cv_results[model_name] = {
        'mean_rmse': avg_score,
        'std_rmse': np.std(fold_scores),
        'scores': fold_scores
    }
    print(f"  Average: {avg_score:.4f} (Â±{np.std(fold_scores):.4f})")

# Select best model
best_model_name = min(cv_results.keys(), key=lambda x: cv_results[x]['mean_rmse'])
print(f"\nBest model: {best_model_name} (RMSE: {cv_results[best_model_name]['mean_rmse']:.4f})")



Step 6: Model validation with distribution-aware CV...

Validating lightgbm...


TypeError: fit() got an unexpected keyword argument 'early_stopping_rounds'

In [11]:
print("\nStep 7: Training final model with all features...")

# Create feature engineer without restrictions
fe_final = AdvancedSpatioTemporalFeatures(
    row_only=False,  # Use ALL features including cluster-based ones
    n_spatial_clusters=30,  # More clusters for final model
    n_temporal_clusters=15,
    use_distribution_matching=True,
    test_distribution=temporal_stats
)

# Apply domain adaptation weights
domain_adapter = TemporalDomainAdaptation(method='importance_weighting')
domain_adapter.fit(X_train, X_test)
sample_weights = domain_adapter.get_weights()

print(f"Domain adaptation weights range: [{sample_weights.min():.2f}, {sample_weights.max():.2f}]")

# Transform with all features
X_train_final = fe_final.fit_transform(X_train, y_train)
X_test_final = fe_final.transform(X_test)

print(f"Final feature count: {X_train_final.shape[1]}")

# Scale
scaler_final = StandardScaler()
X_train_scaled = scaler_final.fit_transform(X_train_final)
X_test_scaled = scaler_final.transform(X_test_final)



Step 7: Training final model with all features...
Domain adaptation weights range: [0.24, 23.80]
Final feature count: 61


In [12]:
print("\nStep 8: Training ensemble of best models...")

# Use top 3 models from validation
top_models = sorted(cv_results.items(), key=lambda x: x[1]['mean_rmse'])[:3]
ensemble_predictions = []
ensemble_weights = []

for model_name, scores in top_models:
    print(f"\nTraining {model_name}...")
    model = models[model_name]
    
    # Train with sample weights
    if model_name in ['lightgbm', 'xgboost']:
        model.fit(
            X_train_scaled, y_train,
            sample_weight=sample_weights,
            verbose=False
        )
    else:
        model.fit(X_train_scaled, y_train, sample_weight=sample_weights)
    
    # Predict
    predictions = model.predict(X_test_scaled)
    ensemble_predictions.append(predictions)
    
    # Weight inversely proportional to CV error
    weight = 1.0 / scores['mean_rmse']
    ensemble_weights.append(weight)

# Normalize weights
ensemble_weights = np.array(ensemble_weights)
ensemble_weights = ensemble_weights / ensemble_weights.sum()

print(f"\nEnsemble weights: {dict(zip([m[0] for m in top_models], ensemble_weights))}")

# Weighted average
final_predictions = np.average(ensemble_predictions, axis=0, weights=ensemble_weights)



Step 8: Training ensemble of best models...

Ensemble weights: {}


ZeroDivisionError: Weights sum to zero, can't be normalized

In [13]:
print("\nStep 9: Creating submission...")

# Post-processing
final_predictions = np.maximum(final_predictions, 0)  # Ensure non-negative

# Check prediction statistics
print(f"\nPrediction statistics:")
print(f"  Mean: {final_predictions.mean():.4f}")
print(f"  Std: {final_predictions.std():.4f}")
print(f"  Min: {final_predictions.min():.4f}")
print(f"  Max: {final_predictions.max():.4f}")

# Compare to training target distribution
print(f"\nTraining target statistics:")
print(f"  Mean: {y_train.mean():.4f}")
print(f"  Std: {y_train.std():.4f}")

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'pollution_value': final_predictions
})

# Save
submission.to_csv('submission_advanced.csv', index=False)
print("\nSubmission saved to 'submission_advanced.csv'!")



Step 9: Creating submission...


NameError: name 'final_predictions' is not defined

In [14]:
print("\nStep 10: Analyzing feature importance...")

# Get feature importance from best model
if best_model_name == 'lightgbm':
    # Retrain single model for feature importance
    best_model = models[best_model_name]
    best_model.fit(X_train_scaled, y_train, sample_weight=sample_weights, verbose=False)
    
    feature_importance = pd.DataFrame({
        'feature': X_train_final.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 15 most important features:")
    for idx, row in feature_importance.head(15).iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")



Step 10: Analyzing feature importance...


NameError: name 'best_model_name' is not defined

In [15]:
print("\n" + "="*60)
print("COMPLETE! Summary of what we did:")
print("="*60)
print("1. Analyzed distribution shifts between train and test")
print("2. Created distribution-aware features")
print("3. Used custom CV that matches test distribution")
print("4. Applied domain adaptation weights")
print("5. Trained ensemble of best models")
print("6. Post-processed predictions")
print("\nKey innovations:")
print("- Spatial clustering features for location generalization")
print("- Temporal pattern clustering")
print("- Distribution-matching CV strategy")
print("- Domain adaptation for temporal shifts")
print("- Two-stage feature engineering (safe for CV + full for final)")



COMPLETE! Summary of what we did:
1. Analyzed distribution shifts between train and test
2. Created distribution-aware features
3. Used custom CV that matches test distribution
4. Applied domain adaptation weights
5. Trained ensemble of best models
6. Post-processed predictions

Key innovations:
- Spatial clustering features for location generalization
- Temporal pattern clustering
- Distribution-matching CV strategy
- Domain adaptation for temporal shifts
- Two-stage feature engineering (safe for CV + full for final)


In [16]:
print("\n" + "="*60)
print("TIPS FOR FURTHER IMPROVEMENT:")
print("="*60)
print("""
1. Hyperparameter tuning:
   - Use Optuna with the custom CV
   - Focus on regularization parameters
   
2. More sophisticated features:
   - Voronoi tessellation features
   - Kriging/spatial interpolation
   - Weather API data if allowed
   
3. Advanced models:
   - Neural networks with spatial/temporal embeddings
   - Gradient boosting with custom objectives
   - Hierarchical models
   
4. Ensemble strategies:
   - Stacking with out-of-fold predictions
   - Bayesian model averaging
   - Rank averaging for robustness
   
5. Post-processing:
   - Isotonic regression calibration
   - Outlier detection and clipping
   - Test-time augmentation

Remember: The key is handling the distribution shift properly!
""")


TIPS FOR FURTHER IMPROVEMENT:

1. Hyperparameter tuning:
   - Use Optuna with the custom CV
   - Focus on regularization parameters
   
2. More sophisticated features:
   - Voronoi tessellation features
   - Kriging/spatial interpolation
   - Weather API data if allowed
   
3. Advanced models:
   - Neural networks with spatial/temporal embeddings
   - Gradient boosting with custom objectives
   - Hierarchical models
   
4. Ensemble strategies:
   - Stacking with out-of-fold predictions
   - Bayesian model averaging
   - Rank averaging for robustness
   
5. Post-processing:
   - Isotonic regression calibration
   - Outlier detection and clipping
   - Test-time augmentation

Remember: The key is handling the distribution shift properly!

