In [1]:
!pip install -U -q numpy scikit-learn pandas xgboost lightgbm category_encoders matplotlib seaborn cloudpickle shap optuna

# Enhanced ML Pipeline - Modular Feature Generation

In [1]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KernelDensity
import xgboost as xgb
import lightgbm as lgb
import optuna
from tabularaml.generate.features import FeatureGenerator
from tabularaml.eval.scorers import Scorer
import gc

In [2]:
import numpy as np
import pandas as pd
from typing import Generator, Tuple, Optional, Union
from sklearn.model_selection import BaseCrossValidator
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_X_y, indexable
from sklearn.utils import check_random_state
from scipy.spatial.distance import cdist
import warnings


class SpatialTemporalKFold(BaseCrossValidator):
    """
    Spatial-Temporal Cross-Validator for geographic time-series data.
    
    This cross-validator creates folds that respect both spatial and temporal 
    dependencies in the data, preventing data leakage in spatial-temporal 
    prediction tasks like air pollution forecasting.
    
    The strategy:
    1. Creates spatial clusters using geographic coordinates (lat/lon).
    2. Creates temporal clusters using cyclical time features.
    3. Combines spatial, temporal and (optional) target-quantile labels into
       a single group id per sample.
    4. Distributes whole groups round-robin over the folds, so a group is
       never split between a training set and its test set.
    
    Parameters
    ----------
    n_splits : int, default=5
        Number of cross-validation folds.
    spatial_clusters : int, default=20
        Number of spatial clusters for geographic grouping.
    temporal_clusters : int, default=8
        Number of temporal clusters for time-based grouping.
    lat_col : str, default='latitude'
        Column name for latitude coordinates.
    lon_col : str, default='longitude' 
        Column name for longitude coordinates.
    time_cols : dict, default=None
        Dictionary mapping time column names to their cycle lengths.
        ``None`` selects day_of_year/hour/day_of_week/month. An explicit
        empty dict disables temporal clustering (every sample gets temporal
        label 0).
    stratify : bool, default=True
        Whether to include target-quantile labels in the group id.
    stratify_log_transform : bool, default=True
        If True, applies a log transform (log1p) to the target variable
        before computing quantile bins.
    n_quantiles : int, default=5
        Number of quantiles for stratification (if stratify=True).
    random_state : int, default=None
        Random state for reproducible splits (used for shuffling and KMeans).
    shuffle : bool, default=True
        Whether to shuffle data before clustering and splitting.
    """
    
    def __init__(self, 
                 n_splits: int = 5,
                 spatial_clusters: int = 20,
                 temporal_clusters: int = 8,
                 lat_col: str = 'latitude',
                 lon_col: str = 'longitude',
                 time_cols: Optional[dict] = None,
                 stratify: bool = True,
                 stratify_log_transform: bool = True,
                 n_quantiles: int = 5,
                 random_state: Optional[int] = None,
                 shuffle: bool = True):
        
        self.n_splits = n_splits
        self.spatial_clusters = spatial_clusters
        self.temporal_clusters = temporal_clusters
        self.lat_col = lat_col
        self.lon_col = lon_col
        # Test against None rather than truthiness: an explicit {} must not be
        # replaced by the defaults, otherwise the "no temporal features" path
        # in _create_temporal_clusters can never be reached.
        self.time_cols = time_cols if time_cols is not None else {
            'day_of_year': 365, 'hour': 24, 'day_of_week': 7, 'month': 12
        }
        self.stratify = stratify
        self.stratify_log_transform = stratify_log_transform
        self.n_quantiles = n_quantiles
        self.random_state = random_state
        self.shuffle = shuffle
        
        # Validation
        if n_splits < 2:
            raise ValueError("n_splits must be at least 2")
        if spatial_clusters < n_splits:
            warnings.warn(f"spatial_clusters ({spatial_clusters}) < n_splits ({n_splits}). "
                         "This may result in poor spatial separation.")
        if temporal_clusters < 2:
            raise ValueError("temporal_clusters must be at least 2")
    
    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds.

        Note that ``split`` skips any fold whose train or test side is empty,
        so it can yield fewer pairs than this number.
        """
        return self.n_splits
    
    def _validate_data(self, X: Union[pd.DataFrame, np.ndarray], 
                      y: Optional[Union[pd.Series, np.ndarray]] = None) -> pd.DataFrame:
        """Validate the input and return it as a DataFrame.

        A numpy array is interpreted positionally: its leading columns are
        named ``lat_col``, ``lon_col`` and then the keys of ``time_cols`` in
        order; any further columns receive placeholder names.

        Raises
        ------
        TypeError
            If X is neither a DataFrame nor a numpy array.
        ValueError
            If an array has too few columns or a DataFrame lacks a required
            column.
        """
        required_cols = [self.lat_col, self.lon_col] + list(self.time_cols.keys())

        if not isinstance(X, pd.DataFrame):
            if not isinstance(X, np.ndarray):
                raise TypeError("X must be pandas DataFrame or numpy array")
            if X.shape[1] < len(required_cols):
                raise ValueError(f"Expected at least {len(required_cols)} columns, got {X.shape[1]}")
            # Name the surplus columns too; passing fewer names than columns
            # makes the DataFrame constructor raise.
            n_extra = X.shape[1] - len(required_cols)
            extra_cols = [f"extra_{i}" for i in range(n_extra)]
            X = pd.DataFrame(X, columns=required_cols + extra_cols)
        
        missing_cols = [col for col in required_cols if col not in X.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")
            
        return X
    
    def _create_spatial_clusters(self, X: pd.DataFrame) -> np.ndarray:
        """Create spatial clusters using K-means on lat/lon coordinates."""
        coords = X[[self.lat_col, self.lon_col]]
        n_clusters = min(self.spatial_clusters, len(coords))
        if n_clusters < self.spatial_clusters:
            warnings.warn(f"Reducing spatial_clusters from {self.spatial_clusters} to {n_clusters} due to insufficient data points.")
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init=10)
        # Missing coordinates are imputed with the column mean before clustering.
        return kmeans.fit_predict(coords.copy().fillna(coords.mean()).values)
    
    def _create_temporal_features(self, X: pd.DataFrame) -> np.ndarray:
        """Convert cyclical time features to circular (sin, cos) coordinates.

        Returns an array with two columns per configured time column, or an
        array with zero columns when no time column is configured.
        """
        temporal_features = []
        for col, cycle_length in self.time_cols.items():
            if col in X.columns:
                radians = 2 * np.pi * X[col] / cycle_length
                temporal_features.extend([np.sin(radians), np.cos(radians)])
        
        return np.column_stack(temporal_features) if temporal_features else np.zeros((len(X), 0))
    
    def _create_temporal_clusters(self, X: pd.DataFrame) -> np.ndarray:
        """Create temporal clusters using cyclical time features."""
        temporal_coords = self._create_temporal_features(X)
        if temporal_coords.shape[1] == 0:
            # No time columns configured: a single temporal group.
            return np.zeros(len(X), dtype=int)
            
        n_clusters = min(self.temporal_clusters, len(temporal_coords))
        if n_clusters < self.temporal_clusters:
            warnings.warn(f"Reducing temporal_clusters from {self.temporal_clusters} to {n_clusters} due to insufficient data points.")
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=self.random_state, n_init=10)
        return kmeans.fit_predict(temporal_coords)
    
    def _create_stratification_groups(self, y: Optional[np.ndarray],
                                      n_samples: Optional[int] = None) -> np.ndarray:
        """Create stratification labels from target-variable quantiles.

        Parameters
        ----------
        y : ndarray or None
            Target values. When None, all-zero labels are returned.
        n_samples : int, optional
            Number of labels to return when ``y`` is None.

        Returns
        -------
        ndarray of int
            Quantile-bin index per sample, or zeros when stratification is
            disabled or no target is available.
        """
        if y is None:
            # The length cannot be derived from a missing target.
            if n_samples is None:
                raise ValueError("n_samples must be provided when y is None")
            return np.zeros(n_samples, dtype=int)
        if not self.stratify:
            return np.zeros(len(y), dtype=int)
        
        y_stratify = y.copy()
        if self.stratify_log_transform:
            y_stratify = np.log1p(y_stratify)
        
        quantiles = np.linspace(0, 1, self.n_quantiles + 1)
        quantile_values = np.quantile(y_stratify, quantiles)
        
        # Only the interior edges are used, giving labels 0..n_quantiles-1.
        return np.digitize(y_stratify, quantile_values[1:-1])
    
    def _create_combined_groups(self, X: pd.DataFrame, y: Optional[np.ndarray] = None) -> Tuple[np.ndarray, dict]:
        """Create combined spatial-temporal-stratification groups.

        Returns the per-sample group id and a dict holding the three
        individual label arrays.
        """
        spatial_labels = self._create_spatial_clusters(X)
        temporal_labels = self._create_temporal_clusters(X)
        strat_labels = self._create_stratification_groups(y, n_samples=len(X))
        
        max_temporal = np.max(temporal_labels) + 1
        max_strat = np.max(strat_labels) + 1
        
        # Mixed-radix encoding: each (spatial, temporal, strat) triple maps to
        # a distinct integer.
        combined_groups = (spatial_labels * max_temporal * max_strat + 
                           temporal_labels * max_strat + 
                           strat_labels)
        
        metadata = {
            'spatial_labels': spatial_labels,
            'temporal_labels': temporal_labels,
            'strat_labels': strat_labels,
        }
        
        return combined_groups, metadata
    
    def split(self, X: Union[pd.DataFrame, np.ndarray], 
              y: Optional[Union[pd.Series, np.ndarray]] = None, 
              groups: Optional[np.ndarray] = None) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
        """
        Generate indices to split data into training and test set.

        Yields ``(train_indices, test_indices)`` pairs of positional indices
        into the original ``X``. ``groups`` is only checked for consistent
        length; it does not influence the split.

        Raises
        ------
        ValueError
            If fewer distinct combined groups exist than ``n_splits``.
        """
        X = self._validate_data(X, y)
        X, y, groups = indexable(X, y, groups)
        
        if y is not None:
            y = np.asarray(y)
            
        rng = check_random_state(self.random_state)
        n_samples = X.shape[0]
        indices = np.arange(n_samples)
        
        if self.shuffle:
            rng.shuffle(indices)
            X_shuffled = X.iloc[indices].reset_index(drop=True)
            y_shuffled = y[indices] if y is not None else None
        else:
            X_shuffled, y_shuffled = X, y

        combined_groups, _ = self._create_combined_groups(X_shuffled, y_shuffled)
        unique_groups = np.unique(combined_groups)
        
        if len(unique_groups) < self.n_splits:
            raise ValueError(f"Cannot create {self.n_splits} splits with only "
                             f"{len(unique_groups)} unique groups. "
                             f"Consider reducing n_splits or clustering parameters.")
        
        rng.shuffle(unique_groups)
        
        # Deal the shuffled groups round-robin over the folds.
        fold_groups = [[] for _ in range(self.n_splits)]
        for i, group in enumerate(unique_groups):
            fold_groups[i % self.n_splits].append(group)
        
        for fold_idx in range(self.n_splits):
            test_groups = np.array(fold_groups[fold_idx])
            test_mask = np.isin(combined_groups, test_groups)
            train_mask = ~test_mask
            
            # Masks are in shuffled order; `indices` maps back to original rows.
            original_test_indices = indices[test_mask]
            original_train_indices = indices[train_mask]
            
            if len(original_test_indices) == 0 or len(original_train_indices) == 0:
                warnings.warn(f"Fold {fold_idx} has an empty train/test set, skipping.")
                continue
                
            yield original_train_indices, original_test_indices


class StratifiedSpatialTemporalKFold(SpatialTemporalKFold):
    """
    SpatialTemporalKFold variant that always stratifies on the target.

    The constructor forwards every keyword argument to
    ``SpatialTemporalKFold`` after forcing ``stratify=True`` (any value
    supplied by the caller is overwritten) and defaulting
    ``stratify_log_transform`` to True when the caller did not set it.
    ``split`` additionally refuses to run without a target ``y``.
    """

    def __init__(self, **kwargs):
        # Caller's choice wins for the log transform; True only fills a gap.
        kwargs.setdefault('stratify_log_transform', True)
        # Stratification itself is not optional for this class.
        kwargs['stratify'] = True
        super().__init__(**kwargs)

    def split(self, X, y, groups=None):
        """
        Yield (train, test) index arrays, requiring a target for stratification.

        Parameters
        ----------
        X : DataFrame or array-like
            Training data.
        y : array-like
            Target variable; must not be None.
        groups : array-like, optional
            Forwarded unchanged to the parent splitter.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Raises
        ------
        ValueError
            If ``y`` is None. Because this method is a generator, the error
            surfaces when iteration starts, not at call time.
        """
        target_missing = y is None
        if target_missing:
            raise ValueError("StratifiedSpatialTemporalKFold requires the target variable 'y' for stratification.")

        parent_splits = super().split(X, y, groups)
        yield from parent_splits

In [3]:
from sklearn.model_selection import BaseCrossValidator
import numpy as np

class FixedWindowTimeSeriesSplit(BaseCrossValidator):
    """
    Custom time-series cross-validator with fixed-size test windows.

    Every fold trains on all samples before its test window (minus an
    optional gap) and tests on ``test_size`` consecutive samples. Windows are
    emitted in chronological order: the first starts right after the minimum
    training span and the last ends at the final sample. When the data is too
    short for ``n_splits`` disjoint windows, consecutive test windows overlap.

    Parameters
    ----------
    n_splits : int
        Number of folds. Must be at least 1.
    test_size : int
        Number of samples in each test fold.
    gap : int, default=0
        Number of samples to exclude between train and test sets.
    min_train_size : int, default=None
        Minimum number of training samples required. If None (or 0), defaults
        to test_size.
    """
    
    def __init__(self, n_splits=5, test_size=2700, gap=0, min_train_size=None):
        if n_splits < 1:
            raise ValueError("n_splits must be at least 1.")
        if test_size < 1:
            raise ValueError("test_size must be at least 1.")
        if gap < 0:
            raise ValueError("gap must be non-negative.")
        
        self.n_splits = n_splits
        self.test_size = test_size
        self.gap = gap
        self.min_train_size = min_train_size or test_size
        
        if self.min_train_size < 1:
            raise ValueError("min_train_size must be at least 1.")
    
    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds."""
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        """Yield (train_indices, test_indices) pairs in chronological order.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored. Samples are
        assumed to be already sorted by time.

        Raises
        ------
        ValueError
            If the data is shorter than
            ``min_train_size + gap + test_size``.
        """
        n_samples = len(X)
        
        # One up-front check covers every fold: the earliest window is the
        # tightest one, so if it fits, all later windows fit as well.
        min_required = self.min_train_size + self.gap + self.test_size
        if min_required > n_samples:
            raise ValueError(
                f"Not enough samples. Need at least {min_required} samples "
                f"(min_train_size={self.min_train_size} + gap={self.gap} + test_size={self.test_size}), "
                f"but got {n_samples}."
            )
        
        indices = np.arange(n_samples)
        
        if self.n_splits == 1:
            # Single window: the test block is the tail of the data.
            test_start = n_samples - self.test_size
            train_end = test_start - self.gap
            yield indices[:train_end], indices[test_start:n_samples]
            return
        
        # First position at which a test window may start.
        earliest_test_start = self.min_train_size + self.gap
        # Distance between the first and the last possible window start;
        # non-negative because of the size check above.
        available_space = n_samples - self.test_size - earliest_test_start
        n_steps = self.n_splits - 1
        
        for i in range(self.n_splits):
            # Integer arithmetic keeps the offsets exact, so the last window
            # (i == n_steps) always ends precisely at n_samples instead of
            # depending on float rounding followed by truncation.
            test_start = earliest_test_start + (i * available_space) // n_steps
            test_end = test_start + self.test_size
            train_end = test_start - self.gap
            
            yield indices[:train_end], indices[test_start:test_end]

In [4]:
def rmse_exp(y_true, y_pred):
    """Return exp(-RMSE / 100); higher is better, 1.0 means a perfect fit."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return np.exp(-rmse / 100)

def competition_score(y_true, y_pred):
    """Score predictions as exp(-RMSE / 100), so lower error gives a value closer to 1."""
    return np.exp(-np.sqrt(mean_squared_error(y_true, y_pred)) / 100)

def create_cyclical_features(df):
    """Return a copy of ``df`` with raw time columns replaced by sin/cos encodings.

    For each of ``hour`` (period 24), ``day_of_week`` (period 7) and
    ``day_of_year`` (period 365) that is present, a ``<prefix>_sin`` and
    ``<prefix>_cos`` column is appended (prefixes ``hour``, ``dow``, ``doy``).
    The raw ``hour``, ``day_of_week``, ``day_of_year`` and ``month`` columns
    are then removed; ``month`` is dropped without an encoding of its own.
    The input frame is left untouched.
    """
    # (source column, output prefix, cycle length)
    encodings = (
        ('hour', 'hour', 24.0),
        ('day_of_week', 'dow', 7.0),
        ('day_of_year', 'doy', 365.0),
    )

    encoded = df.copy()
    for source_col, prefix, period in encodings:
        if source_col not in encoded.columns:
            continue
        encoded[f'{prefix}_sin'] = np.sin(2 * np.pi * encoded[source_col] / period)
        encoded[f'{prefix}_cos'] = np.cos(2 * np.pi * encoded[source_col] / period)

    raw_time_cols = [
        name for name in ('hour', 'day_of_week', 'day_of_year', 'month')
        if name in encoded.columns
    ]
    return encoded.drop(columns=raw_time_cols) if raw_time_cols else encoded

In [5]:
def run_single_feature_generation(X_train, y_train, save_dir="./features", run_id=1, max_new_feats=2000, n_generations=2000):
    """Run one feature-generation search and return the transformed training data.

    Creates ``save_dir`` if needed, builds a 10-fold ``SpatialTemporalKFold``
    seeded with ``42 + run_id``, and searches for new features with a
    ``FeatureGenerator`` scored by ``rmse_exp`` (greater is better). The
    generator is given ``<save_dir>/feature_generator_run_<run_id>.pkl`` as
    its save path.

    Returns
    -------
    tuple
        ``(X_generated, generator)``: the output of
        ``generator.transform(X_train)`` and the generator itself.
    """
    os.makedirs(save_dir, exist_ok=True)

    cv_splitter = SpatialTemporalKFold(n_splits=10, random_state=42+run_id)
    scorer = Scorer(
        name="rmse_exp",
        scorer=rmse_exp,
        greater_is_better=True,
        extra_params={},
        from_probs=False,
    )

    print(f"Running feature generation run {run_id}...")

    generator_path = f"{save_dir}/feature_generator_run_{run_id}.pkl"
    generator = FeatureGenerator(
        task="regression",
        scorer=scorer,
        max_new_feats=max_new_feats,
        cv=cv_splitter,
        n_generations=n_generations,
        save_path=generator_path,
    )

    # The search result itself is not needed; the fitted generator is.
    generator.search(X_train, y_train)
    X_generated = generator.transform(X_train)

    n_features = X_generated.shape[1]
    print(f"Run {run_id} complete: {n_features} features generated")
    return X_generated, generator

In [6]:
def load_all_feature_generators(feature_dir):
    """Load every ``*.pkl`` feature generator found directly in ``feature_dir``.

    Files are loaded in sorted path order via ``FeatureGenerator.load``.

    NOTE(review): the ``.pkl`` extension suggests pickle-based
    deserialization; only point this at trusted directories. Confirm against
    ``FeatureGenerator.load``.

    Raises
    ------
    ValueError
        If the directory contains no ``*.pkl`` file.
    """
    pattern = os.path.join(feature_dir, "*.pkl")
    paths = sorted(glob.glob(pattern))
    if len(paths) == 0:
        raise ValueError(f"No feature generator files found in {feature_dir}")

    print(f"Loading {len(paths)} feature generators from {feature_dir}: {paths}.")
    return [FeatureGenerator.load(path) for path in paths]

def combine_feature_generators(generators, X_train, X_test=None):
    """Apply each generator in sequence to the training (and optional test) data.

    Every generator is applied with ``fit_transform`` to the running training
    frame and, when test data is present, with ``transform`` to the running
    test frame, so later generators see the output of earlier ones. The
    inputs are copied first and not modified by this function itself.

    Returns
    -------
    The transformed training data, or a ``(train, test)`` tuple when test
    data is available.
    """
    print(f"Combining features from {len(generators)} generators...")

    train_out = X_train.copy()
    test_out = X_test.copy() if X_test is not None else None

    for gen in generators:
        train_out = gen.fit_transform(train_out)
        if test_out is not None:
            test_out = gen.transform(test_out)

    if test_out is None:
        return train_out
    return train_out, test_out

In [7]:
def xgb_objective(trial, X_base, y_train, tss, n_clusters_range=(20, 50)):
    """Optuna objective: mean cross-validated competition score of an XGBoost model.

    Per fold, a KMeans model is fitted on the training fold's coordinates
    only and used to add a ``cluster`` feature to both the training and the
    validation fold. The model is trained on ``log1p`` of the target and
    scored on the original scale.

    Args:
        trial: Optuna trial used to sample hyperparameters and ``n_clusters``.
        X_base (pd.DataFrame): Feature matrix before cyclical encoding.
        y_train (pd.Series): Target on the original scale (indexed with
            ``.iloc`` after the log transform).
        tss: Splitter exposing ``split(X, y)``.
        n_clusters_range (tuple): Inclusive (low, high) bounds for the number
            of KMeans clusters.

    Returns:
        float: Mean of ``competition_score`` over all folds.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 30),
        'gamma': trial.suggest_float('gamma', 0, 20),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'max_leaves': trial.suggest_int('max_leaves', 0, 2000),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'max_bin': trial.suggest_int('max_bin', 32, 512),
        'objective': 'reg:squarederror',
        'enable_categorical': True,
        'random_state': 42,
        'n_jobs': -1
    }
    
    n_clusters = trial.suggest_int('n_clusters', n_clusters_range[0], n_clusters_range[1])
    
    X_base_cyclic = create_cyclical_features(X_base.copy())
    y = np.log1p(y_train.copy())
    
    fold_scores = []
    
    for train_idx, valid_idx in tss.split(X_base_cyclic, y):
        X_train_fold = X_base_cyclic.iloc[train_idx].copy()
        X_valid_fold = X_base_cyclic.iloc[valid_idx].copy()
        y_train_fold = y.iloc[train_idx]
        y_valid_fold = y.iloc[valid_idx]
        
        if 'latitude' in X_train_fold.columns and 'longitude' in X_train_fold.columns:
            # Imputation values come from the training fold only (no leakage).
            coord_fill = {
                'latitude': X_train_fold['latitude'].mean(),
                'longitude': X_train_fold['longitude'].mean(),
            }
            
            # DataFrame-level fillna returns a new frame. The previous
            # `frame[col].fillna(..., inplace=True)` form is chained
            # assignment, which does not update the frame under pandas
            # Copy-on-Write and would leave NaNs for KMeans to reject.
            train_coords_temp = X_train_fold[['latitude', 'longitude']].fillna(coord_fill)
            valid_coords_temp = X_valid_fold[['latitude', 'longitude']].fillna(coord_fill)
            
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
            kmeans.fit(train_coords_temp)
            
            X_train_fold['cluster'] = kmeans.predict(train_coords_temp)
            X_valid_fold['cluster'] = kmeans.predict(valid_coords_temp)
        
        model = xgb.XGBRegressor(**params)
        model.fit(X_train_fold, y_train_fold, verbose=False)
        
        # Undo the log1p transform before scoring.
        y_pred_fold = model.predict(X_valid_fold)
        y_pred_orig_scale = np.expm1(y_pred_fold)
        y_valid_orig_scale = np.expm1(y_valid_fold)
        
        exp_score = competition_score(y_valid_orig_scale, y_pred_orig_scale)
        fold_scores.append(exp_score)
    
    return np.mean(fold_scores)



def lgb_objective(trial, X_base, y_train, tss, n_clusters_range=(20, 50)):
    """Optuna objective: mean cross-validated competition score of a LightGBM model.

    Per fold, a KMeans model is fitted on the training fold's coordinates
    only and used to add a ``cluster`` feature to both the training and the
    validation fold. The model is trained on ``log1p`` of the target and
    scored on the original scale.

    Args:
        trial: Optuna trial used to sample hyperparameters and ``n_clusters``.
        X_base (pd.DataFrame): Feature matrix before cyclical encoding.
        y_train (pd.Series): Target on the original scale (indexed with
            ``.iloc`` after the log transform).
        tss: Splitter exposing ``split(X, y)``.
        n_clusters_range (tuple): Inclusive (low, high) bounds for the number
            of KMeans clusters.

    Returns:
        float: Mean of ``competition_score`` over all folds.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.3, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.3, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 100),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 20),
        'max_bin': trial.suggest_int('max_bin', 32, 512),
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'verbosity': -1
    }
    
    # A sampled -1 is not passed on; the model then uses its own default depth.
    if params['max_depth'] == -1:
        params.pop('max_depth')
    
    n_clusters = trial.suggest_int('n_clusters', n_clusters_range[0], n_clusters_range[1])
    
    X_base_cyclic = create_cyclical_features(X_base.copy())
    y = np.log1p(y_train.copy())
    
    fold_scores = []
    
    for train_idx, valid_idx in tss.split(X_base_cyclic, y):
        X_train_fold = X_base_cyclic.iloc[train_idx].copy()
        X_valid_fold = X_base_cyclic.iloc[valid_idx].copy()
        y_train_fold = y.iloc[train_idx]
        y_valid_fold = y.iloc[valid_idx]
        
        if 'latitude' in X_train_fold.columns and 'longitude' in X_train_fold.columns:
            # Imputation values come from the training fold only (no leakage).
            coord_fill = {
                'latitude': X_train_fold['latitude'].mean(),
                'longitude': X_train_fold['longitude'].mean(),
            }
            
            # DataFrame-level fillna returns a new frame. The previous
            # `frame[col].fillna(..., inplace=True)` form is chained
            # assignment, which does not update the frame under pandas
            # Copy-on-Write and would leave NaNs for KMeans to reject.
            train_coords_temp = X_train_fold[['latitude', 'longitude']].fillna(coord_fill)
            valid_coords_temp = X_valid_fold[['latitude', 'longitude']].fillna(coord_fill)
            
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
            kmeans.fit(train_coords_temp)
            
            X_train_fold['cluster'] = kmeans.predict(train_coords_temp)
            X_valid_fold['cluster'] = kmeans.predict(valid_coords_temp)
        
        model = lgb.LGBMRegressor(**params)
        model.fit(
            X_train_fold, 
            y_train_fold,
        )
        
        # Undo the log1p transform before scoring.
        y_pred_fold = model.predict(X_valid_fold)
        y_pred_orig_scale = np.expm1(y_pred_fold)
        y_valid_orig_scale = np.expm1(y_valid_fold)
        
        exp_score = competition_score(y_valid_orig_scale, y_pred_orig_scale)
        fold_scores.append(exp_score)
    
    return np.mean(fold_scores)

In [8]:
def train_final_model(X_train, y_train, params, model_type):
    """
    Trains a model (XGBoost or LightGBM) after performing feature engineering.

    The training features are cyclically encoded, optionally extended with a
    KMeans ``cluster`` column derived from latitude/longitude, and the model
    is fitted on ``log1p`` of the target. The fitted preprocessors are
    attached to the returned model so callers can reproduce the same
    transformation at prediction time.

    Args:
        X_train (pd.DataFrame): The training feature data.
        y_train (pd.Series): The training target data (original scale).
        params (dict): A dictionary of hyperparameters for the model.
                       May include 'n_clusters' for KMeans (default 30),
                       which is removed before the model is constructed.
        model_type (str): The type of model to train, either 'xgb' or 'lgb'.

    Returns:
        A trained model object (XGBRegressor or LGBMRegressor) with
        ``kmeans_``, ``lat_mean_`` and ``lon_mean_`` attached; all three are
        None when the data has no latitude/longitude columns.

    Raises:
        ValueError: If ``model_type`` is neither 'xgb' nor 'lgb'.
    """
    # 1. Preprocessing
    X_train_processed = create_cyclical_features(X_train.copy())
    
    # Preprocessor attributes stay None when no location data is available
    kmeans = None
    lat_mean = None
    lon_mean = None
    
    # Perform KMeans clustering if location data is available
    if 'latitude' in X_train_processed.columns and 'longitude' in X_train_processed.columns:
        n_clusters = params.get('n_clusters', 30)
        lat_mean = X_train_processed['latitude'].mean()
        lon_mean = X_train_processed['longitude'].mean()
        
        # DataFrame-level fillna returns a new, fully imputed frame. The
        # previous `frame[col].fillna(..., inplace=True)` form is chained
        # assignment, which does not update the frame under pandas
        # Copy-on-Write and would leave NaNs for KMeans to reject.
        coords_temp = X_train_processed[['latitude', 'longitude']].fillna(
            {'latitude': lat_mean, 'longitude': lon_mean}
        )
        
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        kmeans.fit(coords_temp)
        X_train_processed['cluster'] = kmeans.predict(coords_temp)
    
    # 2. Target Transformation
    y_train_log = np.log1p(y_train)
    
    # 3. Model-specific Training
    # Remove non-model parameters before passing to the model constructor
    model_params = {k: v for k, v in params.items() if k != 'n_clusters'}
    
    if model_type == 'xgb':
        model = xgb.XGBRegressor(**model_params)
    elif model_type == 'lgb':
        model = lgb.LGBMRegressor(**model_params)
    else:
        raise ValueError(f"Unsupported model_type: '{model_type}'. Choose 'xgb' or 'lgb'.")
    model.fit(X_train_processed, y_train_log)
    
    # 4. Attach preprocessing information to the trained model
    model.kmeans_ = kmeans
    model.lat_mean_ = lat_mean
    model.lon_mean_ = lon_mean
    
    return model

In [9]:
def create_optimized_ensemble(X_train, y_train, X_test, xgb_params, lgb_params, tss):
    """Train four models and blend their test predictions with fixed weights.

    The ensemble consists of the tuned XGBoost model (weight 0.45), the tuned
    LightGBM model (0.35), a deeper/slower XGBoost variant (0.15) and a
    smaller LightGBM variant (0.05). Each model is trained through
    ``train_final_model``; test features receive the same cyclical encoding
    and, when both coordinate columns exist, a KMeans ``cluster`` column
    fitted on the training coordinates.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target on the original scale.
        X_test (pd.DataFrame): Test features.
        xgb_params (dict): XGBoost parameters; must contain 'max_depth',
            'learning_rate' and 'n_estimators', may contain 'n_clusters'.
        lgb_params (dict): LightGBM parameters; must contain 'learning_rate',
            may contain 'num_leaves' and 'n_clusters'.
        tss: Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(final_pred, predictions)`` where ``final_pred`` is the
        weighted blend on the original scale and ``predictions`` is a list of
        ``(name, prediction, weight)`` tuples.
    """
    predictions = []
    
    X_test_processed = create_cyclical_features(X_test.copy())
    
    xgb_n_clusters = xgb_params.get('n_clusters', 30)
    lgb_n_clusters = lgb_params.get('n_clusters', 30)
    
    # One flag for every clustering step. Previously the coordinate frames
    # were built only when both columns existed while the clustering steps
    # checked 'latitude' alone, raising NameError without 'longitude'.
    has_coords = ('latitude' in X_test_processed.columns
                  and 'longitude' in X_test_processed.columns)
    train_coords_temp = None
    test_coords_temp = None
    
    if has_coords:
        X_train_temp = create_cyclical_features(X_train.copy())
        # Imputation values come from the training data only.
        coord_fill = {
            'latitude': X_train_temp['latitude'].mean(),
            'longitude': X_train_temp['longitude'].mean(),
        }
        
        # DataFrame-level fillna returns a new frame. The previous
        # `frame[col].fillna(..., inplace=True)` form is chained assignment,
        # which does not update the frame under pandas Copy-on-Write.
        test_coords_temp = X_test_processed[['latitude', 'longitude']].fillna(coord_fill)
        train_coords_temp = X_train_temp[['latitude', 'longitude']].fillna(coord_fill)
    
    def _test_features_with_clusters(n_clusters):
        """Return a copy of the test features, plus a 'cluster' column if possible."""
        X_out = X_test_processed.copy()
        if has_coords:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
            kmeans.fit(train_coords_temp)
            X_out['cluster'] = kmeans.predict(test_coords_temp)
        return X_out
    
    # XGBoost with optimal clustering
    X_test_xgb = _test_features_with_clusters(xgb_n_clusters)
    
    print("Training XGBoost with optimized parameters...")
    xgb_model = train_final_model(X_train, y_train, xgb_params, model_type='xgb')
    pred_xgb = xgb_model.predict(X_test_xgb)
    predictions.append(('xgb_optimized', np.expm1(pred_xgb), 0.45))
    
    # LightGBM with optimal clustering
    X_test_lgb = _test_features_with_clusters(lgb_n_clusters)
    
    print("Training LightGBM with optimized parameters...")
    lgb_model = train_final_model(X_train, y_train, lgb_params, model_type='lgb')
    pred_lgb = lgb_model.predict(X_test_lgb)
    predictions.append(('lgb_optimized', np.expm1(pred_lgb), 0.35))
    
    # XGBoost variant: deeper (capped at 15), lower learning rate, more trees
    print("Training XGBoost variant...")
    xgb_variant_params = xgb_params.copy()
    xgb_variant_params['max_depth'] = min(xgb_params['max_depth'] + 2, 15)
    xgb_variant_params['learning_rate'] = xgb_params['learning_rate'] * 0.8
    xgb_variant_params['n_estimators'] = int(xgb_params['n_estimators'] * 1.2)
    
    xgb_variant_model = train_final_model(X_train, y_train, xgb_variant_params, model_type='xgb')
    pred_xgb_variant = xgb_variant_model.predict(X_test_xgb)
    predictions.append(('xgb_variant', np.expm1(pred_xgb_variant), 0.15))
    
    # LightGBM variant: fewer leaves, slightly lower learning rate
    print("Training LightGBM variant...")
    lgb_variant_params = lgb_params.copy()
    lgb_variant_params['num_leaves'] = int(lgb_params.get('num_leaves', 31) * 0.8)
    lgb_variant_params['learning_rate'] = lgb_params['learning_rate'] * 0.9
    
    lgb_variant_model = train_final_model(X_train, y_train, lgb_variant_params, model_type='lgb')
    pred_lgb_variant = lgb_variant_model.predict(X_test_lgb)
    predictions.append(('lgb_variant', np.expm1(pred_lgb_variant), 0.05))
    
    # Weighted ensemble
    final_pred = np.zeros(len(X_test))
    print("\nEnsemble weights:")
    for name, pred, weight in predictions:
        print(f"  {name}: {weight:.2%}")
        final_pred += weight * pred
    
    return final_pred, predictions

In [10]:
def _print_section(title, width=40):
    """Print `title` framed by two '=' rules of `width` characters, preceded by a blank line."""
    print("\n" + "=" * width)
    print(title)
    print("=" * width)


def _run_optuna_study(objective, X, y, cv, n_trials, study_name, storage,
                      n_clusters_range=(20, 50), seed=42):
    """Create (or resume) a persistent Optuna study and run `n_trials` trials of `objective`.

    `objective` is called as ``objective(trial, X, y, cv, n_clusters_range=...)`` and its
    return value is maximized. The study lives in `storage`; because
    ``load_if_exists=True`` an existing study with the same name is reused rather than
    recreated.

    Returns the `optuna.Study` after optimization.
    """
    sampler = optuna.samplers.TPESampler(
        multivariate=True, group=True,
        # The first 10% of the requested trials are the sampler's start-up phase.
        n_startup_trials=int(0.1 * n_trials),
        constant_liar=True, seed=seed,
    )
    study = optuna.create_study(
        direction="maximize",
        study_name=study_name,
        sampler=sampler,
        storage=storage,
        load_if_exists=True,
    )
    study.optimize(
        lambda trial: objective(trial, X, y, cv, n_clusters_range=n_clusters_range),
        n_trials=n_trials,
    )
    return study


def enhanced_pipeline(train_df, test_df, target_col='pollution_value',
                     n_trials_xgb=100, n_trials_lgb=100, feature_dir=None):
    """Run the full pipeline: features -> per-model Optuna tuning -> weighted ensemble -> submission.

    Steps
    -----
    1. Split `train_df` into features and target; copy `test_df`.
    2. If `feature_dir` is given and exists on disk, load the saved feature generators
       and apply them to train and test; otherwise use the raw columns unchanged.
    3. Tune XGBoost and LightGBM in two separate Optuna studies, both evaluated with
       the same ``FixedWindowTimeSeriesSplit`` (5 splits, test_size=2700, gap=0,
       min_train_size=2700).
    4. Build the ensemble with ``create_optimized_ensemble`` using each study's best
       parameters, and clip the blended predictions at zero.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training table; must contain `target_col`.
    test_df : pandas.DataFrame
        Test table; must contain an ``'id'`` column (copied into the submission).
    target_col : str, default 'pollution_value'
        Name of the target column in `train_df`. The submission column is always
        named ``'pollution_value'`` regardless of this argument.
    n_trials_xgb, n_trials_lgb : int, default 100
        Number of Optuna trials to run for each model in this call.
    feature_dir : str or None, default None
        Directory holding saved feature generators. If it is falsy or does not
        exist, the pipeline continues with raw features.

    Returns
    -------
    tuple
        ``(submission, xgb_study, lgb_study)`` where `submission` is a DataFrame with
        columns ``'id'`` and ``'pollution_value'`` and the other two are the Optuna
        studies.

    Raises
    ------
    KeyError
        If `target_col` is missing from `train_df` or ``'id'`` is missing from `test_df`.

    Notes
    -----
    Both studies are stored in SQLite files in the working directory and opened with
    ``load_if_exists=True``, so re-running adds trials to the existing studies.
    NOTE(review): ``best_params`` may therefore come from an earlier run made with a
    different feature set or data; delete the ``.db`` files to start clean.
    """
    print("="*60)
    print("IMPROVED PIPELINE - NO LEAKAGE, SEPARATE OPTIMIZATION")
    print("="*60)

    # NOTE(review): only `target_col` is dropped, so an 'id' column in train_df stays
    # among the features (the test copy keeps it too). Confirm that the downstream
    # feature generation / preprocessing excludes it.
    X_train = train_df.drop(target_col, axis=1)
    y_train = train_df[target_col]
    X_test = test_df.copy()

    test_ids = test_df['id'].values

    if feature_dir and os.path.exists(feature_dir):
        generators = load_all_feature_generators(feature_dir)
        X_train_generated, X_test_generated = combine_feature_generators(generators, X_train, X_test)
    else:
        print("\nNo existing feature generators found. Using raw features.")
        X_train_generated = X_train
        X_test_generated = X_test

    print(f"Total features after generation: {X_train_generated.shape[1]}")

    # One splitter instance is shared by both studies and by the final ensemble.
    cv_splitter = FixedWindowTimeSeriesSplit(n_splits=5, test_size=2700, gap=0, min_train_size=2700)

    # Optimize XGBoost
    _print_section("OPTIMIZING XGBOOST")
    xgb_study = _run_optuna_study(
        xgb_objective, X_train_generated, y_train, cv_splitter,
        n_trials=n_trials_xgb,
        study_name="xgboost_optimization_improved",
        storage="sqlite:///xgb_optuna_improved.db",
    )
    xgb_best_params = xgb_study.best_params.copy()
    print(f"XGBoost best score: {xgb_study.best_value:.4f}")
    print(f"XGBoost best params: {xgb_best_params}")

    # Optimize LightGBM
    _print_section("OPTIMIZING LIGHTGBM")
    lgb_study = _run_optuna_study(
        lgb_objective, X_train_generated, y_train, cv_splitter,
        n_trials=n_trials_lgb,
        study_name="lightgbm_optimization",
        storage="sqlite:///lgb_optuna.db",
    )
    lgb_best_params = lgb_study.best_params.copy()
    print(f"LightGBM best score: {lgb_study.best_value:.4f}")
    print(f"LightGBM best params: {lgb_best_params}")

    # Generate ensemble predictions
    _print_section("GENERATING ENSEMBLE PREDICTIONS")
    # The second return value (per-model predictions) is not used here.
    final_predictions, _ = create_optimized_ensemble(
        X_train_generated, y_train, X_test_generated,
        xgb_best_params, lgb_best_params, cv_splitter
    )

    # Simple post-processing: ensure non-negative
    final_predictions = np.maximum(final_predictions, 0)

    # Create submission
    submission = pd.DataFrame({
        'id': test_ids,
        'pollution_value': final_predictions
    })

    # Print statistics: prediction distribution next to the training target's,
    # as a quick sanity check on scale.
    _print_section("PREDICTION SUMMARY")
    print(f"Mean prediction: {final_predictions.mean():.2f}")
    print(f"Std prediction: {final_predictions.std():.2f}")
    print(f"Min prediction: {final_predictions.min():.2f}")
    print(f"Max prediction: {final_predictions.max():.2f}")

    print("\nTraining target stats:")
    print(f"Mean: {y_train.mean():.2f}")
    print(f"Std: {y_train.std():.2f}")
    print(f"Min: {y_train.min():.2f}")
    print(f"Max: {y_train.max():.2f}")

    return submission, xgb_study, lgb_study

In [11]:
# Load the raw train/test tables
import pandas as pd

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
year = 2023

# Put the training rows in chronological order.
# A temporary 'datetime' key is built from day_of_year ('%j') plus the hour offset;
# values that fail to parse become NaT (errors='coerce'). The parsed date does not
# carry the intended year, so it is overwritten with `year` for non-null entries.
# The key is only used for sorting and is dropped again afterwards.
train_df = (
    train_df
    .assign(
        datetime=lambda frame: (
            pd.to_datetime(frame['day_of_year'], format='%j', errors='coerce')
            + pd.to_timedelta(frame['hour'], unit='h')
        ).apply(lambda ts: ts if pd.isnull(ts) else ts.replace(year=year))
    )
    .sort_values(by='datetime')
    .drop(columns='datetime')
    # CRUCIAL: renumber rows 0..n-1 in sorted order.
    # NOTE(review): presumably needed because the time-series splitter selects rows
    # by position; confirm against the splitter implementation.
    .reset_index(drop=True)
)

# Target and feature matrix ('id' is excluded from the features here)
y_train = train_df['pollution_value']
X_train = train_df.drop(columns=['pollution_value', 'id'])

In [None]:
# Stage 1: feature-generation runs, one per run_id.
# Settings shared by every run are collected once; only run_id varies.
generation_settings = dict(
    save_dir="model_rredone_10_folds",
    max_new_feats=100,
    n_generations=100,
)

# range(8, 9) executes a single run with run_id == 8; widen the range for more runs.
for run_id in range(8, 9):
    X_generated, generator = run_single_feature_generation(
        X_train, y_train, run_id=run_id, **generation_settings
    )

Running feature generation run 8...
Starting regression on cpu - 7649 samples, 6 features
Params: gen=100, parents=40, children=200, limit=100
Gen 0: Train rmse_exp=0.91326, Val rmse_exp=0.64187


Generations:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

State saved to model_rredone_10_folds/feature_generator_run_8.pkl
Gen 1: Added 7 features, 13 total (7 new). Train rmse_exp=0.92998, Val rmse_exp=0.65622. Score improved by 0.01435. Status: NONE, Strategy success: HM:0.50, BS:0.50, N:1.00
  Simple: ['longitude_mean_month', 'longitude_square', 'day_of_week_geometric_mean_month', 'hour_angle_between_month', 'longitude_add_day_of_week', 'day_of_year_pow_day_of_week', 'latitude_percent_change_longitude']


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 2: Added 0 features, 13 total (7 new). Train rmse_exp=0.92998, Val rmse_exp=0.65622. No improvement. Status: NONE, Strategy success: HM:0.50, BS:0.50, N:0.50


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 3: Added 0 features, 13 total (7 new). Train rmse_exp=0.92998, Val rmse_exp=0.65622. No improvement. Status: MILD, Strategy success: HM:0.50, BS:0.50, N:0.33


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

State saved to model_rredone_10_folds/feature_generator_run_8.pkl
Gen 4: Added 1 features, 14 total (8 new). Train rmse_exp=0.92843, Val rmse_exp=0.65790. Score improved by 0.00169. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.50
  Simple: ['longitude_mean_month_arccos']


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 5: Added 0 features, 14 total (8 new). Train rmse_exp=0.92843, Val rmse_exp=0.65790. No improvement. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.40


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 6: Added 0 features, 14 total (8 new). Train rmse_exp=0.92843, Val rmse_exp=0.65790. No improvement. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.33


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

State saved to model_rredone_10_folds/feature_generator_run_8.pkl
Gen 7: Added 1 features, 15 total (9 new). Train rmse_exp=0.92936, Val rmse_exp=0.65834. Score improved by 0.00044. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.43
  Simple: ['longitude_sign']


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 8: Added 0 features, 15 total (9 new). Train rmse_exp=0.92936, Val rmse_exp=0.65834. No improvement. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.38


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 9: Added 0 features, 15 total (9 new). Train rmse_exp=0.92936, Val rmse_exp=0.65834. No improvement. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.33


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

Gen 10: Added 0 features, 15 total (9 new). Train rmse_exp=0.92936, Val rmse_exp=0.65834. No improvement. Status: MODERATE, Strategy success: HM:0.50, BS:0.50, N:0.30


Evaluating features:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Stage 2: tune both models, build the ensemble and write the submission file.
# NOTE(review): feature_dir is "model_rredone" while Stage 1 saves its generators to
# "model_rredone_10_folds". If "model_rredone" does not exist, enhanced_pipeline
# falls back to raw features. Confirm this is the intended directory.
submission, xgb_study, lgb_study = enhanced_pipeline(
    train_df=train_df,
    test_df=test_df,
    target_col='pollution_value',
    n_trials_xgb=100,
    n_trials_lgb=100,
    feature_dir="model_rredone",
)

submission.to_csv('submission_enhanced.csv', index=False)
print("Submission saved!")