# Enhanced ML Pipeline - Modular Feature Generation

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KernelDensity
import xgboost as xgb
import lightgbm as lgb
import optuna
from tabularaml.generate.features import FeatureGenerator
from tabularaml.eval.scorers import Scorer
from spatial_temporal_cv import SpatialTemporalKFold, FixedWindowTimeSeriesSplit
import gc

In [None]:
def rmse_exp(y_true, y_pred):
    """Return ``exp(-RMSE / 100)`` for the given targets and predictions.

    The value is 1.0 for a perfect prediction and decays towards 0 as the
    root-mean-squared error grows, so larger is better.
    """
    mse = mean_squared_error(y_true, y_pred)
    root_mse = np.sqrt(mse)
    return np.exp(-root_mse / 100)

def competition_score(y_true, y_pred):
    """Score predictions as ``exp(-RMSE / 100)`` (higher is better).

    RMSE is the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    return np.exp(-np.sqrt(mean_squared_error(y_true, y_pred)) / 100)

def create_cyclical_features(df):
    """Return a copy of ``df`` with periodic time columns sin/cos encoded.

    For each of ``hour`` (period 24), ``day_of_week`` (period 7) and
    ``day_of_year`` (period 365) that is present, a ``<prefix>_sin`` and
    ``<prefix>_cos`` column is appended (prefixes ``hour``, ``dow``,
    ``doy``). The raw ``hour``, ``day_of_week``, ``day_of_year`` and
    ``month`` columns are then removed; ``month`` gets no encoding of its
    own. The input frame is left untouched.
    """
    # (source column, output prefix, period length)
    encodings = (
        ('hour', 'hour', 24.0),
        ('day_of_week', 'dow', 7.0),
        ('day_of_year', 'doy', 365.0),
    )

    result = df.copy()
    for source, prefix, period in encodings:
        if source not in result.columns:
            continue
        angle = 2 * np.pi * result[source] / period
        result[f'{prefix}_sin'] = np.sin(angle)
        result[f'{prefix}_cos'] = np.cos(angle)

    # Drop the raw calendar columns now that their encodings exist.
    raw_time_cols = [
        name
        for name in ('hour', 'day_of_week', 'day_of_year', 'month')
        if name in result.columns
    ]
    if raw_time_cols:
        result = result.drop(columns=raw_time_cols)

    return result

In [None]:
def create_enhanced_features(X_train, y_train, X_test):
    """Append hand-crafted spatial and time-of-day features to both frames.

    Added columns (on copies of the inputs):
      * ``spatial_density`` - exp of the log-density from a Gaussian
        ``KernelDensity`` (bandwidth 2.0) fitted on the training
        latitude/longitude only; missing coordinates in either frame are
        imputed with the training column means before fitting/scoring.
      * ``is_rush_hour`` - 1 when ``hour`` is one of 7, 8, 9, 17, 18, 19.
      * ``is_night`` - 1 when ``hour`` >= 22 or ``hour`` <= 5.
      * ``lat_hour`` - product of ``latitude`` and ``hour``.

    ``y_train`` is accepted for signature symmetry but is not used.

    Returns:
        Tuple ``(X_train_enhanced, X_test_enhanced)``.
    """
    coord_cols = ['latitude', 'longitude']
    rush_hours = [7, 8, 9, 17, 18, 19]

    # Imputation statistics come from the training frame for both splits.
    train_coord_means = X_train[coord_cols].mean()
    coords_train = X_train[coord_cols].fillna(train_coord_means)
    coords_test = X_test[coord_cols].fillna(train_coord_means)

    kde = KernelDensity(bandwidth=2.0, kernel='gaussian')
    kde.fit(coords_train)

    enhanced_frames = []
    for raw, coords in ((X_train, coords_train), (X_test, coords_test)):
        enriched = raw.copy()
        hour = raw['hour']
        # score_samples returns log-density, hence the exp.
        enriched['spatial_density'] = np.exp(kde.score_samples(coords))
        enriched['is_rush_hour'] = hour.isin(rush_hours).astype(int)
        enriched['is_night'] = ((hour >= 22) | (hour <= 5)).astype(int)
        enriched['lat_hour'] = raw['latitude'] * hour
        enhanced_frames.append(enriched)

    X_train_enhanced, X_test_enhanced = enhanced_frames
    return X_train_enhanced, X_test_enhanced

In [None]:
def run_single_feature_generation(X_train, y_train, save_dir="./features", run_id=1, max_new_feats=2000, n_generations=2000):
    """Run one ``FeatureGenerator`` search and return the transformed data.

    The search is scored with ``rmse_exp`` (higher is better) under a
    5-split ``SpatialTemporalKFold``. ``save_dir`` is created if needed and
    the generator is given
    ``<save_dir>/feature_generator_run_<run_id>.pkl`` as its ``save_path``.

    Returns:
        Tuple ``(X_generated, generator)`` where ``X_generated`` is
        ``generator.transform(X_train)`` after the search.
    """
    os.makedirs(save_dir, exist_ok=True)

    cv_splitter = SpatialTemporalKFold(
        n_splits=5,
        spatial_clusters=30,
        temporal_clusters=10,
        random_state=42,
    )
    scorer = Scorer(
        name="rmse_exp",
        scorer=rmse_exp,
        greater_is_better=True,
        extra_params={},
        from_probs=False,
    )

    print(f"Running feature generation run {run_id}...")

    generator_kwargs = {
        "task": "regression",
        "scorer": scorer,
        "max_new_feats": max_new_feats,
        "cv": cv_splitter,
        "n_generations": n_generations,
        "save_path": f"{save_dir}/feature_generator_run_{run_id}.pkl",
    }
    generator = FeatureGenerator(**generator_kwargs)

    # The search result object itself is not needed; only the fitted generator is.
    generator.search(X_train, y_train)
    X_generated = generator.transform(X_train)

    print(f"Run {run_id} complete: {X_generated.shape[1]} features generated")
    return X_generated, generator

In [None]:
def load_all_feature_generators(feature_dir):
    """Unpickle every ``*.pkl`` file in ``feature_dir``.

    Files are loaded in lexicographically sorted path order.

    NOTE: ``pickle.load`` can execute arbitrary code, so ``feature_dir``
    must only contain files from a trusted source.

    Returns:
        List of the unpickled objects, one per file.

    Raises:
        ValueError: if the directory contains no ``*.pkl`` files.
    """
    pickle_paths = sorted(glob.glob(os.path.join(feature_dir, "*.pkl")))
    if not pickle_paths:
        raise ValueError(f"No feature generator files found in {feature_dir}")

    print(f"Loading {len(pickle_paths)} feature generators from {feature_dir}")

    loaded = []
    for path in pickle_paths:
        with open(path, 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded

def combine_feature_generators(generators, X_train, X_test=None):
    """Apply every generator and merge the outputs column-wise.

    Each generator's ``transform`` is called on ``X_train`` (and on
    ``X_test`` when given). The resulting frames are concatenated along the
    column axis and, where a column name occurs more than once, only its
    first occurrence is kept.

    Returns:
        The combined training frame, or ``(train, test)`` when ``X_test``
        is provided.
    """
    print(f"Combining features from {len(generators)} generators...")

    def _stack_unique(frames):
        # Concatenate side by side, then keep the first copy of each column name.
        merged = pd.concat(frames, axis=1)
        first_occurrence = ~merged.columns.duplicated()
        return merged.loc[:, first_occurrence]

    has_test = X_test is not None
    train_frames, test_frames = [], []
    for gen in generators:
        train_frames.append(gen.transform(X_train))
        if has_test:
            test_frames.append(gen.transform(X_test))

    X_combined_train = _stack_unique(train_frames)
    if not has_test:
        return X_combined_train

    return X_combined_train, _stack_unique(test_frames)

In [None]:
def enhanced_objective(trial, X_base, y_train, tss, n_clusters_range=(20, 50)):
    """Optuna objective: mean cross-validated competition score of an XGBoost model.

    For each split produced by ``tss.split`` the function
      1. fits KMeans on the training fold's latitude/longitude (NaNs imputed
         with the training-fold means) and adds the cluster id as a
         ``cluster`` feature to both folds,
      2. trains an ``XGBRegressor`` on the ``log1p`` target with early
         stopping (50 rounds) monitored on the validation fold,
      3. scores the ``expm1``-transformed predictions with
         ``competition_score`` on the original target scale.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_base: Feature frame; passed through ``create_cyclical_features``.
        y_train: Target values; indexed positionally with ``.iloc``.
        tss: Splitter exposing ``split(X, y)`` that yields
            ``(train_idx, valid_idx)`` pairs.
        n_clusters_range: Inclusive ``(low, high)`` bounds for the sampled
            number of KMeans clusters.

    Returns:
        Mean of the per-fold competition scores (higher is better).

    NOTE(review): early stopping and scoring use the same validation fold,
    so the returned score is somewhat optimistic.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 30),
        'gamma': trial.suggest_float('gamma', 0, 20),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.3, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'max_leaves': trial.suggest_int('max_leaves', 0, 2000),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'max_bin': trial.suggest_int('max_bin', 32, 512),
        'objective': 'reg:squarederror',
        'enable_categorical': True,
        'random_state': 42,
        'n_jobs': -1,
        # Early stopping is configured on the estimator: passing it to fit()
        # is deprecated since XGBoost 1.6 and rejected by newer releases.
        'early_stopping_rounds': 50,
    }

    n_clusters = trial.suggest_int('n_clusters', n_clusters_range[0], n_clusters_range[1])

    # create_cyclical_features already works on a copy of its input.
    X_base_cyclic = create_cyclical_features(X_base)
    y = np.log1p(y_train)

    coord_cols = ['latitude', 'longitude']
    has_coords = all(col in X_base_cyclic.columns for col in coord_cols)

    fold_scores = []

    for train_idx, valid_idx in tss.split(X_base_cyclic, y):
        X_train_fold = X_base_cyclic.iloc[train_idx].copy()
        X_valid_fold = X_base_cyclic.iloc[valid_idx].copy()
        y_train_fold = y.iloc[train_idx]
        y_valid_fold = y.iloc[valid_idx]

        if has_coords:
            # Imputation values come from the training fold only, so nothing
            # leaks from the validation fold. A non-inplace fillna is used
            # because chained ``df[col].fillna(..., inplace=True)`` is not
            # guaranteed to write back under pandas Copy-on-Write.
            coord_means = X_train_fold[coord_cols].mean()
            train_coords = X_train_fold[coord_cols].fillna(coord_means)
            valid_coords = X_valid_fold[coord_cols].fillna(coord_means)

            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
            kmeans.fit(train_coords)

            X_train_fold['cluster'] = kmeans.predict(train_coords)
            X_valid_fold['cluster'] = kmeans.predict(valid_coords)

        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            verbose=False,
        )

        # Undo the log1p transform so the score is on the original scale.
        y_pred_orig_scale = np.expm1(model.predict(X_valid_fold))
        y_valid_orig_scale = np.expm1(y_valid_fold)

        fold_scores.append(competition_score(y_valid_orig_scale, y_pred_orig_scale))

    return np.mean(fold_scores)

In [None]:
def train_final_model(X_train, y_train, params, model_type='xgb'):
    """Fit one final model on the full training data (``log1p`` target).

    The features are passed through ``create_cyclical_features`` and, when
    latitude/longitude columns are present, a KMeans ``cluster`` column is
    added (NaN coordinates imputed with the column means).

    Args:
        X_train: Training feature frame.
        y_train: Training target on the original scale.
        params: Hyperparameters in XGBoost naming, optionally including
            ``n_clusters`` (default 30) for the KMeans step. For
            ``model_type='lgb'`` the relevant entries are translated to
            their LightGBM counterparts.
        model_type: ``'xgb'`` or ``'lgb'``.

    Returns:
        The fitted regressor. The preprocessing state needed at prediction
        time is attached as ``kmeans_``, ``lat_mean_`` and ``lon_mean_``
        (each ``None`` when no clustering was performed). Predictions of
        the model are on the ``log1p`` scale.

    Raises:
        ValueError: if ``model_type`` is neither ``'xgb'`` nor ``'lgb'``.
    """
    if model_type not in ('xgb', 'lgb'):
        # Previously an unknown type fell through and crashed later with an
        # UnboundLocalError on ``model``.
        raise ValueError(f"Unsupported model_type {model_type!r}; expected 'xgb' or 'lgb'")

    X_train_cyclic = create_cyclical_features(X_train)

    n_clusters = params.get('n_clusters', 30)
    kmeans = lat_mean = lon_mean = None
    coord_cols = ['latitude', 'longitude']
    if all(col in X_train_cyclic.columns for col in coord_cols):
        lat_mean = X_train_cyclic['latitude'].mean()
        lon_mean = X_train_cyclic['longitude'].mean()

        # Non-inplace fillna: chained ``df[col].fillna(..., inplace=True)`` is
        # not guaranteed to modify the frame under pandas Copy-on-Write.
        coords = X_train_cyclic[coord_cols].fillna(
            {'latitude': lat_mean, 'longitude': lon_mean}
        )

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
        kmeans.fit(coords)
        X_train_cyclic['cluster'] = kmeans.predict(coords)

    y_train_log = np.log1p(y_train)

    if model_type == 'xgb':
        model_params = {k: v for k, v in params.items() if k != 'n_clusters'}
        # Fixed settings used during tuning are not part of Optuna's
        # ``best_params``; restore them unless the caller overrides them.
        for key, value in (
            ('objective', 'reg:squarederror'),
            ('enable_categorical', True),
            ('random_state', 42),
            ('n_jobs', -1),
        ):
            model_params.setdefault(key, value)
        model = xgb.XGBRegressor(**model_params)
        model.fit(X_train_cyclic, y_train_log)
    else:
        # LightGBM requires 1 < num_leaves <= 131072; 2**max_depth - 1 leaves
        # that range once max_depth reaches 18, so clamp it.
        depth = params.get('max_depth', 6)
        num_leaves = max(2, min(2 ** depth - 1, 131072))
        lgb_params = {
            'n_estimators': params.get('n_estimators', 1000),
            'learning_rate': params.get('learning_rate', 0.05),
            'num_leaves': num_leaves,
            'feature_fraction': params.get('colsample_bytree', 0.8),
            'bagging_fraction': params.get('subsample', 0.8),
            'bagging_freq': 1,
            'lambda_l1': params.get('reg_alpha', 0),
            'lambda_l2': params.get('reg_lambda', 1),
            'min_data_in_leaf': int(params.get('min_child_weight', 1)),
            'random_state': 42,
            'verbose': -1
        }
        model = lgb.LGBMRegressor(**lgb_params)
        # Verbosity is already set through the constructor; fit() in
        # LightGBM 4.x no longer accepts a ``verbose`` argument.
        model.fit(X_train_cyclic, y_train_log)

    # Keep the preprocessing state so test data can be transformed identically.
    model.kmeans_ = kmeans
    model.lat_mean_ = lat_mean
    model.lon_mean_ = lon_mean

    return model

In [None]:
def _prepare_ensemble_features(model, X_test):
    """Apply to ``X_test`` the preprocessing ``train_final_model`` applied to the training data."""
    X_prepared = create_cyclical_features(X_test)

    kmeans = getattr(model, 'kmeans_', None)
    if kmeans is not None:
        # Reuse the training-set imputation values and the fitted KMeans so
        # the ``cluster`` feature means the same thing as during training.
        coords = X_prepared[['latitude', 'longitude']].fillna(
            {'latitude': model.lat_mean_, 'longitude': model.lon_mean_}
        )
        X_prepared['cluster'] = kmeans.predict(coords)

    return X_prepared


def create_model_ensemble(X_train, y_train, X_test, best_params, tss):
    """Train three models and blend their test predictions.

    Members and blend weights:
      * ``xgb_best`` (0.7) - XGBoost with ``best_params``.
      * ``lgb`` (0.2) - LightGBM with parameters derived from ``best_params``.
      * ``xgb_variant`` (0.1) - XGBoost with ``max_depth`` raised by 2
        (capped at 15) and the learning rate scaled by 0.8.

    Each model is trained by ``train_final_model`` on the ``log1p`` target,
    so the test frame is first given the same preprocessing (cyclical
    encoding plus KMeans ``cluster`` column) and every prediction is mapped
    back with ``expm1`` before blending.

    Args:
        X_train: Training feature frame.
        y_train: Training target on the original scale.
        X_test: Test feature frame with the same raw columns as ``X_train``.
        best_params: Tuned hyperparameters; must contain ``max_depth`` and
            ``learning_rate``.
        tss: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(final_pred, predictions)`` where ``final_pred`` is the
        weighted blend on the original target scale and ``predictions`` is
        a list of ``(name, prediction, weight)`` tuples.
    """
    def _fit_and_predict(params, model_type):
        model = train_final_model(X_train, y_train, params, model_type=model_type)
        log_pred = model.predict(_prepare_ensemble_features(model, X_test))
        # Models are fitted on log1p(y); return to the original scale.
        return np.expm1(log_pred)

    predictions = []

    print("Training XGBoost with best parameters...")
    pred_xgb = _fit_and_predict(best_params, 'xgb')
    predictions.append(('xgb_best', pred_xgb, 0.7))

    print("Training LightGBM...")
    pred_lgb = _fit_and_predict(best_params, 'lgb')
    predictions.append(('lgb', pred_lgb, 0.2))

    print("Training XGBoost variant...")
    xgb_variant_params = best_params.copy()
    xgb_variant_params['max_depth'] = min(best_params['max_depth'] + 2, 15)
    xgb_variant_params['learning_rate'] = best_params['learning_rate'] * 0.8
    pred_xgb_variant = _fit_and_predict(xgb_variant_params, 'xgb')
    predictions.append(('xgb_variant', pred_xgb_variant, 0.1))

    final_pred = np.zeros(len(X_test))
    for _name, pred, weight in predictions:
        final_pred += weight * pred

    return final_pred, predictions

In [None]:
def advanced_post_processing(train_df, y_train, test_df, predictions):
    """Calibrate predictions against the training target and damp extrapolation.

    Two adjustments are applied to a copy of ``predictions``:
      1. Median calibration - when the prediction median differs from the
         training-target median by more than 10% of the latter, all
         predictions are rescaled so that the medians match. The step is
         skipped when either median is not strictly positive, because the
         ratio would be undefined or would collapse every prediction.
      2. Latitude extrapolation - predictions for test rows whose latitude
         lies outside the training latitude range are shrunk towards the
         mean of the in-range predictions
         (``0.7 * prediction + 0.3 * in_range_mean``). The step is skipped
         when no in-range rows exist.

    Args:
        train_df: Training frame with a ``latitude`` column.
        y_train: Training target values.
        test_df: Test frame with a ``latitude`` column, row-aligned with
            ``predictions``.
        predictions: Array-like of predictions; never modified in place.

    Returns:
        A new float ``numpy.ndarray`` with the adjusted predictions.
    """
    # Always work on a private float copy so the caller's data is untouched.
    adjusted = np.array(predictions, dtype=float)
    if adjusted.size == 0:
        return adjusted

    train_median = np.percentile(y_train, 50)
    pred_median = np.percentile(adjusted, 50)

    medians_usable = train_median > 0 and pred_median > 0
    if medians_usable and abs(train_median - pred_median) > 0.1 * train_median:
        adjusted *= train_median / pred_median

    test_lat = test_df['latitude']
    train_lat_min = train_df['latitude'].min()
    train_lat_max = train_df['latitude'].max()

    # Positional boolean mask; NaN latitudes compare False and are left alone.
    extreme_mask = ((test_lat < train_lat_min) | (test_lat > train_lat_max)).to_numpy()
    if extreme_mask.any() and not extreme_mask.all():
        in_range_mean = adjusted[~extreme_mask].mean()
        adjusted[extreme_mask] = 0.7 * adjusted[extreme_mask] + 0.3 * in_range_mean

    return adjusted

In [None]:
def enhanced_pipeline(train_df, test_df, target_col='pollution_value', n_trials=1000, feature_dir=None):
    """Run Stage 2 end to end: features, tuning, ensemble and submission.

    Steps:
      1. Split ``train_df`` into features/target and add the hand-crafted
         features via ``create_enhanced_features``.
      2. Load the pickled feature generators from ``feature_dir`` and apply
         them to both frames. If ``feature_dir`` is falsy or does not
         exist, a message is printed and ``(None, None, None)`` is returned.
      3. Tune XGBoost with Optuna (TPE sampler, SQLite-backed study that is
         resumed if it already exists) using ``enhanced_objective``.
      4. Train the ensemble, clip predictions at zero and post-process them.

    Args:
        train_df: Training frame containing ``target_col``.
        test_df: Test frame containing an ``id`` column.
        target_col: Name of the target column in ``train_df``.
        n_trials: Number of Optuna trials to run.
        feature_dir: Directory holding the pickled feature generators.

    Returns:
        Tuple ``(submission, study, best_params)``; ``submission`` has the
        columns ``id`` and ``pollution_value``.
    """
    banner = "="*60
    print(banner)
    print("ENHANCED PIPELINE - BUILDING ON SUCCESS")
    print(banner)

    X_train = train_df.drop(target_col, axis=1)
    y_train = train_df[target_col]
    X_test = test_df.copy()
    test_ids = test_df['id'].values

    print("\nApplying enhanced features...")
    X_train_enhanced, X_test_enhanced = create_enhanced_features(X_train, y_train, X_test)

    # Stage 1 output is a hard prerequisite; bail out early without it.
    generators_available = bool(feature_dir) and os.path.exists(feature_dir)
    if not generators_available:
        print("\nNo existing feature generators found. Please run Stage 1 first.")
        return None, None, None

    print(f"\nLoading existing feature generators from {feature_dir}...")
    generators = load_all_feature_generators(feature_dir)
    X_train_generated, X_test_generated = combine_feature_generators(
        generators, X_train_enhanced, X_test_enhanced
    )

    print(f"Total features after generation: {X_train_generated.shape[1]}")

    print("\nOptimizing hyperparameters...")

    tss = FixedWindowTimeSeriesSplit(n_splits=5, test_size=2700, gap=0, min_train_size=2700)

    sampler = optuna.samplers.TPESampler(
        multivariate=True,
        group=True,
        n_startup_trials=20,
        constant_liar=True,
        seed=42,
    )
    study = optuna.create_study(
        direction="maximize",
        study_name="xgboost_optimization_enhanced",
        sampler=sampler,
        storage="sqlite:///xgb_optuna_enhanced.db",
        load_if_exists=True,
    )

    def objective_func(trial):
        # Bind the data and splitter so Optuna only has to supply the trial.
        return enhanced_objective(trial, X_train_generated, y_train, tss, n_clusters_range=(20, 50))

    study.optimize(objective_func, n_trials=n_trials)

    best_params = study.best_params.copy()
    print(f"\nBest score: {study.best_value:.4f}")

    print("\nGenerating final predictions...")
    final_predictions, model_predictions = create_model_ensemble(
        X_train_generated, y_train, X_test_generated, best_params, tss
    )

    print("\nApplying post-processing...")
    clipped_predictions = np.maximum(final_predictions, 0)
    final_predictions = advanced_post_processing(train_df, y_train, test_df, clipped_predictions)

    submission = pd.DataFrame({'id': test_ids, 'pollution_value': final_predictions})

    return submission, study, best_params

In [None]:
# Read the raw training and test tables from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Separate the target column from the training features.
y_train = train_df['pollution_value']
X_train = train_df.drop(columns=['pollution_value'])

# Add the hand-crafted spatial / time-of-day features to both splits.
X_train_enhanced, X_test_enhanced = create_enhanced_features(
    X_train, y_train, test_df
)

In [None]:
# Stage 1: three independent feature-generation runs (run ids 1, 2 and 3),
# each handed its own save path under ./features by the helper.
_stage1_settings = {
    "save_dir": "./features",
    "max_new_feats": 2000,
    "n_generations": 2000,
}
for run_id in (1, 2, 3):
    X_generated, generator = run_single_feature_generation(
        X_train_enhanced,
        y_train,
        run_id=run_id,
        **_stage1_settings,
    )

In [None]:
# Stage 2: Run complete pipeline
submission, study, best_params = enhanced_pipeline(
    train_df, 
    test_df,
    target_col='pollution_value',
    n_trials=1000,
    feature_dir="./features"
)

# enhanced_pipeline returns (None, None, None) when the Stage 1 feature
# generators are missing; guard so the cell does not fail with an
# AttributeError on ``None.to_csv``.
if submission is None:
    print("No submission produced - run Stage 1 to create ./features first.")
else:
    submission.to_csv('submission_enhanced.csv', index=False)
    print("Submission saved!")