In [1]:
# Cell 1: Imports and Setup
import os, gc, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

warnings.filterwarnings("ignore")
np.random.seed(42)

# List input files
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:10]:
        print(os.path.join(dirname, filename))

/kaggle/input/eee-g513/train.csv
/kaggle/input/eee-g513/test.csv
/kaggle/input/eee-g513/cost_of_living.csv


In [2]:
# Cell 2: Load Data with Proper Handling
DATA_PATH = "/kaggle/input/eee-g513"

train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw = pd.read_csv(f"{DATA_PATH}/test.csv")
colv_path = f"{DATA_PATH}/cost_of_living.csv"
colv = pd.read_csv(colv_path) if os.path.exists(colv_path) else None

print(f"Train: {train_raw.shape}, Test: {test_raw.shape}, COL: {colv.shape if colv is not None else None}")
print(f"\nTarget stats:\n{train_raw['salary_average'].describe()}")
print(f"\nMissing in train:\n{train_raw.isnull().sum()[train_raw.isnull().sum() > 0]}")

Train: (6525, 6), Test: (2790, 5), COL: (1528, 56)

Target stats:
count      6480.000000
mean      53169.866584
std       27592.877914
min        3836.556547
25%       28697.390996
50%       54673.064063
75%       72742.000000
max      157747.634644
Name: salary_average, dtype: float64

Missing in train:
salary_average    45
dtype: int64


In [3]:
# Cell 3: Smart COL Merge - More Aggressive Feature Creation
if colv is not None:
    # Identify merge keys
    if 'city_id' in train_raw.columns and 'city_id' in colv.columns:
        keys = ['city_id']
    else:
        keys = ['city', 'country']
    
    # Create MULTIPLE aggregations (mean, median, std) for robustness
    colv_mean = colv.groupby(keys, as_index=False).mean(numeric_only=True)
    colv_median = colv.groupby(keys, as_index=False).median(numeric_only=True)
    colv_std = colv.groupby(keys, as_index=False).std(numeric_only=True)
    
    # Rename std columns
    for c in colv_std.columns:
        if c not in keys:
            colv_std.rename(columns={c: f'{c}_std'}, inplace=True)
    
    # Merge all
    train = train_raw.merge(colv_mean, on=keys, how='left', suffixes=('', '_drop'))
    train = train.merge(colv_median, on=keys, how='left', suffixes=('', '_median'))
    train = train.merge(colv_std, on=keys, how='left')
    
    test = test_raw.merge(colv_mean, on=keys, how='left', suffixes=('', '_drop'))
    test = test.merge(colv_median, on=keys, how='left', suffixes=('', '_median'))
    test = test.merge(colv_std, on=keys, how='left')
    
    # Drop duplicate columns
    train = train[[c for c in train.columns if not c.endswith('_drop')]]
    test = test[[c for c in test.columns if not c.endswith('_drop')]]
else:
    train, test = train_raw.copy(), test_raw.copy()

print(f"After merge - Train: {train.shape}, Test: {test.shape}")

After merge - Train: (6525, 165), Test: (2790, 164)


In [4]:
# Cell 4: Target Cleaning and Feature Selection
TARGET = "salary_average"

# Remove invalid targets
before = len(train)
train = train[train[TARGET].notna() & (train[TARGET] > 0)].reset_index(drop=True)
print(f"Dropped {before - len(train)} invalid targets")

# Identify common features (exclude target)
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]
print(f"Features: {len(feat_cols)}")

Dropped 45 invalid targets
Features: 164


In [5]:
# Cell 5: Advanced Feature Engineering
def create_interaction_features(df):
    """Create domain-informed interaction features"""
    df = df.copy()
    
    # Find COL columns
    cols_lower = {c.lower(): c for c in df.columns}
    
    def find_col(keywords):
        for kw in keywords:
            for lower, actual in cols_lower.items():
                if kw in lower:
                    return actual
        return None
    
    # Key COL indicators
    ppp = find_col(['purchasing', 'power'])
    rent = find_col(['rent'])
    groc = find_col(['grocer'])
    rest = find_col(['restaurant'])
    trans = find_col(['transport'])
    local = find_col(['local_purchasing'])
    
    # === NEW: More sophisticated ratios ===
    if ppp and rent:
        if pd.api.types.is_numeric_dtype(df[ppp]) and pd.api.types.is_numeric_dtype(df[rent]):
            df['rent_ppp_ratio'] = df[rent] / (df[ppp] + 1)
            df['ppp_rent_sqrt'] = np.sqrt(df[ppp] * df[rent])
    
    if groc and rest:
        if pd.api.types.is_numeric_dtype(df[groc]) and pd.api.types.is_numeric_dtype(df[rest]):
            df['dining_out_ratio'] = df[rest] / (df[groc] + 1)
    
    if rent and groc:
        if pd.api.types.is_numeric_dtype(df[rent]) and pd.api.types.is_numeric_dtype(df[groc]):
            df['housing_food_burden'] = df[rent] + df[groc]
            df['housing_food_ratio'] = df[rent] / (df[groc] + 1)
    
    # === NEW: Composite indices ===
    cost_cols = []
    for col_name in [rent, groc, rest, trans]:
        if col_name and pd.api.types.is_numeric_dtype(df[col_name]):
            cost_cols.append(col_name)
    
    if len(cost_cols) >= 2:
        df['composite_col_mean'] = df[cost_cols].mean(axis=1)
        df['composite_col_max'] = df[cost_cols].max(axis=1)
        df['composite_col_min'] = df[cost_cols].min(axis=1)
        df['composite_col_range'] = df['composite_col_max'] - df['composite_col_min']
    
    # === NEW: PPP-adjusted indices ===
    if ppp and len(cost_cols) >= 2:
        if pd.api.types.is_numeric_dtype(df[ppp]):
            df['affordability_index'] = df['composite_col_mean'] / (df[ppp] + 1)
    
    return df

train = create_interaction_features(train)
test = create_interaction_features(test)

# Update feature list
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]
print(f"Total features after engineering: {len(feat_cols)}")

Total features after engineering: 164


In [6]:
# Cell 6: Type Detection and Numeric Preprocessing (LESS restrictive)
# Convert numeric-like objects
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c], errors='coerce')
        if tr_num.notna().mean() > 0.8:  # More lenient
            train[c] = tr_num
            test[c] = te_num

# Separate numeric and categorical
num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

# LESS aggressive outlier handling (1% and 99% instead of 0.5% and 99.5%)
for c in num_cols:
    q_low, q_high = train[c].quantile([0.01, 0.99])
    train[c] = train[c].clip(q_low, q_high)
    test[c] = test[c].clip(q_low, q_high)

# Only log-transform VERY skewed features (skew > 3)
for c in num_cols:
    if (train[c] > 0).all() and train[c].skew() > 3.0:
        train[c] = np.log1p(train[c])
        test[c] = np.log1p(test[c])

# Impute with median
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c] = test[c].fillna(med)

Numeric: 160, Categorical: 4


In [9]:
# Cell 7: Smoothed Target Encoding (CV-Safe)
def smoothed_target_encode(train_df, y_train, test_df, col, alpha=50):
    """Bayesian smoothed mean encoding"""
    global_mean = y_train.mean()
    
    stats = pd.DataFrame({
        'key': train_df[col].astype(str),
        'target': y_train
    }).groupby('key').agg(
        sum_y=('target', 'sum'),
        count=('target', 'count')
    )
    
    stats['encoded'] = (stats['sum_y'] + alpha * global_mean) / (stats['count'] + alpha)
    encoded = test_df[col].astype(str).map(stats['encoded']).fillna(global_mean)
    
    return encoded

def add_target_encodings(X_train, y_train, X_target):
    """Add all target encodings"""
    X_out = X_target.copy()
    
    if 'country' in X_train.columns:
        X_out['te_country'] = smoothed_target_encode(X_train, y_train, X_out, 'country', alpha=100)
    
    if 'role' in X_train.columns:
        X_out['te_role'] = smoothed_target_encode(X_train, y_train, X_out, 'role', alpha=50)
    
    if 'state' in X_train.columns:
        X_out['te_state'] = smoothed_target_encode(X_train, y_train, X_out, 'state', alpha=30)
    
    if {'country', 'role'}.issubset(X_train.columns):
        X_train_temp = X_train.copy()
        X_out_temp = X_out.copy()
        X_train_temp['country_role'] = X_train['country'].astype(str) + '_' + X_train['role'].astype(str)
        X_out_temp['country_role'] = X_out['country'].astype(str) + '_' + X_out['role'].astype(str)
        X_out['te_country_role'] = smoothed_target_encode(X_train_temp, y_train, X_out_temp, 'country_role', alpha=200)
    
    return X_out

In [10]:
# Cell 8: Setup Cross-Validation
group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else 'country')
groups = train[group_key] if group_key else pd.Series(range(len(train)))

gkf = GroupKFold(n_splits=5)

def rmspe(y_true, y_pred, eps=1e-6):
    """Root Mean Square Percentage Error"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = (y_true > eps) & np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        return np.inf
    return np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2))

print(f"CV Strategy: GroupKFold on '{group_key}' with {gkf.get_n_splits()} folds")

CV Strategy: GroupKFold on 'city_id' with 5 folds


In [11]:
# Cell 9: LightGBM with LESS Conservative Hyperparameters
lgb_oof = np.zeros(len(train))
lgb_test = np.zeros(len(test))
y_oof_true = np.zeros(len(train))

lgb_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'learning_rate': 0.03,  # Slightly higher
    'num_leaves': 31,  # More complex
    'min_data_in_leaf': 80,  # Less regularization
    'feature_fraction': 0.8,  # Use more features
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.5,  # Much less L1
    'lambda_l2': 1.0,  # Much less L2
    'max_depth': -1,  # Unlimited
    'min_gain_to_split': 0.001,  # Allow more splits
    'verbosity': -1,
    'seed': 42
}

print("\n=== LightGBM Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    cat_features = [c for c in X_tr_enc.columns if not pd.api.types.is_numeric_dtype(X_tr_enc[c])]
    for c in cat_features:
        X_tr_enc[c] = X_tr_enc[c].astype('category')
        X_va_enc[c] = X_va_enc[c].astype('category')
        X_te_enc[c] = X_te_enc[c].astype('category')
    
    dtrain = lgb.Dataset(X_tr_enc, label=np.log1p(y_tr), categorical_feature=cat_features)
    dvalid = lgb.Dataset(X_va_enc, label=np.log1p(y_va), categorical_feature=cat_features)
    
    model = lgb.train(
        lgb_params,
        dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=300, verbose=False),  # Less patience
            lgb.log_evaluation(period=0)
        ]
    )
    
    va_pred = np.expm1(model.predict(X_va_enc, num_iteration=model.best_iteration))
    te_pred = np.expm1(model.predict(X_te_enc, num_iteration=model.best_iteration))
    
    lgb_oof[va_idx] = va_pred
    lgb_test += te_pred / gkf.n_splits
    y_oof_true[va_idx] = y_va
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f} | Best Iteration = {model.best_iteration}")

lgb_cv_score = rmspe(y_oof_true, lgb_oof)
print(f"\nLGBM OOF RMSPE: {lgb_cv_score:.4f}")


=== LightGBM Training ===
Fold 1: RMSPE = 0.2906 | Best Iteration = 302
Fold 2: RMSPE = 0.9011 | Best Iteration = 1690
Fold 3: RMSPE = 0.7019 | Best Iteration = 496
Fold 4: RMSPE = 0.6947 | Best Iteration = 1341
Fold 5: RMSPE = 0.6669 | Best Iteration = 352

LGBM OOF RMSPE: 0.6502


In [12]:
# Cell 10: CatBoost with Less Conservative Settings
cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

print("\n=== CatBoost Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    cat_features = []
    for i, c in enumerate(X_tr_enc.columns):
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            cat_features.append(i)
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype(str).fillna('Unknown')
    
    train_pool = Pool(X_tr_enc, np.log1p(y_tr), cat_features=cat_features)
    valid_pool = Pool(X_va_enc, np.log1p(y_va), cat_features=cat_features)
    test_pool = Pool(X_te_enc, cat_features=cat_features)
    
    model = CatBoostRegressor(
        loss_function='RMSE',
        learning_rate=0.03,  # Slightly higher
        depth=6,  # More depth
        l2_leaf_reg=3.0,  # Less regularization
        iterations=15000,
        early_stopping_rounds=500,  # Less patience
        random_seed=42,
        verbose=False,
        bootstrap_type='Bayesian',  # Try Bayesian bootstrap
        bagging_temperature=0.5  # Add some randomness
    )
    
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    
    va_pred = np.expm1(model.predict(valid_pool))
    te_pred = np.expm1(model.predict(test_pool))
    
    cb_oof[va_idx] = va_pred
    cb_test += te_pred / gkf.n_splits
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f} | Best Iteration = {model.get_best_iteration()}")

cb_cv_score = rmspe(y_oof_true, cb_oof)
print(f"\nCatBoost OOF RMSPE: {cb_cv_score:.4f}")


=== CatBoost Training ===
Fold 1: RMSPE = 0.3332 | Best Iteration = 558
Fold 2: RMSPE = 1.0070 | Best Iteration = 371
Fold 3: RMSPE = 0.7944 | Best Iteration = 317
Fold 4: RMSPE = 1.0165 | Best Iteration = 386
Fold 5: RMSPE = 0.6828 | Best Iteration = 382

CatBoost OOF RMSPE: 0.7701


In [13]:
# Cell 11: XGBoost with Less Regularization
xgb_oof = np.zeros(len(train))
xgb_test = np.zeros(len(test))

print("\n=== XGBoost Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    for c in X_tr_enc.columns:
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype('category').cat.codes
    
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.03,  # Slightly higher
        max_depth=6,  # More depth
        subsample=0.8,  # Use more data
        colsample_bytree=0.8,  # Use more features
        reg_alpha=0.5,  # Much less L1
        reg_lambda=1.0,  # Much less L2
        n_estimators=15000,
        early_stopping_rounds=500,  # Less patience
        random_state=42,
        tree_method='hist',
        gamma=0.01  # Allow more splits
    )
    
    model.fit(
        X_tr_enc, np.log1p(y_tr),
        eval_set=[(X_va_enc, np.log1p(y_va))],
        verbose=False
    )
    
    va_pred = np.expm1(model.predict(X_va_enc))
    te_pred = np.expm1(model.predict(X_te_enc))
    
    xgb_oof[va_idx] = va_pred
    xgb_test += te_pred / gkf.n_splits
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f}")

xgb_cv_score = rmspe(y_oof_true, xgb_oof)
print(f"\nXGBoost OOF RMSPE: {xgb_cv_score:.4f}")


=== XGBoost Training ===
Fold 1: RMSPE = 0.3518
Fold 2: RMSPE = 1.0184
Fold 3: RMSPE = 0.7420
Fold 4: RMSPE = 1.1227
Fold 5: RMSPE = 0.7584

XGBoost OOF RMSPE: 0.8039


In [15]:
# Cell 12: Smart Weighted Ensemble Based on OOF Performance
print("\n=== Ensemble Strategy ===")

# Calculate weights inversely proportional to OOF RMSPE
scores = np.array([lgb_cv_score, cb_cv_score, xgb_cv_score])
print(f"Individual OOF Scores:")
print(f"  LGBM:     {lgb_cv_score:.4f}")
print(f"  CatBoost: {cb_cv_score:.4f}")
print(f"  XGBoost:  {xgb_cv_score:.4f}")

# Inverse weighting (lower error = higher weight)
inv_scores = 1.0 / scores
weights = inv_scores / inv_scores.sum()
print(f"\nCalculated Weights:")
print(f"  LGBM:     {weights[0]:.4f}")
print(f"  CatBoost: {weights[1]:.4f}")
print(f"  XGBoost:  {weights[2]:.4f}")

# Weighted blend
weighted_oof = (lgb_oof * weights[0] + 
                cb_oof * weights[1] + 
                xgb_oof * weights[2])
weighted_test = (lgb_test * weights[0] + 
                 cb_test * weights[1] + 
                 xgb_test * weights[2])

weighted_score = rmspe(y_oof_true, weighted_oof)
print(f"\nWeighted Blend OOF RMSPE: {weighted_score:.4f}")

# Also try simple average
simple_blend_oof = (lgb_oof + cb_oof + xgb_oof) / 3
simple_blend_test = (lgb_test + cb_test + xgb_test) / 3
simple_score = rmspe(y_oof_true, simple_blend_oof)
print(f"Simple Average OOF RMSPE:  {simple_score:.4f}")

# Try Ridge stacking
oof_meta = np.column_stack([
    np.log1p(np.clip(lgb_oof, 0, None)),
    np.log1p(np.clip(cb_oof, 0, None)),
    np.log1p(np.clip(xgb_oof, 0, None))
])

test_meta = np.column_stack([
    np.log1p(np.clip(lgb_test, 0, None)),
    np.log1p(np.clip(cb_test, 0, None)),
    np.log1p(np.clip(xgb_test, 0, None))
])

scaler = RobustScaler()
oof_meta_scaled = scaler.fit_transform(oof_meta)
test_meta_scaled = scaler.transform(test_meta)

meta_model = Ridge(alpha=0.5, random_state=42)  # Less regularization
meta_model.fit(oof_meta_scaled, np.log1p(y_oof_true))

stacked_oof = np.expm1(meta_model.predict(oof_meta_scaled))
stacked_test = np.expm1(meta_model.predict(test_meta_scaled))
stacked_score = rmspe(y_oof_true, stacked_oof)
print(f"Ridge Stack OOF RMSPE:     {stacked_score:.4f}")

# Choose best approach
all_scores = {
    'LGBM': (lgb_cv_score, lgb_test),
    'CatBoost': (cb_cv_score, cb_test),
    'XGBoost': (xgb_cv_score, xgb_test),
    'Weighted': (weighted_score, weighted_test),
    'Simple': (simple_score, simple_blend_test),
    'Stacked': (stacked_score, stacked_test)
}

best_method = min(all_scores.items(), key=lambda x: x[1][0])
final_pred = best_method[1][1]
final_score = best_method[1][0]

print(f"\n{'='*50}")
print(f">>> SELECTED: {best_method[0]} (OOF RMSPE: {final_score:.4f}) <<<")
print(f"{'='*50}")


=== Ensemble Strategy ===
Individual OOF Scores:
  LGBM:     0.6502
  CatBoost: 0.7701
  XGBoost:  0.8039

Calculated Weights:
  LGBM:     0.3769
  CatBoost: 0.3182
  XGBoost:  0.3049

Weighted Blend OOF RMSPE: 0.7294
Simple Average OOF RMSPE:  0.7357
Ridge Stack OOF RMSPE:     0.4438

>>> SELECTED: Stacked (OOF RMSPE: 0.4438) <<<


In [16]:
# Cell 13: Create Submission
id_col = next((c for c in ['ID', 'id', 'Id'] if c in test_raw.columns), None)

if id_col:
    test_ids = test_raw[id_col].values
else:
    test_ids = np.arange(1, len(test_raw) + 1)

submission = pd.DataFrame({
    'ID': test_ids,
    'salary_average': final_pred
})

# Validate
print(f"\nSubmission Validation:")
print(f"  Shape: {submission.shape}")
print(f"  Expected rows: {len(test_raw)}")
print(f"  Columns: {list(submission.columns)}")
print(f"  Min prediction: ${submission['salary_average'].min():,.2f}")
print(f"  Max prediction: ${submission['salary_average'].max():,.2f}")
print(f"  Mean prediction: ${submission['salary_average'].mean():,.2f}")
print(f"  Median prediction: ${submission['salary_average'].median():,.2f}")

assert submission.shape[0] == len(test_raw), "Row count mismatch!"
assert list(submission.columns) == ['ID', 'salary_average'], "Column mismatch!"
assert submission['salary_average'].notna().all(), "NaN predictions found!"
assert (submission['salary_average'] > 0).all(), "Non-positive predictions found!"

# Save
submission.to_csv('submission.csv', index=False)
print("\n✓ Submission saved: submission.csv")
print("\nFirst 10 rows:")
print(submission.head(10))


Submission Validation:
  Shape: (2790, 2)
  Expected rows: 2790
  Columns: ['ID', 'salary_average']
  Min prediction: $4,274.73
  Max prediction: $121,069.87
  Mean prediction: $53,029.34
  Median prediction: $49,504.23

✓ Submission saved: submission.csv

First 10 rows:
   ID  salary_average
0   1    82789.993766
1   2    85912.984566
2   3    91776.589220
3   4    58337.471630
4   5    50452.390180
5   6    61018.806824
6   7    50738.116998
7   8    59353.407355
8   9    86528.310295
9  10    89281.654768


In [17]:
# Cell 14: Cleanup and Summary
gc.collect()

print("\n" + "="*60)
print("PIPELINE COMPLETE - SUMMARY")
print("="*60)
print(f"Final Method:     {best_method[0]}")
print(f"OOF RMSPE:        {final_score:.4f}")
print(f"Features Used:    {len(feat_cols)}")
print(f"Training Samples: {len(train)}")
print(f"Test Samples:     {len(test)}")
print("="*60)


PIPELINE COMPLETE - SUMMARY
Final Method:     Stacked
OOF RMSPE:        0.4438
Features Used:    164
Training Samples: 6480
Test Samples:     2790
