In [1]:
# Cell 1: Imports and Setup
import os, gc, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

warnings.filterwarnings("ignore")
np.random.seed(42)

# List input files
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:10]:
        print(os.path.join(dirname, filename))

/kaggle/input/eee-g513/train.csv
/kaggle/input/eee-g513/test.csv
/kaggle/input/eee-g513/cost_of_living.csv


In [2]:
# Cell 2: Load Data with Proper Handling
DATA_PATH = "/kaggle/input/eee-g513"

train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw = pd.read_csv(f"{DATA_PATH}/test.csv")
colv_path = f"{DATA_PATH}/cost_of_living.csv"
colv = pd.read_csv(colv_path) if os.path.exists(colv_path) else None

print(f"Train: {train_raw.shape}, Test: {test_raw.shape}, COL: {colv.shape if colv is not None else None}")
print(f"\nTarget stats:\n{train_raw['salary_average'].describe()}")
print(f"\nMissing in train:\n{train_raw.isnull().sum()[train_raw.isnull().sum() > 0]}")


Train: (6525, 6), Test: (2790, 5), COL: (1528, 56)

Target stats:
count      6480.000000
mean      53169.866584
std       27592.877914
min        3836.556547
25%       28697.390996
50%       54673.064063
75%       72742.000000
max      157747.634644
Name: salary_average, dtype: float64

Missing in train:
salary_average    45
dtype: int64


In [3]:
# Cell 3: Smart COL Merge - Use Median Aggregation Per City
if colv is not None:
    # Identify merge keys
    if 'city_id' in train_raw.columns and 'city_id' in colv.columns:
        keys = ['city_id']
    else:
        keys = ['city', 'country']
    
    # Aggregate COL with median (more robust than mean)
    colv_agg = colv.groupby(keys, as_index=False).median(numeric_only=True)
    
    train = train_raw.merge(colv_agg, on=keys, how='left', suffixes=('', '_drop'))
    test = test_raw.merge(colv_agg, on=keys, how='left', suffixes=('', '_drop'))
    
    # Drop duplicate columns
    train = train[[c for c in train.columns if not c.endswith('_drop')]]
    test = test[[c for c in test.columns if not c.endswith('_drop')]]
else:
    train, test = train_raw.copy(), test_raw.copy()

print(f"After merge - Train: {train.shape}, Test: {test.shape}")

After merge - Train: (6525, 59), Test: (2790, 58)


In [4]:
# Cell 4: Target Cleaning and Feature Selection
TARGET = "salary_average"

# Remove invalid targets
before = len(train)
train = train[train[TARGET].notna() & (train[TARGET] > 0)].reset_index(drop=True)
print(f"Dropped {before - len(train)} invalid targets")

# Identify common features (exclude target)
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]
print(f"Features: {len(feat_cols)}")

Dropped 45 invalid targets
Features: 58


In [5]:
# Cell 5: Advanced Feature Engineering
def create_interaction_features(df):
    """Create domain-informed interaction features"""
    df = df.copy()
    
    # Find COL columns by partial name matching (case-insensitive)
    cols_lower = {c.lower(): c for c in df.columns}
    
    def find_col(keywords):
        for kw in keywords:
            for lower, actual in cols_lower.items():
                if kw in lower:
                    return actual
        return None
    
    # Key COL indicators
    ppp = find_col(['purchasing', 'power'])
    rent = find_col(['rent'])
    groc = find_col(['grocer'])
    rest = find_col(['restaurant'])
    trans = find_col(['transport'])
    local = find_col(['local_purchasing'])
    
    # Purchasing power ratios (critical for salary normalization)
    if ppp is not None and pd.api.types.is_numeric_dtype(df[ppp]):
        if rent is not None and pd.api.types.is_numeric_dtype(df[rent]):
            df['affordability_rent'] = df[rent] / (df[ppp] + 1)
        if groc is not None and pd.api.types.is_numeric_dtype(df[groc]):
            df['affordability_food'] = df[groc] / (df[ppp] + 1)
        if trans is not None and pd.api.types.is_numeric_dtype(df[trans]):
            df['affordability_trans'] = df[trans] / (df[ppp] + 1)
    
    # Cost burden ratios
    if rent is not None and groc is not None:
        if pd.api.types.is_numeric_dtype(df[rent]) and pd.api.types.is_numeric_dtype(df[groc]):
            df['housing_to_food'] = df[rent] / (df[groc] + 1)
    
    # Composite cost of living index
    cost_cols = []
    for col_name in [rent, groc, rest, trans]:
        if col_name and pd.api.types.is_numeric_dtype(df[col_name]):
            cost_cols.append(col_name)
    
    if len(cost_cols) >= 2:
        df['composite_col_index'] = df[cost_cols].mean(axis=1)
    
    return df

train = create_interaction_features(train)
test = create_interaction_features(test)

# Update feature list
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]

In [6]:
# Cell 6: Type Detection and Numeric Preprocessing
# Convert numeric-like objects
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c], errors='coerce')
        if tr_num.notna().mean() > 0.9:  # Stricter threshold
            train[c] = tr_num
            test[c] = te_num

# Separate numeric and categorical
num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

# Robust outlier handling (winsorize at 0.5% and 99.5%)
for c in num_cols:
    q_low, q_high = train[c].quantile([0.005, 0.995])
    train[c] = train[c].clip(q_low, q_high)
    test[c] = test[c].clip(q_low, q_high)

# Log-transform heavily skewed features
for c in num_cols:
    if (train[c] > 0).all() and train[c].skew() > 2.0:  # Higher threshold
        train[c] = np.log1p(train[c])
        test[c] = np.log1p(test[c])

# Impute with median
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c] = test[c].fillna(med)

Numeric: 54, Categorical: 4


In [7]:
# Cell 7: Smoothed Target Encoding (CV-Safe)
def smoothed_target_encode(train_df, y_train, test_df, col, alpha=50):
    """
    Bayesian smoothed mean encoding with proper CV to prevent leakage
    """
    global_mean = y_train.mean()
    
    # Compute smoothed means on training data
    stats = pd.DataFrame({
        'key': train_df[col].astype(str),
        'target': y_train
    }).groupby('key').agg(
        sum_y=('target', 'sum'),
        count=('target', 'count')
    )
    
    # Smoothing: (sum_y + alpha * global_mean) / (count + alpha)
    stats['encoded'] = (stats['sum_y'] + alpha * global_mean) / (stats['count'] + alpha)
    
    # Map to test data
    encoded = test_df[col].astype(str).map(stats['encoded']).fillna(global_mean)
    
    return encoded

def add_target_encodings(X_train, y_train, X_target):
    """Add all target encodings"""
    X_out = X_target.copy()
    
    if 'country' in X_train.columns:
        X_out['te_country'] = smoothed_target_encode(X_train, y_train, X_out, 'country', alpha=100)
    
    if 'role' in X_train.columns:
        X_out['te_role'] = smoothed_target_encode(X_train, y_train, X_out, 'role', alpha=50)
    
    if 'state' in X_train.columns:
        X_out['te_state'] = smoothed_target_encode(X_train, y_train, X_out, 'state', alpha=30)
    
    # Interaction: country x role
    if {'country', 'role'}.issubset(X_train.columns):
        X_train_temp = X_train.copy()
        X_out_temp = X_out.copy()
        X_train_temp['country_role'] = X_train['country'].astype(str) + '_' + X_train['role'].astype(str)
        X_out_temp['country_role'] = X_out['country'].astype(str) + '_' + X_out['role'].astype(str)
        X_out['te_country_role'] = smoothed_target_encode(X_train_temp, y_train, X_out_temp, 'country_role', alpha=200)
    
    return X_out


In [8]:
# Cell 8: Setup Cross-Validation
group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else 'country')
groups = train[group_key] if group_key else pd.Series(range(len(train)))

gkf = GroupKFold(n_splits=5)

def rmspe(y_true, y_pred, eps=1e-6):
    """Root Mean Square Percentage Error"""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = (y_true > eps) & np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        return np.inf
    return np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2))

print(f"CV Strategy: GroupKFold on '{group_key}' with {gkf.get_n_splits()} folds")


CV Strategy: GroupKFold on 'city_id' with 5 folds


In [9]:
# Cell 9: LightGBM with Conservative Hyperparameters
lgb_oof = np.zeros(len(train))
lgb_test = np.zeros(len(test))
y_oof_true = np.zeros(len(train))

lgb_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'learning_rate': 0.02,  # Lower learning rate
    'num_leaves': 24,  # Reduced complexity
    'min_data_in_leaf': 150,  # More regularization
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 1,
    'lambda_l1': 2.0,  # Increased L1
    'lambda_l2': 5.0,  # Increased L2
    'max_depth': 6,  # Limit depth
    'min_gain_to_split': 0.01,
    'verbosity': -1,
    'seed': 42
}

print("\n=== LightGBM Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    # Add target encodings (CV-safe)
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    # Categorical handling
    cat_features = [c for c in X_tr_enc.columns if not pd.api.types.is_numeric_dtype(X_tr_enc[c])]
    for c in cat_features:
        X_tr_enc[c] = X_tr_enc[c].astype('category')
        X_va_enc[c] = X_va_enc[c].astype('category')
        X_te_enc[c] = X_te_enc[c].astype('category')
    
    # Train on log-scale target
    dtrain = lgb.Dataset(X_tr_enc, label=np.log1p(y_tr), categorical_feature=cat_features)
    dvalid = lgb.Dataset(X_va_enc, label=np.log1p(y_va), categorical_feature=cat_features)
    
    model = lgb.train(
        lgb_params,
        dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=500, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )
    
    # Predict
    va_pred = np.expm1(model.predict(X_va_enc, num_iteration=model.best_iteration))
    te_pred = np.expm1(model.predict(X_te_enc, num_iteration=model.best_iteration))
    
    lgb_oof[va_idx] = va_pred
    lgb_test += te_pred / gkf.n_splits
    y_oof_true[va_idx] = y_va
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f} | Best Iteration = {model.best_iteration}")

lgb_cv_score = rmspe(y_oof_true, lgb_oof)
print(f"\nLGBM OOF RMSPE: {lgb_cv_score:.4f}")


=== LightGBM Training ===
Fold 1: RMSPE = 0.2513 | Best Iteration = 368
Fold 2: RMSPE = 1.2001 | Best Iteration = 880
Fold 3: RMSPE = 0.5416 | Best Iteration = 669
Fold 4: RMSPE = 0.6738 | Best Iteration = 599
Fold 5: RMSPE = 0.5367 | Best Iteration = 594

LGBM OOF RMSPE: 0.6787


In [10]:
# Cell 10: CatBoost with Conservative Settings
cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

print("\n=== CatBoost Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    # Add target encodings
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    # Ensure categorical columns are strings
    cat_features = []
    for i, c in enumerate(X_tr_enc.columns):
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            cat_features.append(i)
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype(str).fillna('Unknown')
    
    # Create pools
    train_pool = Pool(X_tr_enc, np.log1p(y_tr), cat_features=cat_features)
    valid_pool = Pool(X_va_enc, np.log1p(y_va), cat_features=cat_features)
    test_pool = Pool(X_te_enc, cat_features=cat_features)
    
    model = CatBoostRegressor(
        loss_function='RMSE',
        learning_rate=0.02,
        depth=5,  # Reduced depth
        l2_leaf_reg=10.0,  # More regularization
        iterations=15000,
        early_stopping_rounds=700,
        random_seed=42,
        verbose=False
    )
    
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    
    va_pred = np.expm1(model.predict(valid_pool))
    te_pred = np.expm1(model.predict(test_pool))
    
    cb_oof[va_idx] = va_pred
    cb_test += te_pred / gkf.n_splits
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f} | Best Iteration = {model.get_best_iteration()}")

cb_cv_score = rmspe(y_oof_true, cb_oof)
print(f"\nCatBoost OOF RMSPE: {cb_cv_score:.4f}")


=== CatBoost Training ===
Fold 1: RMSPE = 0.3212 | Best Iteration = 1425
Fold 2: RMSPE = 1.2703 | Best Iteration = 786
Fold 3: RMSPE = 0.7586 | Best Iteration = 435
Fold 4: RMSPE = 0.8701 | Best Iteration = 615
Fold 5: RMSPE = 0.6922 | Best Iteration = 487

CatBoost OOF RMSPE: 0.8009


In [11]:
# Cell 11: XGBoost for Diversity
xgb_oof = np.zeros(len(train))
xgb_test = np.zeros(len(test))

print("\n=== XGBoost Training ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_target_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_target_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_target_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    # Encode categoricals as integers
    for c in X_tr_enc.columns:
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype('category').cat.codes
    
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.02,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=2.0,
        reg_lambda=5.0,
        n_estimators=15000,
        early_stopping_rounds=700,
        random_state=42,
        tree_method='hist'
    )
    
    model.fit(
        X_tr_enc, np.log1p(y_tr),
        eval_set=[(X_va_enc, np.log1p(y_va))],
        verbose=False
    )
    
    va_pred = np.expm1(model.predict(X_va_enc))
    te_pred = np.expm1(model.predict(X_te_enc))
    
    xgb_oof[va_idx] = va_pred
    xgb_test += te_pred / gkf.n_splits
    
    fold_score = rmspe(y_va, va_pred)
    print(f"Fold {fold}: RMSPE = {fold_score:.4f}")

xgb_cv_score = rmspe(y_oof_true, xgb_oof)
print(f"\nXGBoost OOF RMSPE: {xgb_cv_score:.4f}")


=== XGBoost Training ===
Fold 1: RMSPE = 0.3281
Fold 2: RMSPE = 1.3218
Fold 3: RMSPE = 0.6852
Fold 4: RMSPE = 0.7783
Fold 5: RMSPE = 0.6670

XGBoost OOF RMSPE: 0.7840


In [12]:
# Cell 12: Ensemble with Ridge Stacking
print("\n=== Ensemble Stacking ===")

# Create meta-features (log-scale for stability)
oof_meta = np.column_stack([
    np.log1p(np.clip(lgb_oof, 0, None)),
    np.log1p(np.clip(cb_oof, 0, None)),
    np.log1p(np.clip(xgb_oof, 0, None))
])

test_meta = np.column_stack([
    np.log1p(np.clip(lgb_test, 0, None)),
    np.log1p(np.clip(cb_test, 0, None)),
    np.log1p(np.clip(xgb_test, 0, None))
])

# Robust scaling
scaler = RobustScaler()
oof_meta_scaled = scaler.fit_transform(oof_meta)
test_meta_scaled = scaler.transform(test_meta)

# Train meta-model
meta_model = Ridge(alpha=1.0, random_state=42)
meta_model.fit(oof_meta_scaled, np.log1p(y_oof_true))

# Final predictions
stacked_oof = np.expm1(meta_model.predict(oof_meta_scaled))
stacked_test = np.expm1(meta_model.predict(test_meta_scaled))

# Compare all approaches
print("\n=== Cross-Validation Results ===")
print(f"LGBM OOF:        {lgb_cv_score:.4f}")
print(f"CatBoost OOF:    {cb_cv_score:.4f}")
print(f"XGBoost OOF:     {xgb_cv_score:.4f}")

simple_blend = (lgb_oof + cb_oof + xgb_oof) / 3
blend_score = rmspe(y_oof_true, simple_blend)
print(f"Simple Blend:    {blend_score:.4f}")

stacked_score = rmspe(y_oof_true, stacked_oof)
print(f"Stacked Ridge:   {stacked_score:.4f}")

# Choose best approach
if stacked_score < min(lgb_cv_score, cb_cv_score, xgb_cv_score, blend_score):
    final_pred = stacked_test
    final_method = "Stacked Ridge"
    final_score = stacked_score
elif blend_score < min(lgb_cv_score, cb_cv_score, xgb_cv_score):
    final_pred = (lgb_test + cb_test + xgb_test) / 3
    final_method = "Simple Blend"
    final_score = blend_score
else:
    # Use best single model
    best_idx = np.argmin([lgb_cv_score, cb_cv_score, xgb_cv_score])
    final_pred = [lgb_test, cb_test, xgb_test][best_idx]
    final_method = ["LGBM", "CatBoost", "XGBoost"][best_idx]
    final_score = [lgb_cv_score, cb_cv_score, xgb_cv_score][best_idx]

print(f"\n>>> Selected: {final_method} (OOF RMSPE: {final_score:.4f}) <<<")


=== Ensemble Stacking ===

=== Cross-Validation Results ===
LGBM OOF:        0.6787
CatBoost OOF:    0.8009
XGBoost OOF:     0.7840
Simple Blend:    0.7499
Stacked Ridge:   0.5322

>>> Selected: Stacked Ridge (OOF RMSPE: 0.5322) <<<


In [13]:
# Cell 13: Create Submission (NO TRIMMING - Use All Test Rows)
# Get actual test IDs
id_col = next((c for c in ['ID', 'id', 'Id'] if c in test_raw.columns), None)

if id_col:
    test_ids = test_raw[id_col].values
else:
    test_ids = np.arange(1, len(test_raw) + 1)

# Create submission with ALL test rows
submission = pd.DataFrame({
    'ID': test_ids,
    'salary_average': final_pred
})

# Validate
print(f"\nSubmission shape: {submission.shape}")
print(f"Expected rows: {len(test_raw)}")
print(f"Columns: {list(submission.columns)}")
assert submission.shape[0] == len(test_raw), "Row count mismatch!"
assert list(submission.columns) == ['ID', 'salary_average'], "Column mismatch!"

# Save
submission.to_csv('submission.csv', index=False)
print("\n✓ Submission saved: submission.csv")
print(submission.head(10))



Submission shape: (2790, 2)
Expected rows: 2790
Columns: ['ID', 'salary_average']

✓ Submission saved: submission.csv
   ID  salary_average
0   1    90484.162152
1   2    92672.971157
2   3    96548.937830
3   4    64562.723753
4   5    55493.144079
5   6    67424.772273
6   7    55538.510575
7   8    66803.790650
8   9    91852.603821
9  10    86568.258842


In [14]:
# Cell 14: Cleanup
gc.collect()
print("\n✓ Pipeline complete!")


✓ Pipeline complete!
