In [1]:
# Cell 1: Setup
import os, gc, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.linear_model import Ridge
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

warnings.filterwarnings("ignore")
np.random.seed(42)

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:10]:
        print(os.path.join(dirname, filename))

/kaggle/input/eee-g513/train.csv
/kaggle/input/eee-g513/test.csv
/kaggle/input/eee-g513/cost_of_living.csv


In [2]:
# Cell 2: Load
DATA_PATH = "/kaggle/input/eee-g513"
train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw = pd.read_csv(f"{DATA_PATH}/test.csv")
colv = pd.read_csv(f"{DATA_PATH}/cost_of_living.csv")
print(f"Train: {train_raw.shape}, Test: {test_raw.shape}, COL: {colv.shape}")

Train: (6525, 6), Test: (2790, 5), COL: (1528, 56)


In [4]:
#Cell 3: Smart COL Merge
keys = ['city_id'] if 'city_id' in train_raw.columns else ['city', 'country']
colv_agg = colv.groupby(keys, as_index=False).median(numeric_only=True)
train = train_raw.merge(colv_agg, on=keys, how='left')
test = test_raw.merge(colv_agg, on=keys, how='left')
train = train.loc[:, ~train.columns.duplicated()]
test = test.loc[:, ~test.columns.duplicated()]

In [6]:
# Cell 4: Clean Target
TARGET = "salary_average"
train = train[train[TARGET].notna() & (train[TARGET] > 0)].reset_index(drop=True)
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]
print(f"Features: {len(feat_cols)}")

Features: 58


In [7]:
# Cell 5: Advanced Feature Engineering
def engineer_features(df):
    df = df.copy()
    cols_lower = {c.lower(): c for c in df.columns}
    
    def find(keywords):
        for kw in keywords:
            for lower, actual in cols_lower.items():
                if kw in lower:
                    return actual
        return None
    
    ppp = find(['purchasing', 'power'])
    rent = find(['rent'])
    groc = find(['grocer'])
    rest = find(['restaurant'])
    trans = find(['transport'])
    
    # Affordability metrics (KEY FEATURE)
    if ppp and pd.api.types.is_numeric_dtype(df[ppp]):
        df['ppp_log'] = np.log1p(df[ppp])
        if rent and pd.api.types.is_numeric_dtype(df[rent]):
            df['afford_rent'] = df[rent] / (df[ppp] + 1)
            df['rent_ppp_ratio'] = np.log1p(df[rent]) - np.log1p(df[ppp])
        if groc and pd.api.types.is_numeric_dtype(df[groc]):
            df['afford_food'] = df[groc] / (df[ppp] + 1)
        if trans and pd.api.types.is_numeric_dtype(df[trans]):
            df['afford_trans'] = df[trans] / (df[ppp] + 1)
    
    # Cost burden ratios
    if rent and groc:
        if pd.api.types.is_numeric_dtype(df[rent]) and pd.api.types.is_numeric_dtype(df[groc]):
            df['housing_food_ratio'] = df[rent] / (df[groc] + 1)
    
    # Composite COL (weighted by importance)
    cost_cols = []
    weights = []
    if rent and pd.api.types.is_numeric_dtype(df[rent]):
        cost_cols.append(rent)
        weights.append(0.4)  # Housing is biggest expense
    if groc and pd.api.types.is_numeric_dtype(df[groc]):
        cost_cols.append(groc)
        weights.append(0.25)
    if trans and pd.api.types.is_numeric_dtype(df[trans]):
        cost_cols.append(trans)
        weights.append(0.2)
    if rest and pd.api.types.is_numeric_dtype(df[rest]):
        cost_cols.append(rest)
        weights.append(0.15)
    
    if cost_cols:
        weights = np.array(weights) / sum(weights)
        df['weighted_col'] = sum(df[c] * w for c, w in zip(cost_cols, weights))
    
    return df

train = engineer_features(train)
test = engineer_features(test)
feat_cols = [c for c in train.columns if c in test.columns and c != TARGET]

In [8]:
# Cell 6: Preprocessing
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c], errors='coerce')
        if tr_num.notna().mean() > 0.9:
            train[c] = tr_num
            test[c] = te_num

num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

# Winsorize outliers
for c in num_cols:
    q_low, q_high = train[c].quantile([0.005, 0.995])
    train[c] = train[c].clip(q_low, q_high)
    test[c] = test[c].clip(q_low, q_high)

# Log-transform skewed features
for c in num_cols:
    if (train[c] > 0).all() and train[c].skew() > 2.0:
        train[c] = np.log1p(train[c])
        test[c] = np.log1p(test[c])

# Impute
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c] = test[c].fillna(med)

print(f"Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

Numeric: 54, Categorical: 4


In [9]:
# Cell 7: Robust Target Encoding
def target_encode_cv_safe(train_df, y_train, test_df, col, alpha=150):
    """Bayesian smoothed encoding - FIXED for proper test application"""
    global_mean = y_train.mean()
    
    # Build encoding map from training data
    stats = pd.DataFrame({
        'key': train_df[col].astype(str),
        'target': y_train
    }).groupby('key').agg(
        sum_y=('target', 'sum'),
        count=('target', 'count')
    )
    
    stats['encoded'] = (stats['sum_y'] + alpha * global_mean) / (stats['count'] + alpha)
    
    # Apply to test
    encoded = test_df[col].astype(str).map(stats['encoded']).fillna(global_mean)
    return encoded.values

def add_encodings(X_train, y_train, X_target):
    """Add all target encodings"""
    X_out = X_target.copy()
    
    if 'country' in X_train.columns:
        X_out['te_country'] = target_encode_cv_safe(X_train, y_train, X_out, 'country', alpha=150)
    
    if 'role' in X_train.columns:
        X_out['te_role'] = target_encode_cv_safe(X_train, y_train, X_out, 'role', alpha=100)
    
    if 'state' in X_train.columns:
        X_out['te_state'] = target_encode_cv_safe(X_train, y_train, X_out, 'state', alpha=80)
    
    if 'city' in X_train.columns:
        X_out['te_city'] = target_encode_cv_safe(X_train, y_train, X_out, 'city', alpha=50)
    
    # Interaction encodings
    if {'country', 'role'}.issubset(X_train.columns):
        X_train_temp = X_train.copy()
        X_out_temp = X_out.copy()
        X_train_temp['cr'] = X_train['country'].astype(str) + '|' + X_train['role'].astype(str)
        X_out_temp['cr'] = X_out['country'].astype(str) + '|' + X_out['role'].astype(str)
        X_out['te_country_role'] = target_encode_cv_safe(X_train_temp, y_train, X_out_temp, 'cr', alpha=200)
    
    return X_out


In [10]:
# Cell 8: CV Setup
group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else 'country')
groups = train[group_key]
gkf = GroupKFold(n_splits=5)

def rmspe(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = (y_true > eps) & np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        return np.inf
    return np.sqrt(np.mean(((y_true[mask] - y_pred[mask]) / y_true[mask]) ** 2))

print(f"GroupKFold on '{group_key}' - {gkf.n_splits} folds")


GroupKFold on 'city_id' - 5 folds


In [11]:
# Cell 9: LightGBM (Tuned)
lgb_oof = np.zeros(len(train))
lgb_test_preds = []  # Store each fold's test prediction
y_oof_true = np.zeros(len(train))

lgb_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'learning_rate': 0.018,  # Even slower
    'num_leaves': 20,  # More conservative
    'min_data_in_leaf': 180,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'lambda_l1': 3.0,
    'lambda_l2': 7.0,
    'max_depth': 5,
    'min_gain_to_split': 0.02,
    'verbosity': -1,
    'seed': 42
}

print("\n=== LightGBM ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    # Encode within fold
    X_tr_enc = add_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    cat_features = [c for c in X_tr_enc.columns if not pd.api.types.is_numeric_dtype(X_tr_enc[c])]
    for c in cat_features:
        X_tr_enc[c] = X_tr_enc[c].astype('category')
        X_va_enc[c] = X_va_enc[c].astype('category')
        X_te_enc[c] = X_te_enc[c].astype('category')
    
    dtrain = lgb.Dataset(X_tr_enc, label=np.log1p(y_tr), categorical_feature=cat_features)
    dvalid = lgb.Dataset(X_va_enc, label=np.log1p(y_va), categorical_feature=cat_features)
    
    model = lgb.train(
        lgb_params,
        dtrain,
        valid_sets=[dvalid],
        num_boost_round=12000,
        callbacks=[lgb.early_stopping(600, verbose=False), lgb.log_evaluation(0)]
    )
    
    va_pred = np.expm1(model.predict(X_va_enc, num_iteration=model.best_iteration))
    te_pred = np.expm1(model.predict(X_te_enc, num_iteration=model.best_iteration))
    
    lgb_oof[va_idx] = va_pred
    lgb_test_preds.append(te_pred)
    y_oof_true[va_idx] = y_va
    
    print(f"Fold {fold}: {rmspe(y_va, va_pred):.4f} | Iter={model.best_iteration}")

lgb_test = np.mean(lgb_test_preds, axis=0)
print(f"LGBM OOF: {rmspe(y_oof_true, lgb_oof):.4f}")


=== LightGBM ===
Fold 1: 0.4251 | Iter=343
Fold 2: 1.3787 | Iter=776
Fold 3: 0.5674 | Iter=1020
Fold 4: 0.7425 | Iter=452
Fold 5: 0.5789 | Iter=651
LGBM OOF: 0.7795


In [12]:
# Cell 10: CatBoost (Tuned)
cb_oof = np.zeros(len(train))
cb_test_preds = []

print("\n=== CatBoost ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    cat_features = []
    for i, c in enumerate(X_tr_enc.columns):
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            cat_features.append(i)
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype(str).fillna('Unknown')
    
    train_pool = Pool(X_tr_enc, np.log1p(y_tr), cat_features=cat_features)
    valid_pool = Pool(X_va_enc, np.log1p(y_va), cat_features=cat_features)
    test_pool = Pool(X_te_enc, cat_features=cat_features)
    
    model = CatBoostRegressor(
        loss_function='RMSE',
        learning_rate=0.018,
        depth=4,  # Even shallower
        l2_leaf_reg=12.0,
        iterations=18000,
        early_stopping_rounds=800,
        random_seed=42,
        verbose=False
    )
    
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
    
    va_pred = np.expm1(model.predict(valid_pool))
    te_pred = np.expm1(model.predict(test_pool))
    
    cb_oof[va_idx] = va_pred
    cb_test_preds.append(te_pred)
    
    print(f"Fold {fold}: {rmspe(y_va, va_pred):.4f} | Iter={model.get_best_iteration()}")

cb_test = np.mean(cb_test_preds, axis=0)
print(f"CatBoost OOF: {rmspe(y_oof_true, cb_oof):.4f}")



=== CatBoost ===
Fold 1: 0.5468 | Iter=355
Fold 2: 1.4782 | Iter=746
Fold 3: 0.8695 | Iter=328
Fold 4: 0.8787 | Iter=402
Fold 5: 0.7210 | Iter=442
CatBoost OOF: 0.9177


In [13]:
# Cell 11: XGBoost (Tuned)
xgb_oof = np.zeros(len(train))
xgb_test_preds = []

print("\n=== XGBoost ===")
for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].values
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].values
    
    X_tr_enc = add_encodings(X_tr, y_tr, X_tr)
    X_va_enc = add_encodings(X_tr, y_tr, X_va)
    X_te_enc = add_encodings(X_tr, y_tr, test[feat_cols].copy())
    
    for c in X_tr_enc.columns:
        if not pd.api.types.is_numeric_dtype(X_tr_enc[c]):
            for df in [X_tr_enc, X_va_enc, X_te_enc]:
                df[c] = df[c].astype('category').cat.codes
    
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        learning_rate=0.018,
        max_depth=4,
        subsample=0.65,
        colsample_bytree=0.65,
        reg_alpha=3.0,
        reg_lambda=7.0,
        n_estimators=18000,
        early_stopping_rounds=800,
        random_state=42,
        tree_method='hist'
    )
    
    model.fit(
        X_tr_enc, np.log1p(y_tr),
        eval_set=[(X_va_enc, np.log1p(y_va))],
        verbose=False
    )
    
    va_pred = np.expm1(model.predict(X_va_enc))
    te_pred = np.expm1(model.predict(X_te_enc))
    
    xgb_oof[va_idx] = va_pred
    xgb_test_preds.append(te_pred)
    
    print(f"Fold {fold}: {rmspe(y_va, va_pred):.4f}")

xgb_test = np.mean(xgb_test_preds, axis=0)
print(f"XGBoost OOF: {rmspe(y_oof_true, xgb_oof):.4f}")


=== XGBoost ===
Fold 1: 0.5312
Fold 2: 1.4630
Fold 3: 0.7749
Fold 4: 0.8914
Fold 5: 0.7478
XGBoost OOF: 0.9014


In [14]:
# Cell 12: Optimized Stacking with Quantile Transform
print("\n=== Ensemble ===")

# Use quantile transform for better meta-feature distribution
qt = QuantileTransformer(output_distribution='normal', random_state=42)

oof_meta = np.column_stack([
    lgb_oof,
    cb_oof,
    xgb_oof,
    (lgb_oof + cb_oof) / 2,  # Add pairwise blends as features
    (lgb_oof + xgb_oof) / 2,
    (cb_oof + xgb_oof) / 2
])

test_meta = np.column_stack([
    lgb_test,
    cb_test,
    xgb_test,
    (lgb_test + cb_test) / 2,
    (lgb_test + xgb_test) / 2,
    (cb_test + xgb_test) / 2
])

# Transform to normal distribution
oof_meta_qt = qt.fit_transform(oof_meta)
test_meta_qt = qt.transform(test_meta)

# Train meta-model with higher regularization
meta = Ridge(alpha=5.0, random_state=42)
meta.fit(oof_meta_qt, y_oof_true)

stacked_oof = meta.predict(oof_meta_qt)
stacked_test = meta.predict(test_meta_qt)

# Compare approaches
print(f"\nLGBM:         {rmspe(y_oof_true, lgb_oof):.4f}")
print(f"CatBoost:     {rmspe(y_oof_true, cb_oof):.4f}")
print(f"XGBoost:      {rmspe(y_oof_true, xgb_oof):.4f}")

simple_blend = (lgb_oof + cb_oof + xgb_oof) / 3
print(f"Simple Blend: {rmspe(y_oof_true, simple_blend):.4f}")
print(f"Stacked:      {rmspe(y_oof_true, stacked_oof):.4f}")

# Weighted blend optimization
best_score = float('inf')
best_weights = None
for w1 in np.linspace(0.2, 0.5, 7):
    for w2 in np.linspace(0.2, 0.5, 7):
        w3 = 1 - w1 - w2
        if w3 < 0.15 or w3 > 0.6:
            continue
        blend = w1*lgb_oof + w2*cb_oof + w3*xgb_oof
        score = rmspe(y_oof_true, blend)
        if score < best_score:
            best_score = score
            best_weights = (w1, w2, w3)

opt_blend_oof = best_weights[0]*lgb_oof + best_weights[1]*cb_oof + best_weights[2]*xgb_oof
opt_blend_test = best_weights[0]*lgb_test + best_weights[1]*cb_test + best_weights[2]*xgb_test
print(f"Opt Blend:    {rmspe(y_oof_true, opt_blend_oof):.4f} | Weights: {best_weights}")

# Select best
scores = {
    'stacked': rmspe(y_oof_true, stacked_oof),
    'opt_blend': rmspe(y_oof_true, opt_blend_oof),
    'simple': rmspe(y_oof_true, simple_blend)
}
best_method = min(scores, key=scores.get)
final_pred = {
    'stacked': stacked_test,
    'opt_blend': opt_blend_test,
    'simple': (lgb_test + cb_test + xgb_test) / 3
}[best_method]

print(f"\n>>> Using: {best_method.upper()} | OOF={scores[best_method]:.4f} <<<")


=== Ensemble ===

LGBM:         0.7795
CatBoost:     0.9177
XGBoost:      0.9014
Simple Blend: 0.8607
Stacked:      0.7162
Opt Blend:    0.8382 | Weights: (0.5, 0.2, 0.3)

>>> Using: STACKED | OOF=0.7162 <<<


In [15]:
# Cell 13: Submission
id_col = next((c for c in ['ID', 'id', 'Id'] if c in test_raw.columns), None)
test_ids = test_raw[id_col].values if id_col else np.arange(1, len(test_raw) + 1)

submission = pd.DataFrame({
    'ID': test_ids,
    'salary_average': final_pred
})

assert submission.shape[0] == len(test_raw)
assert list(submission.columns) == ['ID', 'salary_average']

submission.to_csv('submission.csv', index=False)
print(f"\n✓ Saved: {submission.shape}")
print(submission.head(10))

gc.collect()


✓ Saved: (2790, 2)
   ID  salary_average
0   1    83045.153377
1   2    89293.726228
2   3    93244.745040
3   4    64899.165047
4   5    53847.348584
5   6    70091.773746
6   7    53313.496824
7   8    64782.170392
8   9    85264.897535
9  10    83715.442440


33