In [None]:
# Cell 1: Imports & Setup
# Include necessary libraries for data handling, modeling, and feature engineering.
# Added clustering and PCA for new features.
import os
import gc
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import xgboost as xgb

# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation notices from the modelling libraries.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG; the target-encoding noise below draws from np.random.
np.random.seed(42)

In [None]:
# Cell 2: Load Data
# Load from the main path. Use the provided test.csv with 2799 rows.
# Do not trim predictions; submit for all 2799 rows to match the file.
# Analysis: Prompt specifies 2799 samples, so use full test without trimming.
DATA_PATH = "/kaggle/input/eee-g513"
colv_path = f"{DATA_PATH}/cost_of_living.csv"

# Competition tables: always required.
train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw = pd.read_csv(f"{DATA_PATH}/test.csv")

# The cost-of-living table is optional; `colv` stays None when the file is absent.
if os.path.exists(colv_path):
    colv = pd.read_csv(colv_path)
else:
    colv = None

# Shapes of the three inputs (None for a missing cost-of-living table).
print(train_raw.shape, test_raw.shape, getattr(colv, 'shape', None))

In [None]:
# Cell 3: Merge Cost-of-Living Data
# Aggregate means for unique keys to handle any duplicates.
# Keys: Prefer city_id if available, else city+country.
# Analysis: COL has 1528 rows, potential multiples; aggregating ensures 1:1 merge.
# Decide which key columns (if any) can be used to join the cost-of-living table.
merge_keys = None
if colv is not None:
    # Prefer the identifier column; otherwise fall back to the (city, country) name pair.
    if 'city_id' in train_raw.columns and 'city_id' in colv.columns:
        keys = ['city_id']
    else:
        keys = ['city', 'country']

    # The merge needs every key in all three tables; without this check a missing
    # column surfaces as a KeyError from deep inside groupby/merge.
    if all(k in frame.columns for frame in (train_raw, test_raw, colv) for k in keys):
        merge_keys = keys
    else:
        print(f'Skipping cost-of-living merge: key columns {keys} are not present in every table')

if merge_keys is not None:
    # Average numeric indicators per key so the right-hand side has exactly one row per key.
    colv_agg = colv.groupby(merge_keys, as_index=False).mean(numeric_only=True)
    # validate='many_to_one' makes pandas fail loudly if the right side is ever not unique,
    # which would otherwise silently duplicate train/test rows.
    train = train_raw.merge(colv_agg, on=merge_keys, how='left', validate='many_to_one')
    test = test_raw.merge(colv_agg, on=merge_keys, how='left', validate='many_to_one')
else:
    train, test = train_raw.copy(), test_raw.copy()

# Keep the first occurrence of any repeated column label.
train = train.loc[:, ~train.columns.duplicated()]
test = test.loc[:, ~test.columns.duplicated()]

In [None]:
# Cell 4: Basic Cleaning
# Define target, drop invalid rows, identify shared features.
# Analysis: Targets are positive salaries; drop NaN/<=0 to avoid RMSPE issues.
TARGET = 'salary_average'

# Keep only rows whose target is present and strictly positive, then renumber the index.
valid_target = train[TARGET].notna() & train[TARGET].gt(0)
train = train.loc[valid_target].reset_index(drop=True)

# Candidate predictors: columns present in both frames, in train's column order.
feat_cols = [col for col in train.columns if col in test.columns]
print(len(feat_cols), 'shared features')

In [None]:
# Cell 5: Data Types & Numeric Cleanup
# Convert objects to numeric if possible, winsorize outliers, log-transform skewed positives, impute medians.
# Analysis: Salaries and COL indicators are skewed; log helps normality. Winsorize prevents extreme influence.
# Identifier-style columns referenced elsewhere in this notebook (merge key, CV grouping key,
# submission id). Their values are labels, not magnitudes, so they must not be clipped or
# log-transformed: clipping city_id would merge distinct cities into one CV group.
ID_LIKE_COLS = {'ID', 'id', 'Id', 'city_id'}

# Promote text columns to numeric when almost every value parses as a number in both frames.
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c], errors='coerce')
        if tr_num.notna().mean() > 0.95 and te_num.notna().mean() > 0.95:
            train[c] = tr_num
            test[c] = te_num

num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

# A column can be numeric in train yet still text in test; coerce it so the numeric steps
# below (clip / log1p / fillna) operate on numbers in both frames.
for c in num_cols:
    if not pd.api.types.is_numeric_dtype(test[c]):
        test[c] = pd.to_numeric(test[c], errors='coerce')

value_cols = [c for c in num_cols if c not in ID_LIKE_COLS]

# Winsorize to train's 1st/99th percentiles; test is clipped with the same bounds.
for c in value_cols:
    q1, q99 = train[c].quantile([0.01, 0.99])
    train[c] = train[c].clip(q1, q99)
    test[c] = test[c].clip(q1, q99)

# Log-transform right-skewed, strictly positive columns. Positivity is judged on the observed
# values only: a comparison with NaN is False, so testing the raw column would exclude every
# column that has a single missing value (imputation only happens in the next step).
for c in value_cols:
    observed = train[c].dropna()
    if len(observed) and (observed > 0).all() and observed.skew() > 1.0:
        train[c] = np.log1p(train[c])
        test[c] = np.log1p(test[c])

# Median-impute every numeric column (identifiers included) using train's median.
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c] = test[c].fillna(med)

In [None]:
# Cell 6: Add Ratio Features
# Create meaningful ratios from COL indicators to capture relative costs.
# Analysis: Ratios normalize costs against purchasing power, helping generalize to unseen cities.
def add_ratios(df):
    """Append cost-ratio features built from columns located by name fragment.

    Columns are looked up case-insensitively by substring ('purchasing', 'rent',
    'grocer', 'transport', 'restaurant', 'health', 'utilit'); for each fragment the
    first numeric column whose name contains it is used. When a purchasing-power
    column exists, every other located column is divided by it; rent is additionally
    divided by the grocery and transport columns when those exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with any ``f_*`` ratio columns added.
    """
    cand = df.columns.str.lower()
    # Only numeric columns can take part in a division.
    is_num = np.array([pd.api.types.is_numeric_dtype(dt) for dt in df.dtypes], dtype=bool)

    def get(name):
        # First numeric column whose lower-cased name contains `name`, else None.
        hits = np.where(np.asarray(cand.str.contains(name, regex=False, na=False), dtype=bool) & is_num)[0]
        return df.iloc[:, hits[0]] if len(hits) else None

    ppp = get('purchasing')
    rent = get('rent')
    # 'grocer' also matches 'grocery'. Do not chain lookups with `or`: that evaluates the
    # truth value of a Series, which raises ValueError whenever a column is found.
    groc = get('grocer')
    trans = get('transport')
    rest = get('restaurant')
    health = get('health')
    util = get('utilit')

    if ppp is not None:
        for other in [rent, groc, trans, rest, health, util]:
            if other is not None:
                # Feature name uses the first '_'-separated token of the source column name.
                df[f'f_{other.name.split("_")[0]}_over_ppp'] = other / (ppp + 1e-6)

    if rent is not None and groc is not None:
        df['f_rent_over_groc'] = rent / (groc + 1e-6)
    if rent is not None and trans is not None:
        df['f_rent_over_trans'] = rent / (trans + 1e-6)

    return df

# Add the ratio features to both frames (train first, then test).
train, test = add_ratios(train), add_ratios(test)

# Ratio columns were appended, so rebuild the shared-feature bookkeeping.
feat_cols = [col for col in train.columns if col in test.columns]
num_cols, cat_cols = [], []
for col in feat_cols:
    # Numeric-dtype columns go to num_cols, everything else to cat_cols.
    if pd.api.types.is_numeric_dtype(train[col]):
        num_cols.append(col)
    else:
        cat_cols.append(col)

In [None]:
# --------------------------------------------------------------
# Cell 7 – Add PCA components & K-Means cluster label
# --------------------------------------------------------------
N_PCA_COMPONENTS = 10
N_CITY_CLUSTERS = 20

# Leave out columns this cell itself generates, so re-running the cell does not feed
# earlier PCA output back into the PCA.
pca_input_cols = [c for c in num_cols if not str(c).startswith('pca_')]

if len(pca_input_cols) > N_PCA_COMPONENTS:      # need more inputs than components
    # ---- robust-scale the numeric block (fit on train, apply to test) ----
    scaler_pca = RobustScaler()
    X_num_tr = scaler_pca.fit_transform(train[pca_input_cols])
    X_num_te = scaler_pca.transform(test[pca_input_cols])

    # ---- PCA --------------------------------------------------------------
    pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)
    pca_tr = pca.fit_transform(X_num_tr)
    pca_te = pca.transform(X_num_te)

    for i in range(pca_tr.shape[1]):
        train[f'pca_{i}'] = pca_tr[:, i]
        test[f'pca_{i}'] = pca_te[:, i]

    # ---- K-Means on the scaled numeric block ------------------------------
    kmeans = KMeans(n_clusters=N_CITY_CLUSTERS, random_state=42, n_init=10)
    clusters_tr = kmeans.fit_predict(X_num_tr)      # numpy array of labels
    clusters_te = kmeans.predict(X_num_te)          # numpy array of labels

    # Wrap the arrays in index-aligned Series before casting to category.
    train['city_cluster'] = pd.Series(clusters_tr, index=train.index).astype('category')
    test['city_cluster'] = pd.Series(clusters_te, index=test.index).astype('category')

# Rebuild the feature lists: the model cells select train[feat_cols], so without this
# refresh the pca_* and city_cluster columns created above would never reach any model
# or the city_cluster target encoding.
feat_cols = [c for c in train.columns if c in test.columns]
num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

In [None]:
# Cell 8: Smoothed Target Encoding Function
# Use higher prior for smoothing to handle sparse categories and unseen data.
# Analysis: Increased prior (20) smooths more, reducing overfit on rare country/role combos.
def kfold_target_encode(df_tr, y_tr, df_te, col, n_splits=5, prior=20.0, min_count=10, noise=0.01):
    """Smoothed mean-target encoding of ``df_te[col]`` learned from ``df_tr`` / ``y_tr``.

    Each category is encoded as ``(sum_y + prior * mean_y) / (count + prior)``; categories
    not seen while fitting receive ``mean_y``.

    Two modes are selected automatically:

    * ``df_te`` holds the very rows being fitted on (same index and same key values as
      ``df_tr``): the encoding is computed out-of-fold. Rows are split into ``n_splits``
      shuffled folds (fixed seed 42) and each fold is encoded from the other folds only,
      so a row's own target never contributes to its encoding.
    * Otherwise (validation / test rows): the encoding is fitted once on all of ``df_tr``.

    Parameters
    ----------
    df_tr : DataFrame containing ``col``; rows used to fit the encoding.
    y_tr : target values aligned with ``df_tr``.
    df_te : DataFrame containing ``col``; rows to encode.
    col : name of the categorical column (values are compared as strings).
    n_splits : number of folds for the out-of-fold mode.
    prior : smoothing strength (pseudo-count given to the mean).
    min_count : accepted for interface compatibility; not applied by this function.
    noise : standard deviation of Gaussian noise added to every output value
        (drawn from NumPy's global RNG); ``0`` disables it.

    Returns
    -------
    pandas.Series of float64 indexed like ``df_te``.
    """
    key_tr = df_tr[col].astype(str)
    key_te = df_te[col].astype(str)
    fit_frame = pd.DataFrame({'key': key_tr, 'y': y_tr})

    def _encode(fit_rows, keys, fallback):
        # Smoothed per-category mean from `fit_rows`, looked up for `keys`.
        agg = fit_rows.groupby('key')['y'].agg(['sum', 'count'])
        table = (agg['sum'] + prior * fallback) / (agg['count'] + prior)
        return keys.map(table).fillna(fallback).values

    # Out-of-fold mode only when the rows to encode are exactly the fitting rows.
    encoding_own_rows = (
        n_splits >= 2
        and len(fit_frame) >= n_splits
        and fit_frame.index.equals(key_tr.index)
        and key_te.index.equals(key_tr.index)
        and key_te.equals(key_tr)
    )

    if encoding_own_rows:
        encoded = np.empty(len(fit_frame), dtype='float64')
        # Local RandomState: fold assignment is reproducible and leaves the global RNG untouched.
        order = np.random.RandomState(42).permutation(len(fit_frame))
        for held_out in np.array_split(order, n_splits):
            fit_mask = np.ones(len(fit_frame), dtype=bool)
            fit_mask[held_out] = False
            fit_rows = fit_frame.iloc[fit_mask]
            encoded[held_out] = _encode(fit_rows, key_tr.iloc[held_out], fit_rows['y'].mean())
        out = pd.Series(encoded, index=df_te.index, dtype='float64')
    else:
        out = pd.Series(_encode(fit_frame, key_te, y_tr.mean()), index=df_te.index, dtype='float64')

    if noise > 0:
        out += noise * np.random.randn(len(out))

    return out

In [None]:
# Cell 9: Add Target Encodings
# Encode more levels: country, state, role, interactions.
# Analysis: State adds granularity where available; clusters from Cell 7 also encoded.
def add_te_block(X_tr, y_tr, X_tgt):
    """Return a copy of ``X_tgt`` with target-encoding columns learned from ``X_tr`` / ``y_tr``.

    Adds ``te_<col>`` for each of country, state, role and city_cluster, and
    ``te_<a>_<b>`` for the (country, role), (country, state) and (state, role) pairs,
    where a pair is encoded on the string ``"<a>_<b>"``. A column or pair is skipped
    unless every column involved is present in both frames.

    Parameters
    ----------
    X_tr : DataFrame of rows the encodings are fitted on.
    y_tr : target values aligned with ``X_tr``.
    X_tgt : DataFrame of rows to encode; it is not modified.

    Returns
    -------
    pandas.DataFrame: copy of ``X_tgt`` plus the ``te_*`` columns.
    """
    Xt = X_tgt.copy()

    for col in ['country', 'state', 'role', 'city_cluster']:
        if col in X_tr.columns and col in Xt.columns:
            Xt[f'te_{col}'] = kfold_target_encode(X_tr, y_tr, Xt, col)

    for first, second in [('country', 'role'), ('country', 'state'), ('state', 'role')]:
        # Both frames need both columns; checking only X_tr would raise KeyError on Xt.
        if {first, second}.issubset(X_tr.columns) and {first, second}.issubset(Xt.columns):
            key_tr = X_tr[first].astype(str) + '_' + X_tr[second].astype(str)
            key_tgt = Xt[first].astype(str) + '_' + Xt[second].astype(str)
            Xt[f'te_{first}_{second}'] = kfold_target_encode(
                pd.DataFrame({'key': key_tr}), y_tr, pd.DataFrame({'key': key_tgt}), 'key')

    return Xt

In [None]:
# Cell 10: CV Setup & RMSPE Metric
# GroupKFold by city_id or city to prevent leakage across similar locations.
# Analysis: Groups ensure unseen cities are properly validated.
# Group rows by city so all rows of a city land in the same fold.
group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else None)
if group_key is not None:
    groups = train[group_key]
else:
    # No city column: give every row its own group. A single shared group would make
    # GroupKFold raise, because it needs at least n_splits distinct groups.
    groups = pd.Series(np.arange(len(train)), index=train.index)
gkf = GroupKFold(n_splits=5)

def rmspe(y, yhat, eps=1e-6):
    """Root mean squared percentage error.

    Only positions where the true value exceeds ``eps`` and both the true value and
    the prediction are finite contribute to the score.
    """
    actual = np.asarray(y, float)
    predicted = np.asarray(yhat, float)

    # Rows that can be scored: finite on both sides and a strictly positive target.
    usable = np.isfinite(actual) & np.isfinite(predicted) & (actual > eps)

    rel_err = (predicted[usable] - actual[usable]) / actual[usable]
    return np.sqrt(np.mean(rel_err ** 2))

# Zero-filled buffers: one slot per training row for the out-of-fold values,
# one slot per test row for the fold-averaged test predictions.
n_train, n_test = len(train), len(test)
y_true_oof, lgb_oof, cb_oof, xgb_oof = (np.zeros(n_train) for _ in range(4))
lgb_test, cb_test, xgb_test = (np.zeros(n_test) for _ in range(3))

In [None]:
# Cell 11: LightGBM Model
# Tune params for deeper trees, more regularization to prevent overfit.
# Analysis: Lower LR, higher leaves for complexity; early stopping on 500.
lgb_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'learning_rate': 0.03,
    'num_leaves': 63,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'lambda_l1': 2.0,
    'lambda_l2': 5.0,
    'max_depth': -1,
    'verbosity': -1,
    'seed': 42
}

# Start from clean buffers: test predictions are accumulated with +=, so re-running this
# cell on buffers left over from a previous run would inflate them.
n_folds = gkf.get_n_splits()        # average over the splitter's real fold count
lgb_oof = np.zeros(len(train))
lgb_test = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_fit = train.iloc[tr_idx][feat_cols]
    y_tr = train.iloc[tr_idx][TARGET].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    # Target encodings are always learned from this fold's training rows only;
    # add_te_block returns a fresh copy of the frame it encodes.
    X_tr = add_te_block(X_fit, y_tr, X_fit)
    X_va = add_te_block(X_fit, y_tr, train.iloc[va_idx][feat_cols])
    X_te = add_te_block(X_fit, y_tr, test[feat_cols])

    # Every non-numeric column is handed to LightGBM as a pandas categorical.
    cat_in_use = [c for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]
    for c in cat_in_use:
        X_tr[c] = X_tr[c].astype('category')
        X_va[c] = X_va[c].astype('category')
        X_te[c] = X_te[c].astype('category')

    # The model is trained on log1p(target); predictions are mapped back with expm1.
    dtr = lgb.Dataset(X_tr, label=np.log1p(y_tr), categorical_feature=cat_in_use)
    dva = lgb.Dataset(X_va, label=np.log1p(y_va), categorical_feature=cat_in_use)

    model = lgb.train(lgb_params, dtr, valid_sets=[dtr, dva], num_boost_round=8000,
                      callbacks=[lgb.early_stopping(500, verbose=False)])

    va_pred = np.expm1(model.predict(X_va))
    te_pred = np.expm1(model.predict(X_te))

    y_true_oof[va_idx] = y_va.values
    lgb_oof[va_idx] = va_pred
    lgb_test += te_pred / n_folds

    print(f'[LGBM] Fold {fold} RMSPE: {rmspe(y_va, va_pred):.4f}')

print(f'[LGBM] OOF RMSPE: {rmspe(y_true_oof, lgb_oof):.4f}')

In [None]:
# Cell 12: CatBoost Model
# Increased iterations, adjusted reg for better generalization.
# Analysis: CatBoost handles cats natively; longer training with early stop.
# Start from clean buffers: test predictions are accumulated with +=, so re-running this
# cell on buffers left over from a previous run would inflate them.
n_folds = gkf.get_n_splits()        # average over the splitter's real fold count
cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_fit = train.iloc[tr_idx][feat_cols]
    y_tr = train.iloc[tr_idx][TARGET].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    # Target encodings are always learned from this fold's training rows only;
    # add_te_block returns a fresh copy of the frame it encodes.
    X_tr = add_te_block(X_fit, y_tr, X_fit)
    X_va = add_te_block(X_fit, y_tr, train.iloc[va_idx][feat_cols])
    X_te = add_te_block(X_fit, y_tr, test[feat_cols])

    # CatBoost categorical features are passed as strings; missing values become 'Unknown'.
    for df in (X_tr, X_va, X_te):
        for c in df.columns:
            if not pd.api.types.is_numeric_dtype(df[c]):
                df[c] = df[c].astype('object').where(df[c].notna(), 'Unknown').astype(str)

    cat_idx = [X_tr.columns.get_loc(c) for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]

    # The model is trained on log1p(target); predictions are mapped back with expm1.
    tr_pool = Pool(X_tr, np.log1p(y_tr), cat_features=cat_idx)
    va_pool = Pool(X_va, np.log1p(y_va), cat_features=cat_idx)
    te_pool = Pool(X_te, cat_features=cat_idx)

    model = CatBoostRegressor(loss_function='RMSE', learning_rate=0.03, depth=9, l2_leaf_reg=8.0,
                              iterations=15000, random_seed=42, early_stopping_rounds=700, verbose=False)
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = np.expm1(model.predict(va_pool))
    te_pred = np.expm1(model.predict(te_pool))

    # Record the true targets here as well, so the OOF score printed below does not
    # depend on another model cell having filled y_true_oof first.
    y_true_oof[va_idx] = y_va.values
    cb_oof[va_idx] = va_pred
    cb_test += te_pred / n_folds

    print(f'[CatBoost] Fold {fold} RMSPE: {rmspe(y_va, va_pred):.4f}')

print(f'[CatBoost] OOF RMSPE: {rmspe(y_true_oof, cb_oof):.4f}')

In [None]:
# Cell 13: XGBoost Model
# Adjusted subsample and reg for diversity.
# Analysis: XGB adds gradient boosting variety; hist method for speed.
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.03,
    'max_depth': 9,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'reg_alpha': 1.0,
    'reg_lambda': 4.0,
    'n_estimators': 25000,
    'tree_method': 'hist',
    'random_state': 42
}

# Start from clean buffers: test predictions are accumulated with +=, so re-running this
# cell on buffers left over from a previous run would inflate them.
n_folds = gkf.get_n_splits()        # average over the splitter's real fold count
xgb_oof = np.zeros(len(train))
xgb_test = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_fit = train.iloc[tr_idx][feat_cols]
    y_tr = train.iloc[tr_idx][TARGET].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    # Target encodings are always learned from this fold's training rows only;
    # add_te_block returns a fresh copy of the frame it encodes.
    X_tr = add_te_block(X_fit, y_tr, X_fit)
    X_va = add_te_block(X_fit, y_tr, train.iloc[va_idx][feat_cols])
    X_te = add_te_block(X_fit, y_tr, test[feat_cols])

    # Integer-encode non-numeric columns with ONE mapping taken from the training rows.
    # Encoding each frame independently assigns codes by that frame's own sorted
    # categories, so the same label would get different integers in train / valid / test.
    # Labels absent from the training rows get code -1.
    cat_in_use = [c for c in X_tr.columns
                  if any(not pd.api.types.is_numeric_dtype(df[c]) for df in (X_tr, X_va, X_te))]
    for c in cat_in_use:
        levels = pd.Categorical(X_tr[c]).categories
        for df in (X_tr, X_va, X_te):
            df[c] = pd.Categorical(df[c], categories=levels).codes.astype('int32')

    # early_stopping_rounds is set on the estimator: XGBoost 2.x no longer accepts it in fit().
    # NOTE(review): requires xgboost >= 1.6, where the constructor parameter was introduced.
    model = xgb.XGBRegressor(early_stopping_rounds=700, **xgb_params)
    model.fit(X_tr, np.log1p(y_tr), eval_set=[(X_va, np.log1p(y_va))], verbose=False)

    # The model is trained on log1p(target); predictions are mapped back with expm1.
    va_pred = np.expm1(model.predict(X_va))
    te_pred = np.expm1(model.predict(X_te))

    # Record the true targets here as well, so the OOF score printed below does not
    # depend on another model cell having filled y_true_oof first.
    y_true_oof[va_idx] = y_va.values
    xgb_oof[va_idx] = va_pred
    xgb_test += te_pred / n_folds

    print(f'[XGB] Fold {fold} RMSPE: {rmspe(y_va, va_pred):.4f}')

print(f'[XGB] OOF RMSPE: {rmspe(y_true_oof, xgb_oof):.4f}')

In [None]:
# Cell 14: Stacking Ensemble
# Meta-learner on OOF predictions; compare to weighted blend.
# Analysis: Stack captures model correlations; choose best to avoid overfit.
# ---- Ridge meta-learner on log1p of the three models' OOF predictions ----
oof_stack = np.vstack([np.log1p(np.clip(lgb_oof, 0, None)),
                       np.log1p(np.clip(cb_oof, 0, None)),
                       np.log1p(np.clip(xgb_oof, 0, None))]).T
y_log = np.log1p(y_true_oof)

scaler = RobustScaler()
X_meta = scaler.fit_transform(oof_stack)

meta = Ridge(alpha=0.5, random_state=42, fit_intercept=True)
meta.fit(X_meta, y_log)

# Test predictions go through the same clip -> log1p -> scale pipeline.
test_stack = np.vstack([np.log1p(np.clip(lgb_test, 0, None)),
                        np.log1p(np.clip(cb_test, 0, None)),
                        np.log1p(np.clip(xgb_test, 0, None))]).T
test_stack = scaler.transform(test_stack)
stack_pred = np.expm1(meta.predict(test_stack))

# NOTE(review): the stack is scored on the same OOF rows the Ridge was fitted on, and the
# blend weights below are tuned on those rows too, so both scores are optimistic.
stack_oof = np.expm1(meta.predict(X_meta))
stack_score = rmspe(y_true_oof, stack_oof)      # computed once, reused below
print(f'OOF RMSPE — Stack: {stack_score:.5f}')

# ---- Weighted blend search over a grid of convex weights (a, b, c) ----
best = (None, 9e9)
for a in np.linspace(0.0, 1.0, 11):
    for b in np.linspace(0.0, 1.0 - a, 11):
        # Floating-point round-off can leave 1 - a - b a hair below zero; clamp it.
        c = max(0.0, 1.0 - a - b)
        oof = a * lgb_oof + b * cb_oof + c * xgb_oof
        s = rmspe(y_true_oof, oof)
        if s < best[1]:
            best = ((a, b, c), s)

if best[0] is None:
    # Every candidate scored NaN (nothing could be scored); fall back to equal weights
    # instead of failing on best[0] being None.
    best = ((1.0 / 3, 1.0 / 3, 1.0 / 3), float('nan'))

print(f'Best weights: {best[0]} | RMSPE: {best[1]:.5f}')

w = best[0]
blend_test = w[0] * lgb_test + w[1] * cb_test + w[2] * xgb_test

# Keep whichever of the two has the lower (or equal) OOF score.
use_stack = stack_score <= best[1]
final_test_pred = stack_pred if use_stack else blend_test
print(f'Using {"Stack" if use_stack else "Blend"} | Final OOF RMSPE: {min(stack_score, best[1]):.5f}')

In [None]:
# Cell 15: Generate Submission
# Use full test rows (2799), IDs from test_raw.
# Analysis: No trimming; match prompt's 2799 samples.
pred = np.asarray(final_test_pred, float)

# Use the first recognised identifier column of the raw test table;
# without one, number the rows 1..N.
id_col = None
for candidate in ['ID', 'id', 'Id']:
    if candidate in test_raw.columns:
        id_col = candidate
        break

if id_col:
    ids = test_raw[id_col].values
else:
    ids = np.arange(1, len(test_raw) + 1)

sub = pd.DataFrame({'ID': ids, 'salary_average': pred})
sub.to_csv('submission.csv', index=False)
print('Submission shape:', sub.shape)
sub.head()

In [None]:
# Cell 16: Cleanup
# Run a full garbage-collection pass. The trailing semicolon keeps the notebook from
# echoing gc.collect()'s integer return value as the cell output.
gc.collect();