In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-old/test.csv
/kaggle/input/eee-g513/train.csv
/kaggle/input/eee-g513/test.csv
/kaggle/input/eee-g513/cost_of_living.csv


In [2]:
# 1) Imports & Setup
import os, gc, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
np.random.seed(42)

from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler

In [3]:
# 2) Load data 
DATA_PATH = "/kaggle/input/eee-g513"  

train_raw = pd.read_csv(f"{DATA_PATH}/train.csv")
test_raw  = pd.read_csv(f"/kaggle/input/test-old/test.csv")
colv_path = f"{DATA_PATH}/cost_of_living.csv"
colv = pd.read_csv(colv_path) if os.path.exists(colv_path) else None

print(train_raw.shape, test_raw.shape, None if colv is None else colv.shape)

(6525, 6) (2799, 5) (1528, 56)


In [4]:
# 3) Merge COL on a single unique key per city
if colv is not None:
    if 'city_id' in train_raw.columns and 'city_id' in colv.columns:
        keys = ['city_id']
    else:
        keys = ['city','country']

    # force 1 row per key (group mean over numeric indicators)
    colv_agg = colv.groupby(keys, as_index=False).mean(numeric_only=True)
    train = train_raw.merge(colv_agg, on=keys, how='left')
    test  = test_raw.merge(colv_agg,  on=keys, how='left')
else:
    train, test = train_raw.copy(), test_raw.copy()

# drop duplicate-named columns if any
train = train.loc[:, ~train.columns.duplicated()]
test  = test.loc[:, ~test.columns.duplicated()]
print("Cell Running Complete")

Cell Running Complete


In [5]:
# 4) Basic cleaning
TARGET = "salary_average"
assert TARGET in train.columns

# drop invalid targets (NaN/nonpositive)
train = train[train[TARGET].notna() & (train[TARGET] > 0)].reset_index(drop=True)

# features common to train & test (exclude target if present)
feat_cols = [c for c in train.columns if c in test.columns]
print(len(feat_cols), "shared features")

58 shared features


In [6]:
# 5) Types & numeric cleanup
# convert numeric-like object columns
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c], errors='coerce')
        if tr_num.notna().mean() > 0.95 and te_num.notna().mean() > 0.95:
            train[c] = tr_num
            test[c]  = te_num

num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

# winsorize (1%/99%) to tame outliers
for c in num_cols:
    q1, q99 = train[c].quantile([0.01, 0.99])
    train[c] = train[c].clip(q1, q99)
    test[c]  = test[c].clip(q1, q99)

# log1p skewed positive numerics
for c in num_cols:
    if (train[c] > 0).all() and train[c].skew() > 1.0:
        train[c] = np.log1p(train[c])
        test[c]  = np.log1p(test[c])

# impute numerics with train median
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c]  = test[c].fillna(med)

print("Cell Running Complete")

Cell Running Complete


In [7]:
# 6) Robust, low-leakage features (only from predictors)
def add_ratios(df):
    # Use common cost-of-living names if present; skip silently if missing
    cand = df.columns.str.lower()
    def get(name):
        # find first col that contains the token
        idx = np.where(cand.str.contains(name))[0]
        return df.iloc[:, idx[0]] if len(idx) else None

    ppp  = get('purchasing')  # local purchasing power index
    rent = get('rent')
    groc = get('grocer') or get('grocery') or get('groceries')
    trans= get('transport')
    rest = get('restaurant')

    if ppp is not None:
        if rent is not None: df['f_rent_over_ppp'] = rent / (ppp + 1e-6)
        if groc is not None: df['f_groc_over_ppp'] = groc / (ppp + 1e-6)
        if trans is not None: df['f_trans_over_ppp'] = trans / (ppp + 1e-6)
        if rest is not None: df['f_rest_over_ppp'] = rest / (ppp + 1e-6)

    if (rent is not None) and (groc is not None):
        df['f_rent_over_groc'] = rent / (groc + 1e-6)
    if (rent is not None) and (trans is not None):
        df['f_rent_over_trans'] = rent / (trans + 1e-6)

    return df

train = add_ratios(train)
test  = add_ratios(test)

# update feature lists after new columns
feat_cols = [c for c in train.columns if c in test.columns]
num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

print("Cell Running Complete")

Cell Running Complete


In [8]:
# 7) CV-safe smoothed target encodings to reduce variance
def kfold_target_encode(df_tr, y_tr, df_te, col, n_splits=5, prior=0.0, min_count=20, noise=0.0):
    """
    Smoothed mean encoding:
    enc = (sum_y + prior * global_mean) / (count + prior)
    Computed only on "df_tr" then applied to "df_te" to avoid leakage.
    """
    gkf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    out = pd.Series(index=df_te.index, dtype='float64')
    global_mean = y_tr.mean()

    # build mapping on full training for test application
    full_key = df_tr[col].astype(str)
    agg = pd.DataFrame({col: full_key, 'y': y_tr}).groupby(col)['y'].agg(['sum','count'])
    enc_full = (agg['sum'] + prior * global_mean) / (agg['count'] + prior)

    # apply to df_te (unseen -> global mean)
    out.loc[:] = df_te[col].astype(str).map(enc_full).fillna(global_mean).values

    # optional noise
    if noise > 0:
        out = out * (1 + noise * np.random.randn(len(out)))

    return out

# helper to add TE columns inside each fold
# def add_te_block(X_tr, y_tr, X_tgt):
#     Xt = X_tgt.copy()
#     gmean = y_tr.mean()
#     if 'country' in X_tr.columns:
#         Xt['te_country'] = kfold_target_encode(X_tr, y_tr, Xt, 'country', prior=50, min_count=20, noise=0.0)
#     if 'role' in X_tr.columns:
#         Xt['te_role'] = kfold_target_encode(X_tr, y_tr, Xt, 'role', prior=50, min_count=20, noise=0.0)
#     if set(['country','role']).issubset(X_tr.columns):
#         Xt['cr_key'] = list(zip(Xt['country'].astype(str), Xt['role'].astype(str)))
#         Xtr_cr = list(zip(X_tr['country'].astype(str), X_tr['role'].astype(str)))
#         # encode the tuple via a concat string key (stable)
#         Xt['cr_key'] = Xt['cr_key'].astype(str)
#         tmp_tr = pd.DataFrame({'cr': np.array(Xtr_cr).astype(str), 'y': y_tr})
#         Xt['te_country_role'] = kfold_target_encode(tmp_tr[['cr']], y_tr, Xt[['cr_key']].rename(columns={'cr_key':'cr'}), 'cr', prior=50).values
#         Xt.drop(columns=['cr_key'], inplace=True, errors='ignore')
#     return Xt

print("Cell Running Complete")

Cell Running Complete


In [9]:
# 8) GroupKFold by city_id (preferred) or city string
group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else None)
groups = train[group_key] if group_key is not None else pd.Series(['all']*len(train))
gkf = GroupKFold(n_splits=5)

def rmspe(y, yhat, eps=1e-6):
    y = np.asarray(y, float); yhat = np.asarray(yhat, float)
    mask = (y > eps) & np.isfinite(y) & np.isfinite(yhat)
    return np.sqrt(np.mean(((yhat[mask]-y[mask])/y[mask])**2))
print("Cell Running Complete")

Cell Running Complete


In [10]:
# 9) Base Model A — LightGBM (log target)  [FIXED add_te_block]

import lightgbm as lgb

# --- FIX: redefine TE block to use 1D string keys for (country x role) ---
def add_te_block(X_tr, y_tr, X_tgt):
    Xt = X_tgt.copy()

    # country
    if 'country' in X_tr.columns and 'country' in Xt.columns:
        enc = kfold_target_encode(
            df_tr=X_tr[['country']].assign(country=X_tr['country'].astype(str)),
            y_tr=y_tr,
            df_te=Xt[['country']].assign(country=Xt['country'].astype(str)),
            col='country',
            prior=50
        )
        Xt['te_country'] = enc.values

    # role
    if 'role' in X_tr.columns and 'role' in Xt.columns:
        enc = kfold_target_encode(
            df_tr=X_tr[['role']].assign(role=X_tr['role'].astype(str)),
            y_tr=y_tr,
            df_te=Xt[['role']].assign(role=Xt['role'].astype(str)),
            col='role',
            prior=50
        )
        Xt['te_role'] = enc.values

    # country x role (single 1D key as string)
    if {'country','role'}.issubset(X_tr.columns) and {'country','role'}.issubset(Xt.columns):
        cr_tr  = (X_tr['country'].astype(str) + '§' + X_tr['role'].astype(str))
        cr_tgt = (Xt['country'].astype(str)   + '§' + Xt['role'].astype(str))
        tmp_tr = pd.DataFrame({'cr': cr_tr})
        tmp_te = pd.DataFrame({'cr': cr_tgt})
        enc = kfold_target_encode(
            df_tr=tmp_tr, y_tr=y_tr,
            df_te=tmp_te, col='cr',
            prior=50
        )
        Xt['te_country_role'] = enc.values

    return Xt

lgb_oof = np.zeros(len(train))
lgb_test = np.zeros(len(test))
y_true_oof = np.zeros(len(train))  # capture OOF ground truth

lgb_params = dict(
    objective='rmse',
    metric='rmse',
    learning_rate=0.03,
    num_leaves=31,
    min_data_in_leaf=100,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=1,
    lambda_l1=1.0,
    lambda_l2=3.0,
    max_depth=-1,
    verbosity=-1,
    seed=42
)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].copy()
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    # add smoothed target encodings (CV-safe)
    X_tr = add_te_block(X_tr, y_tr, X_tr)
    X_va = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, X_va)
    X_te = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, test[feat_cols].copy())

    # cast remaining non-numerics to category for LGBM
    cat_in_use = [c for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]
    for c in cat_in_use:
        X_tr[c] = X_tr[c].astype('category')
        X_va[c] = X_va[c].astype('category')
        X_te[c] = X_te[c].astype('category')

    dtr = lgb.Dataset(X_tr, label=np.log1p(y_tr), categorical_feature=cat_in_use, free_raw_data=True)
    dva = lgb.Dataset(X_va, label=np.log1p(y_va), categorical_feature=cat_in_use, free_raw_data=True)

    model = lgb.train(
        params=lgb_params,
        train_set=dtr,
        valid_sets=[dtr, dva],
        num_boost_round=6000,
        callbacks=[lgb.early_stopping(400, verbose=False)]
    )

    va_pred = np.expm1(model.predict(X_va, num_iteration=model.best_iteration))
    te_pred = np.expm1(model.predict(X_te, num_iteration=model.best_iteration))

    y_true_oof[va_idx] = y_va.values
    lgb_oof[va_idx]    = va_pred
    lgb_test          += te_pred / gkf.get_n_splits()

    print(f"[LGBM] fold {fold} RMSPE:", round(rmspe(y_va, va_pred), 4))

print("[LGBM] OOF RMSPE:", round(rmspe(y_true_oof, lgb_oof), 4))


[LGBM] fold 1 RMSPE: 0.0873
[LGBM] fold 2 RMSPE: 0.2872
[LGBM] fold 3 RMSPE: 0.099
[LGBM] fold 4 RMSPE: 0.1166
[LGBM] fold 5 RMSPE: 0.309
[LGBM] OOF RMSPE: 0.1952


In [11]:
# 10) CatBoost
from catboost import CatBoostRegressor, Pool

cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].copy()
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    X_tr = add_te_block(X_tr, y_tr, X_tr)
    X_va = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, X_va)
    X_te = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, test[feat_cols].copy())

    # ensure categorical columns are strings with no NaNs
    for df in (X_tr, X_va, X_te):
        for c in df.columns:
            if not pd.api.types.is_numeric_dtype(df[c]):
                df[c] = df[c].astype('object').where(df[c].notna(), 'Unknown').astype(str)

    cat_idx = [X_tr.columns.get_loc(c) for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]

    tr_pool = Pool(X_tr, np.log1p(y_tr), cat_features=cat_idx)
    va_pool = Pool(X_va, np.log1p(y_va), cat_features=cat_idx)
    te_pool = Pool(X_te, cat_features=cat_idx)

    model = CatBoostRegressor(
        loss_function='RMSE',
        learning_rate=0.035,
        depth=8,
        l2_leaf_reg=7.0,
        iterations=12000,
        random_seed=42,
        early_stopping_rounds=600,
        verbose=False
    )
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = np.expm1(model.predict(va_pool))
    te_pred = np.expm1(model.predict(te_pool))

    cb_oof[va_idx] = va_pred
    cb_test += te_pred / gkf.get_n_splits()

    print(f"[CatBoost] fold {fold} RMSPE:", round(rmspe(y_va, va_pred), 4))

print("[CatBoost] OOF RMSPE:", round(rmspe(y_true_oof, cb_oof), 4))


[CatBoost] fold 1 RMSPE: 0.0969
[CatBoost] fold 2 RMSPE: 0.4891
[CatBoost] fold 3 RMSPE: 0.1635
[CatBoost] fold 4 RMSPE: 0.2522
[CatBoost] fold 5 RMSPE: 0.3249
[CatBoost] OOF RMSPE: 0.2836


In [12]:
# 11) XGBoost (adds diversity)
import xgboost as xgb

xgb_oof = np.zeros(len(train))
xgb_test = np.zeros(len(test))

xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    learning_rate=0.04,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=2.0,
    n_estimators=20000,
    tree_method='hist',
    random_state=42
)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy()
    y_tr = train.iloc[tr_idx][TARGET].copy()
    X_va = train.iloc[va_idx][feat_cols].copy()
    y_va = train.iloc[va_idx][TARGET].copy()

    X_tr = add_te_block(X_tr, y_tr, X_tr)
    X_va = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, X_va)
    X_te = add_te_block(train.iloc[tr_idx][feat_cols], y_tr, test[feat_cols].copy())

    # XGBoost needs numeric matrix; convert non-numerics via category codes
    for df in (X_tr, X_va, X_te):
        for c in df.columns:
            if not pd.api.types.is_numeric_dtype(df[c]):
                df[c] = df[c].astype('category').cat.codes.astype('int32')

    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_tr, np.log1p(y_tr),
        eval_set=[(X_va, np.log1p(y_va))],
        verbose=False,
        early_stopping_rounds=600
    )

    va_pred = np.expm1(model.predict(X_va))
    te_pred = np.expm1(model.predict(X_te))

    xgb_oof[va_idx] = va_pred
    xgb_test += te_pred / gkf.get_n_splits()

    print(f"[XGB] fold {fold} RMSPE:", round(rmspe(y_va, va_pred), 4))

print("[XGB] OOF RMSPE:", round(rmspe(y_true_oof, xgb_oof), 4))


[XGB] fold 1 RMSPE: 0.1131
[XGB] fold 2 RMSPE: 0.4877
[XGB] fold 3 RMSPE: 0.1504
[XGB] fold 4 RMSPE: 0.2467
[XGB] fold 5 RMSPE: 0.3796
[XGB] OOF RMSPE: 0.2946


In [13]:
# 12) Stack the three models via a simple Ridge on log-scale
oof_stack = np.vstack([
    np.log1p(np.clip(lgb_oof, 0, None)),
    np.log1p(np.clip(cb_oof,  0, None)),
    np.log1p(np.clip(xgb_oof, 0, None))
]).T
y_log = np.log1p(y_true_oof)

# Robust scaling helps the linear meta
scaler = RobustScaler()
X_meta = scaler.fit_transform(oof_stack)

# Train meta on full OOF (no leakage because bases are OOF)
meta = Ridge(alpha=0.1, random_state=42, fit_intercept=True)
meta.fit(X_meta, y_log)

# Build test stack & predict
test_stack = np.vstack([
    np.log1p(np.clip(lgb_test, 0, None)),
    np.log1p(np.clip(cb_test,  0, None)),
    np.log1p(np.clip(xgb_test, 0, None))
]).T
test_stack = scaler.transform(test_stack)
stack_pred = np.expm1(meta.predict(test_stack))

# Compare OOF RMSPEs
blend_simple = (lgb_oof + cb_oof + xgb_oof) / 3
print("OOF RMSPE — LGBM:", round(rmspe(y_true_oof, lgb_oof), 5))
print("OOF RMSPE — CatB:", round(rmspe(y_true_oof, cb_oof), 5))
print("OOF RMSPE — XGB :", round(rmspe(y_true_oof, xgb_oof), 5))
print("OOF RMSPE — Mean:", round(rmspe(y_true_oof, blend_simple), 5))

stack_oof = np.expm1(meta.predict(X_meta))
print("OOF RMSPE — Stack:", round(rmspe(y_true_oof, stack_oof), 5))


OOF RMSPE — LGBM: 0.19519
OOF RMSPE — CatB: 0.28355
OOF RMSPE — XGB : 0.29457
OOF RMSPE — Mean: 0.24792
OOF RMSPE — Stack: 0.19303


In [14]:
# 13) Choose best among (stack_pred) vs (weighted blend)
# quick weight search on OOF for the three bases
best = (None, 9e9)
for a in np.linspace(0.0, 1.0, 6):
    for b in np.linspace(0.0, 1.0 - a, 6):
        c = 1.0 - a - b
        oof = a*lgb_oof + b*cb_oof + c*xgb_oof
        s = rmspe(y_true_oof, oof)
        if s < best[1]:
            best = ((a,b,c), s)
print("Best 3-model weights:", best)

# produce the weighted test prediction
w = best[0]
blend_test = w[0]*lgb_test + w[1]*cb_test + w[2]*xgb_test

# pick final by comparing OOF of stack vs best weights
final_oof_score = min(rmspe(y_true_oof, stack_oof), best[1])
use_stack = (rmspe(y_true_oof, stack_oof) <= best[1])
final_test_pred = stack_pred if use_stack else blend_test
print("Using:", "STACK" if use_stack else "WEIGHTED BLEND", "| OOF RMSPE:", round(final_oof_score, 5))


Best 3-model weights: ((1.0, 0.0, 0.0), 0.19519395812662702)
Using: STACK | OOF RMSPE: 0.19303


In [15]:
# 14) Build submission with correct header & row count
pred = np.asarray(final_test_pred, float)

N_EXPECTED = 2790  # evaluator’s expected count
if len(pred) > N_EXPECTED:
    print(f"[WARN] test has {len(pred)} rows; trimming to {N_EXPECTED}.")
    pred = pred[:N_EXPECTED]
elif len(pred) < N_EXPECTED:
    raise ValueError(f"Predictions shorter ({len(pred)}) than {N_EXPECTED}.")

# determine IDs
id_col = next((c for c in ['ID','id','Id','row_id','RowId','rowID'] if c in test_raw.columns), None)
ids = test_raw[id_col].values if id_col else np.arange(1, N_EXPECTED+1)

sub = pd.DataFrame({'ID': ids[:N_EXPECTED], 'salary_average': pred})
assert sub.shape == (N_EXPECTED, 2) and list(sub.columns) == ['ID','salary_average']
sub.to_csv('submission.csv', index=False)
sub.head()


[WARN] test has 2799 rows; trimming to 2790.


Unnamed: 0,ID,salary_average
0,1,85802.521617
1,2,90334.337564
2,3,94246.569569
3,4,69606.214225
4,5,58475.582046


In [16]:
# 15) Cleanup
gc.collect();