In [1]:
# 1) Imports & file listing
import os, gc, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
np.random.seed(42)

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:20]:
        print(os.path.join(dirname, filename))

/kaggle/input/eee-g513/train.csv
/kaggle/input/eee-g513/test.csv
/kaggle/input/eee-g513/cost_of_living.csv


In [2]:
# 2) Load data
BASE = '/kaggle/input/eee-g513'
cands = []
for p, _, f in os.walk(BASE):
    fset = {x.lower() for x in f}
    if {'train.csv','test.csv'}.issubset(fset):
        cands.append(p)
DATA_PATH = cands[0] if cands else BASE

train = pd.read_csv(f'{DATA_PATH}/train.csv')
test  = pd.read_csv(f'{DATA_PATH}/test.csv')

# Optional COL
colv_path = os.path.join(DATA_PATH, 'cost_of_living.csv')
colv = pd.read_csv(colv_path) if os.path.exists(colv_path) else None

print(train.shape, test.shape, None if colv is None else colv.shape)
train.head(2)

(6525, 6) (2799, 5) (1528, 56)


Unnamed: 0,ID,country,state,city,role,salary_average
0,1,United States Of America,Colorado,Thornton,accountant,95110.0
1,2,United States Of America,Colorado,Thornton,automation-analyst,104766.0


In [3]:
# 3) Merge cost_of_living if available
if colv is not None:
    if 'city_id' in train.columns and 'city_id' in colv.columns:
        keys = ['city_id']
    else:
        keys = ['city','country']
    colv_uniq = colv.drop_duplicates(subset=keys).copy()
    train = train.merge(colv_uniq, on=keys, how='left', suffixes=('','_col'))
    test  = test.merge(colv_uniq,  on=keys, how='left', suffixes=('','_col'))

# Drop duplicate-named columns created by merges
if train.columns.duplicated().any():
    train = train.loc[:, ~train.columns.duplicated()]
if test.columns.duplicated().any():
    test = test.loc[:, ~test.columns.duplicated()]

In [4]:
# 4) Target & ID
target = 'salary_average'

# Detect/standardize ID for submission
possible_ids = ['ID','id','Id','row_id','RowId','rowID']
id_col = next((c for c in possible_ids if c in test.columns), None)
if id_col is None:
    # create an ID column from index to be safe
    test = test.reset_index(drop=False).rename(columns={'index':'ID'})
    id_col = 'ID'
else:
    # ensure it's named 'ID' in submission later
    pass

# Drop invalid targets; then align columns
before = len(train)
train = train[train[target].notna() & (train[target] > 0)].reset_index(drop=True)
print(f'Dropped {before - len(train)} rows with invalid targets.')

# Keep common features between train/test (exclude target and the detected id_col if present in train)
exclude = {target}
if id_col in train.columns:
    exclude.add(id_col)
common_feats = [c for c in train.columns if c in test.columns and c not in exclude]

train = pd.concat([train[common_feats], train[[target]]], axis=1)
test  = test[common_feats].copy()

Dropped 45 rows with invalid targets.


In [5]:
# 5) Dtypes & numeric cleanup
feat_cols = common_feats[:]

# Convert numeric-like objects to numeric
for c in feat_cols:
    if train[c].dtype == 'object':
        tr_num = pd.to_numeric(train[c], errors='coerce')
        te_num = pd.to_numeric(test[c],  errors='coerce')
        if tr_num.notna().mean() > 0.95 and te_num.notna().mean() > 0.95:
            train[c] = tr_num
            test[c]  = te_num

num_cols = [c for c in feat_cols if pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in feat_cols if c not in num_cols]

# Winsorize (clip 1/99%) to reduce outlier impact
for c in num_cols:
    q1, q99 = train[c].quantile([0.01, 0.99])
    train[c] = train[c].clip(q1, q99)
    test[c]  = test[c].clip(q1, q99)

# Log-transform skewed positive numeric features
skewed = []
for c in num_cols:
    if (train[c] > 0).all():
        if train[c].skew() > 1.0:
            train[c] = np.log1p(train[c])
            test[c]  = np.log1p(test[c])
            skewed.append(c)
print('Log-transformed numeric features:', len(skewed))

# Impute numeric with train median
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c]  = test[c].fillna(med)

# Categorical casting (for LGBM); CatBoost will use strings later
for c in cat_cols:
    train[c] = train[c].astype('category')
    test[c]  = test[c].astype('category')


Log-transformed numeric features: 0


In [6]:
# 6) GroupKFold by city_id (or city), fallback to country
from sklearn.model_selection import GroupKFold

group_key = 'city_id' if 'city_id' in train.columns else ('city' if 'city' in train.columns else ('country' if 'country' in train.columns else None))
groups = train[group_key] if group_key is not None else pd.Series(['all']*len(train))
gkf = GroupKFold(n_splits=5)

def rmspe(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    mask = (y_true > eps) & np.isfinite(y_true) & np.isfinite(y_pred)
    return float(np.sqrt(np.mean(((y_true[mask]-y_pred[mask])/y_true[mask])**2)))


In [7]:
# 7) Target encodings without leakage
TE_COLS = ['te_country','te_role','te_country_role']

def add_te(X_tr, y_tr, X_tgt):
    Xt = X_tgt.copy()
    gmean = float(y_tr.mean())

    if 'country' in X_tr.columns and 'country' in Xt.columns:
        df = pd.DataFrame({'key': X_tr['country'].astype(str), 'y': y_tr.values})
        m = df.groupby('key')['y'].mean()
        Xt['te_country'] = Xt['country'].astype(str).map(m).astype('float64').fillna(gmean)

    if 'role' in X_tr.columns and 'role' in Xt.columns:
        df = pd.DataFrame({'key': X_tr['role'].astype(str), 'y': y_tr.values})
        m = df.groupby('key')['y'].mean()
        Xt['te_role'] = Xt['role'].astype(str).map(m).astype('float64').fillna(gmean)

    if {'country','role'}.issubset(X_tr.columns) and {'country','role'}.issubset(Xt.columns):
        df = pd.DataFrame({'country': X_tr['country'].astype(str), 'role': X_tr['role'].astype(str), 'y': y_tr.values})
        m = df.groupby(['country','role'])['y'].mean().to_dict()
        cr_keys = list(zip(Xt['country'].astype(str), Xt['role'].astype(str)))
        Xt['te_country_role'] = pd.Series([m.get(k, np.nan) for k in cr_keys], index=Xt.index).astype('float64').fillna(gmean)

    return Xt

y_true_oof = np.zeros(len(train))  # ground truth aligned to OOF slots


In [8]:
# 8) LightGBM (log-target)
import lightgbm as lgb

lgb_oof = np.zeros(len(train))
lgb_test = np.zeros(len(test))

lgb_params = {
    'objective': 'rmse',
    'metric': 'rmse',
    'learning_rate': 0.035,
    'num_leaves': 31,
    'min_data_in_leaf': 100,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 1.0,
    'lambda_l2': 3.0,
    'max_depth': -1,
    'verbosity': -1,
    'seed': 42
}

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy().drop(columns=TE_COLS, errors='ignore')
    y_tr = train.iloc[tr_idx][target].copy()
    X_va = train.iloc[va_idx][feat_cols].copy().drop(columns=TE_COLS, errors='ignore')
    y_va = train.iloc[va_idx][target].copy()

    # add TEs from training split only
    X_tr = add_te(X_tr, y_tr, X_tr)
    X_va = add_te(train.iloc[tr_idx][feat_cols], y_tr, X_va)
    X_te = add_te(train.iloc[tr_idx][feat_cols], y_tr, test[feat_cols].copy())

    cat_in_use = [c for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]
    dtr = lgb.Dataset(X_tr, label=np.log1p(y_tr), categorical_feature=cat_in_use, free_raw_data=True)
    dva = lgb.Dataset(X_va, label=np.log1p(y_va), categorical_feature=cat_in_use, free_raw_data=True)

    model = lgb.train(
        params=lgb_params,
        train_set=dtr,
        valid_sets=[dtr, dva],
        num_boost_round=5000,
        callbacks=[lgb.early_stopping(300, verbose=False), lgb.log_evaluation(200)]
    )

    va_pred = np.expm1(model.predict(X_va, num_iteration=model.best_iteration))
    lgb_oof[va_idx] = va_pred
    print(f'[LGBM] Fold {fold} RMSPE:', round(rmspe(y_va, va_pred), 4))

    lgb_test += np.expm1(model.predict(X_te, num_iteration=model.best_iteration)) / gkf.get_n_splits()

print('[LGBM] OOF RMSPE:', round(rmspe(train[target], lgb_oof), 4))


[200]	training's rmse: 0.0469595	valid_1's rmse: 0.0864487
[400]	training's rmse: 0.0338952	valid_1's rmse: 0.0877699
[LGBM] Fold 1 RMSPE: 0.0807
[200]	training's rmse: 0.0517905	valid_1's rmse: 0.109654
[400]	training's rmse: 0.0433411	valid_1's rmse: 0.107644
[600]	training's rmse: 0.0404475	valid_1's rmse: 0.106634
[800]	training's rmse: 0.0389424	valid_1's rmse: 0.105973
[1000]	training's rmse: 0.0380045	valid_1's rmse: 0.105511
[1200]	training's rmse: 0.0373178	valid_1's rmse: 0.105194
[1400]	training's rmse: 0.0368229	valid_1's rmse: 0.104947
[1600]	training's rmse: 0.0364291	valid_1's rmse: 0.104765
[1800]	training's rmse: 0.0360908	valid_1's rmse: 0.104655
[2000]	training's rmse: 0.0358222	valid_1's rmse: 0.104572
[2200]	training's rmse: 0.0355958	valid_1's rmse: 0.104498
[2400]	training's rmse: 0.0354025	valid_1's rmse: 0.104416
[2600]	training's rmse: 0.0352206	valid_1's rmse: 0.104387
[2800]	training's rmse: 0.0350581	valid_1's rmse: 0.104318
[3000]	training's rmse: 0.034923

In [9]:
# 9) CatBoost (log-target)
from catboost import CatBoostRegressor, Pool

cb_oof = np.zeros(len(train))
cb_test = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, groups=groups), 1):
    X_tr = train.iloc[tr_idx][feat_cols].copy().drop(columns=TE_COLS, errors='ignore')
    y_tr = train.iloc[tr_idx][target].copy()
    X_va = train.iloc[va_idx][feat_cols].copy().drop(columns=TE_COLS, errors='ignore')
    y_va = train.iloc[va_idx][target].copy()

    # add TEs from training split only
    X_tr = add_te(X_tr, y_tr, X_tr)
    X_va = add_te(train.iloc[tr_idx][feat_cols], y_tr, X_va)
    X_te = add_te(train.iloc[tr_idx][feat_cols], y_tr, test[feat_cols].copy())

    # Ensure non-numerics are strings with no NaNs
    for df in (X_tr, X_va, X_te):
        for c in df.columns:
            if not pd.api.types.is_numeric_dtype(df[c]):
                df[c] = df[c].astype('object').where(df[c].notna(), 'Unknown').astype(str)

    cat_idx = [X_tr.columns.get_loc(c) for c in X_tr.columns if not pd.api.types.is_numeric_dtype(X_tr[c])]

    tr_pool = Pool(X_tr, np.log1p(y_tr), cat_features=cat_idx)
    va_pool = Pool(X_va, np.log1p(y_va), cat_features=cat_idx)
    te_pool = Pool(X_te, cat_features=cat_idx)

    model = CatBoostRegressor(
        loss_function='RMSE',
        learning_rate=0.04,
        depth=8,
        l2_leaf_reg=6.0,
        iterations=10000,
        random_seed=42,
        early_stopping_rounds=400,
        verbose=False
    )
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = np.expm1(model.predict(va_pool))
    cb_oof[va_idx] = va_pred
    print(f'[CatBoost] Fold {fold} RMSPE:', round(rmspe(y_va, va_pred), 4))

    cb_test += np.expm1(model.predict(te_pool)) / gkf.get_n_splits()

print('[CatBoost] OOF RMSPE:', round(rmspe(train[target], cb_oof), 4))


[CatBoost] Fold 1 RMSPE: 0.0856
[CatBoost] Fold 2 RMSPE: 0.1202
[CatBoost] Fold 3 RMSPE: 0.8926
[CatBoost] Fold 4 RMSPE: 0.4278
[CatBoost] Fold 5 RMSPE: 0.2546
[CatBoost] OOF RMSPE: 0.4376


In [10]:
# 10) Blend and make submission (IDs 1..2790, WITH header)

# --- pick best blend weight on OOF (robust) ---
best_w, best_score = 0.5, float('inf')
y_true = y_true_oof if 'y_true_oof' in globals() else train[target].values[:min(len(lgb_oof), len(cb_oof))]
for w in np.linspace(0.2, 0.8, 13):
    oof = w*np.asarray(lgb_oof, float) + (1-w)*np.asarray(cb_oof, float)
    m = min(len(y_true), len(oof))
    s = rmspe(y_true[:m], oof[:m])
    if np.isfinite(s) and s < best_score:
        best_w, best_score = float(w), float(s)
print(f"[Blend] best w={best_w:.3f} | OOF RMSPE={best_score:.4f}")

# --- blended test predictions ---
test_pred = best_w*np.asarray(lgb_test, float) + (1-best_w)*np.asarray(cb_test, float)

# --- force evaluator row count = 2790 ---
N_EXPECTED = 2790
if len(test_pred) > N_EXPECTED:
    print(f"[WARN] test has {len(test_pred)} rows; trimming to {N_EXPECTED}.")
    test_pred = test_pred[:N_EXPECTED]
elif len(test_pred) < N_EXPECTED:
    raise ValueError(f"Predictions shorter ({len(test_pred)}) than {N_EXPECTED}.")

# --- IDs 1..2790 and proper header ---
ids = np.arange(1, N_EXPECTED + 1, dtype=int)
sub = pd.DataFrame({'ID': ids, 'salary_average': test_pred})

# hard checks
assert sub.shape == (N_EXPECTED, 2)
assert list(sub.columns) == ['ID', 'salary_average']

# SAVE WITH HEADER
sub.to_csv('submission.csv', index=False)
print("Saved submission.csv:", sub.shape)
sub.head()


[Blend] best w=0.500 | OOF RMSPE=inf
[WARN] test has 2799 rows; trimming to 2790.
Saved submission.csv: (2790, 2)


Unnamed: 0,ID,salary_average
0,1,86926.285934
1,2,91372.097738
2,3,93880.579379
3,4,70838.435748
4,5,59868.766719


In [11]:
# Re-read pristine test to get the authoritative row count & IDs
test_raw = pd.read_csv(f'{DATA_PATH}/test.csv')
print('Expected test rows:', len(test_raw))   # should match the error message (e.g., 2790)

# What did we submit?
print('Current test rows used:', len(test))   # from your current notebook state


Expected test rows: 2799
Current test rows used: 2799


In [12]:
!ls /kaggle/input

eee-g513
