In [27]:
import os, re, json, argparse, warnings, random
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional
from collections import Counter

import numpy as np
import pandas as pd

from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MaxAbsScaler, normalize
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_absolute_error


In [28]:

# Optional dependencies: each HAS_* flag starts True and is flipped to False
# when the corresponding import fails, so later code can skip that base model.
HAS_LGBM = True
try:
    import lightgbm as lgb
except Exception:
    # Unlike the transformers guard below, a missing lightgbm emits no warning.
    HAS_LGBM = False

HAS_TRANSFORMERS = True
try:
    import torch
    from transformers import AutoTokenizer, AutoModel
except Exception:
    HAS_TRANSFORMERS = False
    warnings.warn("transformers/torch not found, BERT-Embedding base will be skipped.")

# Select the "Agg" backend before pyplot is imported.
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt



In [29]:
# Global seed shared by every model and splitter in this notebook.
SEED = 42
# Seed Python's and NumPy's global RNGs.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes - confirm.
random.seed(SEED); np.random.seed(SEED); os.environ["PYTHONHASHSEED"] = str(SEED)

# Column names: row identifier and the two regression targets.
ID_COL   = "ID"
TARGET_B = "Estimate Bottom"
TARGET_U = "Estimate Up"
# Columns concatenated into one text document per row (see join_text_fields).
TEXT_COLS = [
    "Current Position","Targeted Position","Candidate Level","Domisili",
    "Education 1","Education 2","Education 3","Notice Period","Tech Stack","Certification"
]


In [30]:
# ----------------- utilities & domain features -----------------
def safe_str(x) -> str:
    """Return ``x`` as a trimmed string with whitespace runs collapsed.

    Missing values (anything ``pd.isna`` reports as missing) become ``""``.
    """
    if pd.isna(x):
        return ""
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with one space collapses them.
    return " ".join(str(x).split())

def parse_notice_days(s) -> float:
    """Convert a free-text notice period into a number of days.

    Parameters
    ----------
    s : value from the "Notice Period" column (string, number or missing).

    Returns
    -------
    float
        ``0.0`` for the known "join immediately" phrases, ``value * unit`` when
        a number is found (unit: day=1, week=7, month=30; default 1), and
        ``nan`` for missing input or text without any number.
    """
    if pd.isna(s):
        return np.nan
    text = str(s).lower().strip()
    if text in {"asap", "join asap", "can join asap", "immediate", "immediately"}:
        return 0.0

    m = re.search(r"(\d+(?:[.,]\d+)?)", text)
    if m is None:
        return np.nan
    value = float(m.group(1).replace(",", "."))  # accept "1,5" as 1.5

    def has_abbrev(pattern: str) -> bool:
        # Short abbreviations must stand alone (not be embedded in a longer
        # word); otherwise "mo" fires on "remote"/"tomorrow" and "wk" on
        # "awkward". A preceding digit ("2mo") is still accepted.
        return re.search(r"(?<![a-z])(?:%s)(?![a-z])" % pattern, text) is not None

    # Same precedence as before: day, then week, then month.
    if "day" in text or "hari" in text:
        mult = 1.0
    elif "week" in text or "minggu" in text or has_abbrev("wks?"):
        mult = 7.0
    elif "month" in text or "bulan" in text or "bln" in text or has_abbrev("mos?"):
        mult = 30.0
    else:
        mult = 1.0
    return value * mult

def candidate_level_ord(s: str) -> int:
    """Map a free-text candidate level to an ordinal rank.

    Returns 0 (intern/fresher) through 4 (lead and above), or -1 when the
    text is empty or matches none of the patterns. Patterns are tried in
    ascending rank order and the first hit wins.
    """
    label = safe_str(s).lower()
    if not label:
        return -1
    ladder = (
        (r"\b(intern|trainee|fresher|fresh)\b", 0),
        (r"\b(junior|jr\.?)\b", 1),
        (r"\b(mid|middle|mid[-\s]?level)\b", 2),
        (r"\b(senior|sr\.?)\b", 3),
        (r"\b(lead|principal|staff|manager|head|director)\b", 4),
    )
    for pattern, rank in ladder:
        if re.search(pattern, label):
            return rank
    return -1

def edu_ord_from_text(s: str) -> int:
    """Map a free-text education entry to an ordinal degree level.

    Returns 4 (doctorate) down to 0 (high school), or -1 when the text is
    empty or unrecognised. Higher degrees are tested first, so an entry that
    mentions several degrees yields the highest one.
    """
    entry = safe_str(s).lower()
    if not entry:
        return -1
    degree_patterns = (
        (4, r"\b(s3|phd|doctor|doktor)\b"),
        (3, r"\b(s2|master|magister|m\.?sc|m\.?eng|m\.?kom|m\.?ba)\b"),
        (2, r"\b(s1|sarjana|bachelor|b\.?sc|b\.?eng|b\.?kom)\b"),
        (1, r"\b(d3|diploma)\b"),
        (0, r"\b(sma|smk|high\s?school)\b"),
    )
    return next((level for level, pattern in degree_patterns if re.search(pattern, entry)), -1)

def highest_edu_ord(row) -> int:
    """Return the highest education level found across the three education
    columns of ``row``, or -1 when none of them is recognised."""
    best = -1
    for column in ("Education 1", "Education 2", "Education 3"):
        level = edu_ord_from_text(row.get(column, ""))
        if level > best:
            best = level
    return best

# Skill separators: ; , / | & runs, the word "and", and a single "+".
# A "+" that is part of "++" is NOT a separator, so "c++" survives as one
# token (previously every "+" split, turning "c++" into "c").
DELIM = re.compile(r"[;,/|&]+|(?<!\+)\+(?!\+)|\band\b", re.I)
# Characters that are not word characters, "#", "+" or "." are treated as
# noise inside a token (keeps "c#", "c++", "node.js").
ALNUM = re.compile(r"[^\w#+\.]+")
def tokenize_skills(s: str) -> List[str]:
    """Split a free-text tech-stack string into normalised skill tokens.

    The text is lower-cased, split on ``DELIM``, stripped of noise characters
    and of numbers (e.g. version numbers), and whitespace-collapsed. Empty
    input, ``"none"`` and ``"-"`` yield ``[]``. ``html`` / ``css`` /
    ``html css`` are all mapped to the single token ``"html/css"``.
    """
    s = safe_str(s).lower()
    if not s or s in {"none", "-"}:
        return []
    tokens = []
    for part in DELIM.split(s):
        part = ALNUM.sub(" ", part).strip()
        if not part:
            continue
        part = re.sub(r"\d+(\.\d+)?", "", part)  # drop version numbers
        part = re.sub(r"\s+", " ", part).strip()
        if part:
            tokens.append(part)
    return ["html/css" if t in {"html", "css", "html css"} else t for t in tokens]

def top_k_skills(series: pd.Series, k=50) -> List[str]:
    """Return the ``k`` most frequent skill tokens in ``series``.

    Each cell is tokenised with ``tokenize_skills``; a fixed list of filler
    words (proficiency labels, "and", "none") is excluded from the ranking.
    """
    counts = Counter(
        token
        for cell in series.fillna("")
        for token in tokenize_skills(cell)
    )
    ignored = ("and", "basic", "expert", "beginner", "intermediate", "none")
    for word in ignored:
        if word in counts:
            del counts[word]
    return [token for token, _freq in counts.most_common(k)]

def normalize_city(s: str) -> str:
    """Reduce a domicile string to a bare lower-case city name.

    Keeps only the part before the first comma, removes the administrative
    words ``kota``, ``kab``/``kab.``, ``provinsi``, ``kecamatan`` and ``dki``,
    replaces anything other than a-z, whitespace and ``-`` with a space, and
    collapses whitespace. Note that the spelled-out word ``kabupaten`` is not
    in the removal list and is therefore kept. Empty input gives ``""``.
    """
    city = safe_str(s).lower()
    if not city:
        return ""
    city = city.partition(",")[0]
    substitutions = (
        (r"\b(kota|kab\.?|provinsi|kecamatan|dki)\b", ""),
        (r"[^a-z\s\-]", " "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        city = re.sub(pattern, replacement, city)
    return city.strip()

def top_k_cities(series: pd.Series, k=20) -> List[str]:
    """Return the ``k`` most frequent normalised city names in ``series``;
    rows that normalise to an empty string are not counted."""
    counts = Counter()
    for raw in series.fillna(""):
        city = normalize_city(raw)
        if city != "":
            counts[city] += 1
    return [city for city, _freq in counts.most_common(k)]

def position_similarity(cur: pd.Series, tgt: pd.Series) -> np.ndarray:
    """Row-wise cosine similarity between current and targeted position text.

    A word 1-2-gram TF-IDF vocabulary is fitted on ``cur`` only and then
    applied to ``tgt``; the similarity of row *i* is the dot product of the
    two L2-normalised vectors.

    Returns
    -------
    np.ndarray
        1-D float array with one value per row. All zeros when ``cur``
        contains no usable token (e.g. the column is missing or blank), which
        previously raised ``ValueError: empty vocabulary``.
    """
    cur_text = cur.map(safe_str)
    tgt_text = tgt.map(safe_str)
    vec = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=1, lowercase=True)
    try:
        A = vec.fit_transform(cur_text)
    except ValueError:
        # Nothing to build a vocabulary from -> no measurable overlap.
        return np.zeros(len(cur_text), dtype=float)
    B = vec.transform(tgt_text)
    A = normalize(A); B = normalize(B)
    # np.asarray(...).ravel() works for both np.matrix and ndarray results,
    # whereas `.A1` exists only on np.matrix.
    return np.asarray(A.multiply(B).sum(axis=1), dtype=float).ravel()

def build_numeric_features(df: pd.DataFrame, ctx: Dict) -> pd.DataFrame:
    """Build the hand-crafted numeric/domain feature frame for ``df``.

    Parameters
    ----------
    df : input rows; every source column is optional.
    ctx : context dict (may be ``None``/empty). Keys read here:
        ``top_cities`` (iterable of normalised city names -> one indicator
        column each), ``high_umr_set`` (set of normalised city names) and
        ``top_skills`` (container of skill tokens).

    Returns
    -------
    pd.DataFrame
        Numeric frame indexed like ``df``. Remaining NaNs are filled with the
        column median of *this* frame (0 when the whole column is NaN).

    Notes
    -----
    * Fallbacks for missing columns are built on ``df.index``. The previous
      RangeIndex fallbacks misaligned whenever ``df`` was a fold subset.
    * The median is computed per call, so train and validation/test frames
      are imputed with their own medians.
    """
    ctx = ctx or {}

    def col_or_blank(name: str) -> pd.Series:
        # Raw column when present, otherwise blank strings aligned to df.
        if name in df.columns:
            return df[name]
        return pd.Series("", index=df.index, dtype=object)

    num = pd.DataFrame(index=df.index)

    # --- experience -------------------------------------------------------
    num["total_work_exp"] = pd.to_numeric(df.get("Total Working Experience", 0), errors="coerce")
    num["log1p_total_work_exp"] = np.log1p(np.clip(num["total_work_exp"], 0, None))
    num["exp_sq"] = np.square(num["total_work_exp"])

    # --- expected benefit range (column name spelled "Botom" in the data) --
    if "Expected Benefit Botom" in df.columns:
        num["expected_bottom"] = pd.to_numeric(df["Expected Benefit Botom"], errors="coerce")
    if "Expected Benefit Up" in df.columns:
        num["expected_up"] = pd.to_numeric(df["Expected Benefit Up"], errors="coerce")
    num["expected_mid"] = (num.get("expected_bottom",0)+num.get("expected_up",0))/2.0
    num["expected_width"] = num.get("expected_up",0)-num.get("expected_bottom",0)
    with np.errstate(divide="ignore", invalid="ignore"):
        # A zero midpoint is treated as "no information" -> relative width 0.
        num["expected_rel_width"] = (num["expected_width"]/num["expected_mid"].replace(0,np.nan)).fillna(0)

    if "Notice Period" in df.columns:
        num["notice_days"] = df["Notice Period"].map(parse_notice_days)

    # --- text length / item counts ---------------------------------------
    for col in ["Tech Stack","Certification","Current Position","Targeted Position"]:
        text = col_or_blank(col).map(safe_str)
        key = col.lower().replace(" ","_")
        num[f"{key}_len"] = text.map(len)
        if col == "Tech Stack":
            num[f"{key}_items"] = text.map(lambda x: len(tokenize_skills(x)))
        elif col == "Certification":
            num[f"{key}_items"] = text.map(
                lambda x: 0 if x=="" else len([t for t in re.split(r"[;,/|]+", x) if t.strip()])
            )

    # --- ordinal encodings -----------------------------------------------
    num["candidate_level_ord"] = col_or_blank("Candidate Level").map(candidate_level_ord)
    num["edu_ord"] = df.apply(highest_edu_ord, axis=1)

    text_combo = (
        col_or_blank("Current Position").map(safe_str) + " | " +
        col_or_blank("Targeted Position").map(safe_str) + " | " +
        col_or_blank("Candidate Level").map(safe_str)
    ).str.lower()
    num["is_managerial"] = text_combo.str.contains(
        r"\b(?:head|lead|manager|director)\b", regex=True, na=False
    ).astype(int)

    # --- location ---------------------------------------------------------
    city = col_or_blank("Domisili").map(normalize_city)
    if "top_cities" in ctx:
        for c in ctx["top_cities"]:
            num[f"city_{c}"] = (city == c).astype(int)
    num["is_high_umr_city"] = city.isin(ctx.get("high_umr_set", set())).astype(int)

    if "Targeted Position" in df.columns:
        num["has_targeted_position"] = df["Targeted Position"].notna().astype(int)
    else:
        num["has_targeted_position"] = 0

    # --- skills -----------------------------------------------------------
    if "top_skills" in ctx:
        skills = col_or_blank("Tech Stack").map(tokenize_skills)
        top_set = ctx["top_skills"]
        num["has_top50_skill"] = skills.map(lambda lst: int(any(t in top_set for t in lst)))
    else:
        num["has_top50_skill"] = 0

    # --- final numeric coercion + imputation ------------------------------
    num = num.apply(pd.to_numeric, errors="coerce")
    for c in num.columns:
        if num[c].isna().any():
            med = num[c].median()
            # An all-NaN column has a NaN median; fall back to 0 so that no
            # NaN reaches the estimators.
            num[c] = num[c].fillna(0.0 if pd.isna(med) else med)
    return num

def join_text_fields(df: pd.DataFrame, cols: List[str]) -> pd.Series:
    """Concatenate the cleaned text of ``cols`` into one string per row.

    Fields are joined with ``" | "`` in the order given. A column that is
    absent from ``df`` contributes an empty field. The result is indexed like
    ``df``; an empty ``cols`` yields a Series of empty strings.
    """
    def cleaned(name: str) -> pd.Series:
        if name in df.columns:
            return df[name].map(safe_str)
        # The fallback must share df's index: Series `+` aligns on index, so
        # a RangeIndex fallback would yield NaN / extra rows for fold subsets.
        return pd.Series("", index=df.index, dtype=object)

    if not cols:
        return pd.Series("", index=df.index, dtype=object)
    pieces = [cleaned(c) for c in cols]
    out = pieces[0]
    for piece in pieces[1:]:
        out = out + " | " + piece
    return out

def position_similarity_vec(df: pd.DataFrame) -> np.ndarray:
    """Compute ``position_similarity`` between the "Current Position" and
    "Targeted Position" columns of ``df``; a missing column is replaced by
    empty strings."""
    blank = [""] * len(df)
    current = df.get("Current Position", pd.Series(blank))
    targeted = df.get("Targeted Position", pd.Series(blank))
    return position_similarity(current, targeted)

def compute_umr_multiplier(df: pd.DataFrame, ctx: dict) -> np.ndarray:
    """Per-row multiplicative boost for candidates living in high-UMR cities.

    Parameters
    ----------
    df : rows to score; reads "Domisili", "Total Working Experience" and
        "Candidate Level" when present.
    ctx : needs ``high_umr_set`` (normalised city names) and a positive
        ``umr_boost_pct``; otherwise every multiplier is 1.

    Returns
    -------
    np.ndarray
        Float array of length ``len(df)``. Rows outside the high-UMR set get
        1.0; rows inside get ``1 + base * exp_scale * lvl_scale`` where
        ``base`` is ``umr_boost_pct`` clamped to [0, 0.07], ``exp_scale``
        ramps linearly from 0 to 1 over the first 5 years of experience, and
        ``lvl_scale`` depends on the candidate-level ordinal.
    """
    n = len(df)
    mult = np.ones(n, dtype=float)
    if not ctx or "high_umr_set" not in ctx or ctx.get("umr_boost_pct",0) <= 0:
        return mult
    base = min(0.07, max(0.0, float(ctx["umr_boost_pct"])))

    def col_or_blank(name: str) -> pd.Series:
        if name in df.columns:
            return df[name]
        return pd.Series("", index=df.index, dtype=object)

    city = col_or_blank("Domisili").map(normalize_city)
    mask = city.isin(ctx["high_umr_set"]).values

    if "Total Working Experience" in df.columns:
        exp = (pd.to_numeric(df["Total Working Experience"], errors="coerce")
                 .fillna(0).clip(0, 30).values)
    else:
        # Previously a missing column made pd.to_numeric return a scalar and
        # `.fillna` raised AttributeError; treat it as zero experience.
        exp = np.zeros(n, dtype=float)
    exp_scale = np.minimum(1.0, exp / 5.0)

    lvl_scale_map = {-1:0.6, 0:0.5, 1:0.6, 2:1.0, 3:1.15, 4:1.25}
    lvl_txt = col_or_blank("Candidate Level").astype(str).tolist()
    # A plain comprehension (instead of np.vectorize) also handles n == 0.
    lvl_scale = np.array(
        [lvl_scale_map.get(candidate_level_ord(s), 1.0) for s in lvl_txt], dtype=float
    )

    boost = base * exp_scale * lvl_scale
    mult[mask] = 1.0 + boost[mask]
    return mult


In [31]:
# ----------------- text featurizer (TF-IDF/SVD) -----------------
@dataclass
class TextFeaturizer:
    """Bundle of the fitted text transformers produced by ``fit_text`` and
    consumed by ``transform_text``."""
    word: TfidfVectorizer  # word-level TF-IDF vectorizer
    ch: TfidfVectorizer  # character-level TF-IDF vectorizer
    svd: Optional[TruncatedSVD] = None  # SVD over [word, char] features; None when SVD is disabled

def fit_text(train_text: pd.Series, max_w=100_000, max_c=80_000, svd_dim: int = 0) -> Tuple[TextFeaturizer, Tuple]:
    """Fit word and character TF-IDF vectorizers (and optionally an SVD).

    Parameters
    ----------
    train_text : one document per row.
    max_w, max_c : ``max_features`` for the word / character vectorizer.
    svd_dim : number of SVD components over the stacked ``[word, char]``
        matrix; ``0`` (or any non-positive value) disables the SVD. The value
        is capped at the number of TF-IDF features, because ``TruncatedSVD``
        rejects ``n_components`` larger than the feature count (easy to hit
        with the default 500 on small folds).

    Returns
    -------
    (TextFeaturizer, (Xw, Xc, Xs))
        The fitted transformers and the training matrices; ``Xs`` is ``None``
        when the SVD is disabled.
    """
    tfw = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=2, max_features=max_w, lowercase=True)
    tfc = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), min_df=2, max_features=max_c, lowercase=True)
    Xw = tfw.fit_transform(train_text)
    Xc = tfc.fit_transform(train_text)
    if not (svd_dim and svd_dim > 0):
        return TextFeaturizer(tfw, tfc, None), (Xw, Xc, None)

    stacked = hstack([Xw, Xc])
    n_components = min(int(svd_dim), stacked.shape[1])
    if n_components < svd_dim:
        warnings.warn(
            f"svd_dim={svd_dim} exceeds the {stacked.shape[1]} available text features; "
            f"using {n_components} components instead."
        )
    svd = TruncatedSVD(n_components=n_components, random_state=SEED)
    Xs = svd.fit_transform(stacked)
    return TextFeaturizer(tfw, tfc, svd), (Xw, Xc, Xs)

def transform_text(tf: TextFeaturizer, text: pd.Series) -> Tuple:
    """Apply an already-fitted ``TextFeaturizer`` to ``text``.

    Returns ``(Xw, Xc, Xs)``: word TF-IDF, character TF-IDF and the SVD
    projection of their horizontal stack (``None`` when ``tf.svd`` is unset).
    """
    word_matrix = tf.word.transform(text)
    char_matrix = tf.ch.transform(text)
    if tf.svd is None:
        return word_matrix, char_matrix, None
    reduced = tf.svd.transform(hstack([word_matrix, char_matrix]))
    return word_matrix, char_matrix, reduced


In [32]:
# ----------------- CV split -----------------
def stratified_kfold_by_y(y_mid: np.ndarray, n_splits=5, seed=SEED):
    """Yield ``(train_idx, valid_idx)`` pairs stratified on a binned target.

    The continuous target is cut into between 2 and 10 quantile bins (about
    one bin per 20 rows); the bin labels are handed to a shuffled
    ``StratifiedKFold``. If ``pd.qcut`` fails, percentile ranks are binned
    instead.
    """
    values = np.asarray(y_mid).reshape(-1)
    n_rows = len(values)
    n_bins = min(10, max(2, n_rows // 20))
    as_series = pd.Series(values)
    try:
        codes = pd.qcut(as_series, q=n_bins, duplicates="drop", labels=False)
        strata = codes.fillna(0).astype(int).values
    except Exception:
        pct_rank = as_series.rank(method="average", pct=True)
        strata = np.floor(pct_rank * n_bins).clip(0, n_bins - 1).astype(int).values
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return splitter.split(np.arange(n_rows), strata)


In [33]:
# ----------------- results container -----------------
@dataclass
class Result:
    """Cross-validation outcome of one base model."""
    name: str  # display name of the base model
    oof: np.ndarray  # (n,2) out-of-fold predictions: column 0 = bottom, column 1 = up
    mae: float  # average of the bottom-MAE and up-MAE
    details: dict  # extra metrics/metadata (e.g. "mae_bottom", "mae_up")
    test_pred: Optional[np.ndarray] = None  # filled later; the CV functions leave it as None


In [34]:
# ----------------- base strategies -----------------
def build_all_features(df: pd.DataFrame, ctx: Dict, tf_word_char: Optional[TextFeaturizer]=None,
                       svd_dim: int = 0, fit_text_on_df: bool = False):
    """Build text and numeric features for ``df``.

    With ``fit_text_on_df=True`` the text transformers are fitted on ``df``
    (using ``svd_dim``); otherwise the supplied ``tf_word_char`` is applied.
    Returns ``(featurizer, Xw, Xc, Xs, Xn)`` where ``Xn`` is the numeric
    frame extended with a ``pos_sim`` column.
    """
    documents = join_text_fields(df, TEXT_COLS)
    if fit_text_on_df:
        featurizer, matrices = fit_text(documents, svd_dim=svd_dim)
    else:
        featurizer = tf_word_char
        matrices = transform_text(tf_word_char, documents)
    word_matrix, char_matrix, svd_matrix = matrices

    numeric = build_numeric_features(df, ctx).copy()
    numeric["pos_sim"] = position_similarity_vec(df)
    return featurizer, word_matrix, char_matrix, svd_matrix, numeric

def ridge_two_stage_cv(train_df: pd.DataFrame, y_bu: np.ndarray, ctx: Dict, folds=5) -> Result:
    """Cross-validate the Ridge base model on sparse TF-IDF + numeric features.

    "Two-stage" here means the model predicts the range midpoint and the range
    width separately, then reconstructs bottom = mid - width/2 and
    up = mid + width/2.

    Parameters: ``train_df`` training rows, ``y_bu`` (n, 2) array with columns
    [bottom, up], ``ctx`` feature context, ``folds`` number of CV folds.
    Returns a ``Result`` holding the out-of-fold predictions and MAEs.

    NOTE(review): inside ``fit_predict`` the choice between the raw-target and
    log-target model is made with the validation fold's true targets, so the
    out-of-fold predictions (and the MAE derived from them) are optimistic.
    """
    # Decompose the targets into midpoint and width.
    y_b = y_bu[:,0]; y_u = y_bu[:,1]; y_mid = (y_b+y_u)/2.0; y_w = (y_u-y_b)
    oof_mid = np.zeros(len(train_df)); oof_w = np.zeros(len(train_df))
    for tr,va in stratified_kfold_by_y(y_mid, n_splits=folds):
        tr_df = train_df.iloc[tr]; va_df = train_df.iloc[va]
        # Text transformers are fitted on the training fold only.
        tf, Xw_tr, Xc_tr, _, Xn_tr = build_all_features(tr_df, ctx, fit_text_on_df=True)
        _,  Xw_va, Xc_va, _, Xn_va = build_all_features(va_df, ctx, tf_word_char=tf)
        # Scale numeric features to a range comparable with the TF-IDF values.
        scaler = MaxAbsScaler()
        Xn_tr_s = scaler.fit_transform(Xn_tr.values.astype(float)); Xn_va_s = scaler.transform(Xn_va.values.astype(float))
        X_tr = hstack([csr_matrix(Xn_tr_s), Xw_tr, Xc_tr]).tocsr()
        X_va = hstack([csr_matrix(Xn_va_s), Xw_va, Xc_va]).tocsr()
        alphas=[0.1,0.3,0.5,1,1.5,2,3,5,7.5,10,15,25,50]
        def fit_predict(target, idx_tr, idx_va):
            # Model 1: raw target. Model 2: log1p target, mapped back with expm1
            # (log-scale prediction capped at 20 to avoid overflow).
            r1=RidgeCV(alphas=alphas).fit(X_tr, target[idx_tr]); p1=r1.predict(X_va)
            tlog=np.log1p(np.maximum(target,0)); r2=RidgeCV(alphas=alphas).fit(X_tr, tlog[idx_tr])
            p2=np.expm1(np.clip(r2.predict(X_va), None, 20))
            # Keeps whichever variant scores better on the validation labels (see NOTE above).
            return p2 if mean_absolute_error(target[idx_va], p2) < mean_absolute_error(target[idx_va], p1) else p1
        oof_mid[va] = fit_predict(y_mid, tr, va); oof_w[va] = np.clip(fit_predict(y_w, tr, va), 0, None)
    # Rebuild the non-negative [bottom, up] range from midpoint and width.
    pred_b = np.maximum(0, oof_mid - oof_w/2.0); pred_u = np.maximum(0, oof_mid + oof_w/2.0)
    # Safety net: enforce bottom <= up. With a non-negative width this mask is
    # expected to be empty.
    sw = pred_b > pred_u
    if sw.any():
        tb,tu = pred_b.copy(), pred_u.copy()
        pred_b[sw]=np.minimum(tb[sw],tu[sw]); pred_u[sw]=np.maximum(tb[sw],tu[sw])
    mb = mean_absolute_error(y_b, pred_b); mu = mean_absolute_error(y_u, pred_u)
    return Result("Ridge-2Stage(TFIDF+num+domain)", np.column_stack([pred_b,pred_u]), float((mb+mu)/2),
                  {"mae_bottom":float(mb),"mae_up":float(mu)})

def hgb_two_stage_cv(train_df: pd.DataFrame, y_bu: np.ndarray, ctx: Dict, folds=5, svd_dim=500) -> Result:
    """Cross-validate the HistGradientBoosting base model on dense
    SVD-text + numeric features.

    Midpoint and width of the range are predicted separately and recombined
    into [bottom, up]. ``svd_dim`` is the number of SVD components requested
    for the text features. Returns a ``Result`` with out-of-fold predictions.

    NOTE(review): ``fit_predict`` picks between the raw-target and log-target
    model using the validation fold's true targets, so the out-of-fold
    predictions and the reported MAE are optimistic.
    """
    # Decompose the targets into midpoint and width.
    y_b = y_bu[:,0]; y_u = y_bu[:,1]; y_mid = (y_b+y_u)/2.0; y_w = (y_u-y_b)
    oof_mid = np.zeros(len(train_df)); oof_w = np.zeros(len(train_df))
    for tr,va in stratified_kfold_by_y(y_mid, n_splits=folds):
        tr_df = train_df.iloc[tr]; va_df = train_df.iloc[va]
        # TF-IDF and SVD are fitted on the training fold only.
        tf, _, _, Xs_tr, Xn_tr = build_all_features(tr_df, ctx, fit_text_on_df=True, svd_dim=svd_dim)
        _,  _, _, Xs_va, Xn_va = build_all_features(va_df, ctx, tf_word_char=tf, svd_dim=svd_dim)
        X_tr = np.concatenate([Xs_tr, Xn_tr.values], axis=1); X_va = np.concatenate([Xs_va, Xn_va.values], axis=1)
        def fit_predict(target, idx_tr, idx_va):
            # m1 fits the raw target, m2 the log1p target (mapped back via expm1).
            m1=HistGradientBoostingRegressor(random_state=SEED).fit(X_tr, target[idx_tr]); p1=m1.predict(X_va)
            tlog=np.log1p(np.maximum(target,0)); m2=HistGradientBoostingRegressor(random_state=SEED).fit(X_tr, tlog[idx_tr])
            p2=np.expm1(np.clip(m2.predict(X_va), None, 20))
            # Selection uses validation labels (see NOTE above).
            return p2 if mean_absolute_error(target[idx_va], p2) < mean_absolute_error(target[idx_va], p1) else p1
        oof_mid[va]=fit_predict(y_mid,tr,va); oof_w[va]=np.clip(fit_predict(y_w,tr,va),0,None)
    # Rebuild the non-negative [bottom, up] range from midpoint and width.
    pred_b = np.maximum(0, oof_mid - oof_w/2.0); pred_u = np.maximum(0, oof_mid + oof_w/2.0)
    # Safety net: enforce bottom <= up (expected to be a no-op for width >= 0).
    sw = pred_b > pred_u
    if sw.any():
        tb,tu = pred_b.copy(), pred_u.copy()
        pred_b[sw]=np.minimum(tb[sw],tu[sw]); pred_u[sw]=np.maximum(tb[sw],tu[sw])
    mb = mean_absolute_error(y_b, pred_b); mu = mean_absolute_error(y_u, pred_u)
    return Result("HGB-2Stage(SVD+num+domain)", np.column_stack([pred_b,pred_u]), float((mb+mu)/2),
                  {"mae_bottom":float(mb),"mae_up":float(mu)})

def lgbm_two_stage_cv(train_df: pd.DataFrame, y_bu: np.ndarray, ctx: Dict, folds=5, svd_dim=500) -> Result:
    """Cross-validate the LightGBM base model on dense SVD-text + numeric
    features.

    Requires lightgbm (asserts ``HAS_LGBM``). Midpoint and width of the range
    are predicted separately and recombined into [bottom, up]. Returns a
    ``Result`` with out-of-fold predictions.

    NOTE(review): ``fit_predict`` picks between the raw-target and log-target
    model using the validation fold's true targets, so the out-of-fold
    predictions and the reported MAE are optimistic.
    """
    assert HAS_LGBM, "lightgbm not available"
    # Decompose the targets into midpoint and width.
    y_b = y_bu[:,0]; y_u = y_bu[:,1]; y_mid = (y_b+y_u)/2.0; y_w = (y_u-y_b)
    oof_mid = np.zeros(len(train_df)); oof_w = np.zeros(len(train_df))
    # Fixed hyper-parameters shared by all four models fitted per fold.
    params=dict(objective="regression", learning_rate=0.05, n_estimators=800,
                num_leaves=63, subsample=0.8, colsample_bytree=0.8,
                reg_alpha=0.1, reg_lambda=1.0, random_state=SEED, verbose=-1)
    for tr,va in stratified_kfold_by_y(y_mid, n_splits=folds):
        tr_df = train_df.iloc[tr]; va_df = train_df.iloc[va]
        # TF-IDF and SVD are fitted on the training fold only.
        tf, _, _, Xs_tr, Xn_tr = build_all_features(tr_df, ctx, fit_text_on_df=True, svd_dim=svd_dim)
        _,  _, _, Xs_va, Xn_va = build_all_features(va_df, ctx, tf_word_char=tf, svd_dim=svd_dim)
        X_tr = np.concatenate([Xs_tr, Xn_tr.values], axis=1); X_va = np.concatenate([Xs_va, Xn_va.values], axis=1)
        def fit_predict(target, idx_tr, idx_va):
            # m1 fits the raw target, m2 the log1p target (mapped back via expm1).
            m1=lgb.LGBMRegressor(**params).fit(X_tr, target[idx_tr]); p1=m1.predict(X_va)
            tlog=np.log1p(np.maximum(target,0)); m2=lgb.LGBMRegressor(**params).fit(X_tr, tlog[idx_tr]); p2=np.expm1(np.clip(m2.predict(X_va), None, 20))
            # Selection uses validation labels (see NOTE above).
            return p2 if mean_absolute_error(target[idx_va], p2) < mean_absolute_error(target[idx_va], p1) else p1
        oof_mid[va]=fit_predict(y_mid,tr,va); oof_w[va]=np.clip(fit_predict(y_w,tr,va),0,None)
    # Rebuild the non-negative [bottom, up] range from midpoint and width.
    pred_b = np.maximum(0, oof_mid - oof_w/2.0); pred_u = np.maximum(0, oof_mid + oof_w/2.0)
    # Safety net: enforce bottom <= up (expected to be a no-op for width >= 0).
    sw = pred_b > pred_u
    if sw.any():
        tb,tu = pred_b.copy(), pred_u.copy()
        pred_b[sw]=np.minimum(tb[sw],tu[sw]); pred_u[sw]=np.maximum(tb[sw],tu[sw])
    mb = mean_absolute_error(y_b, pred_b); mu = mean_absolute_error(y_u, pred_u)
    return Result("LGBM-2Stage(SVD+num+domain)", np.column_stack([pred_b,pred_u]), float((mb+mu)/2),
                  {"mae_bottom":float(mb),"mae_up":float(mu)})

In [35]:
# ---- NEW: BERT-Embedding + Ridge two-stage (no AutoML) ----
def bert_mean_pool(last_hidden_state, attention_mask):
    """Average token vectors along dimension 1, weighting each token by its
    attention-mask value (so masked-out positions do not contribute).

    The token count is clamped to a tiny positive value so that a fully
    masked row yields zeros instead of a division by zero.
    """
    weights = attention_mask[..., None].expand_as(last_hidden_state).float()
    masked_total = last_hidden_state.mul(weights).sum(dim=1)
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total.div(token_count)

def embed_texts(texts: List[str], model_name: str, max_len=160, batch=32, device=None):
    """Encode ``texts`` with a pretrained transformer and mean-pool the last
    hidden state into one vector per text.

    Parameters
    ----------
    texts : documents to embed.
    model_name : identifier passed to ``AutoTokenizer`` / ``AutoModel``.
    max_len : truncation length in tokens.
    batch : number of texts per forward pass.
    device : torch device string; defaults to "cuda" when available, else "cpu".

    Returns
    -------
    np.ndarray
        The per-batch pooled outputs stacked vertically (one row per text).
    """
    assert HAS_TRANSFORMERS, "transformers not installed"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encoder = AutoModel.from_pretrained(model_name)
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    encoder.to(device)
    encoder.eval()

    pooled_batches = []
    with torch.no_grad():  # inference only
        for start in range(0, len(texts), batch):
            chunk = list(texts[start:start + batch])
            encoded = tokenizer(chunk, padding=True, truncation=True,
                                max_length=max_len, return_tensors="pt")
            encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
            hidden = encoder(**encoded).last_hidden_state
            pooled = bert_mean_pool(hidden, encoded["attention_mask"])
            pooled_batches.append(pooled.cpu().numpy())
    return np.vstack(pooled_batches)

def bert_ridge_two_stage_cv(train_df: pd.DataFrame, y_bu: np.ndarray, ctx: Dict,
                            folds=5, model_name="indobenchmark/indobert-base-p2",
                            max_len=160, batch=32) -> Result:
    """Cross-validate a Ridge model on frozen transformer embeddings plus the
    numeric features.

    Midpoint and width of the range are predicted separately and recombined
    into [bottom, up]. Returns a ``Result`` whose ``details`` also record the
    model name and ``max_len``.

    NOTE(review): ``fit_predict`` picks between the raw-target and log-target
    model using the validation fold's true targets, so the out-of-fold
    predictions and the reported MAE are optimistic. The numeric features are
    built (and median-imputed) once on the full training frame, before the
    split.
    """
    assert HAS_TRANSFORMERS, "transformers not available"
    # Decompose the targets into midpoint and width.
    y_b = y_bu[:,0]; y_u = y_bu[:,1]; y_mid=(y_b+y_u)/2.0; y_w=(y_u-y_b)
    # precompute embeddings for ALL train once (the encoder is not trained here)
    joined = join_text_fields(train_df, TEXT_COLS).tolist()
    E = embed_texts(joined, model_name=model_name, max_len=max_len, batch=batch)
    # add numeric features (unscaled) next to the embedding columns
    Xn = build_numeric_features(train_df, ctx).values
    E_all = np.concatenate([E, Xn], axis=1)

    oof_mid = np.zeros(len(train_df)); oof_w = np.zeros(len(train_df))
    alphas=[0.1,0.3,0.5,1,2,3,5,7.5,10,15,25,50]
    for tr,va in stratified_kfold_by_y(y_mid, n_splits=folds):
        X_tr, X_va = E_all[tr], E_all[va]
        def fit_predict(target):
            # r1 fits the raw target, r2 the log1p target (mapped back via expm1).
            r1 = RidgeCV(alphas=alphas).fit(X_tr, target[tr]); p1 = r1.predict(X_va)
            tlog=np.log1p(np.maximum(target,0)); r2=RidgeCV(alphas=alphas).fit(X_tr, tlog[tr]); p2=np.expm1(np.clip(r2.predict(X_va), None, 20))
            # Selection uses validation labels (see NOTE above).
            return p2 if mean_absolute_error(target[va], p2) < mean_absolute_error(target[va], p1) else p1
        oof_mid[va] = fit_predict(y_mid); oof_w[va] = np.clip(fit_predict(y_w), 0, None)

    # Rebuild the non-negative [bottom, up] range from midpoint and width.
    pred_b = np.maximum(0, oof_mid - oof_w/2.0); pred_u = np.maximum(0, oof_mid + oof_w/2.0)
    # Safety net: enforce bottom <= up (expected to be a no-op for width >= 0).
    sw = pred_b > pred_u
    if sw.any():
        tb,tu = pred_b.copy(), pred_u.copy()
        pred_b[sw]=np.minimum(tb[sw],tu[sw]); pred_u[sw]=np.maximum(tb[sw],tu[sw])
    mb = mean_absolute_error(y_b, pred_b); mu = mean_absolute_error(y_u, pred_u)
    return Result("BERTEmbed-2Stage(Ridge+num)", np.column_stack([pred_b,pred_u]),
                  float((mb+mu)/2), {"mae_bottom":float(mb),"mae_up":float(mu),
                  "model":model_name, "max_len":int(max_len)})


In [36]:
# ----------------- meta-stacking & calibration -----------------
def fit_meta_and_calibrate(res_list: List[Result], y: np.ndarray):
    """Fit one Ridge meta-learner and one isotonic calibrator per target.

    For each target (column 0 = bottom, column 1 = up) the base models'
    out-of-fold predictions are stacked column-wise in ``res_list`` order, a
    ``RidgeCV`` blends them, and an ``IsotonicRegression`` maps the blend to
    the true target.

    Returns a dict with the fitted ``meta_b``/``meta_u``/``cal_b``/``cal_u``
    objects and the MAEs ``oof_mb``, ``oof_mu`` and their mean ``oof_mae``.
    These MAEs are computed on the same rows the meta-learner and calibrator
    were fitted on.
    """
    ridge_grid = [0.001, 0.01, 0.1, 1, 3, 10, 30, 100]
    fitted = {}
    for column, tag in enumerate(("b", "u")):
        truth = y[:, column]
        stacked = np.column_stack([res.oof[:, column] for res in res_list])
        blender = RidgeCV(alphas=ridge_grid).fit(stacked, truth)
        blended = blender.predict(stacked)
        calibrator = IsotonicRegression(out_of_bounds="clip").fit(blended, truth)
        error = mean_absolute_error(truth, calibrator.transform(blended))
        fitted[tag] = (blender, calibrator, error)

    meta_b, cal_b, mb = fitted["b"]
    meta_u, cal_u, mu = fitted["u"]
    return {"meta_b":meta_b, "meta_u":meta_u, "cal_b":cal_b, "cal_u":cal_u,
            "oof_mae": (mb+mu)/2, "oof_mb": mb, "oof_mu": mu}


In [37]:
# ----------------- full-train & predict each base -----------------
def predict_test_ridge(train_df, test_df, y_bu, ctx):
    """Fit the Ridge base model on the full training set and predict the test
    set.

    Midpoint and width are each modelled twice (raw target and log1p target);
    the two predictions are combined with an element-wise minimum. Returns an
    (n_test, 2) array with columns [bottom, up], both clipped at 0.
    """
    bottom, upper = y_bu[:, 0], y_bu[:, 1]
    target_mid = (bottom + upper) / 2
    target_width = upper - bottom

    featurizer, word_tr, char_tr, _, num_tr = build_all_features(train_df, ctx, fit_text_on_df=True)
    _, word_te, char_te, _, num_te = build_all_features(test_df, ctx, tf_word_char=featurizer)

    scaler = MaxAbsScaler()
    num_tr_scaled = scaler.fit_transform(num_tr.values.astype(float))
    num_te_scaled = scaler.transform(num_te.values.astype(float))
    X = hstack([csr_matrix(num_tr_scaled), word_tr, char_tr]).tocsr()
    Xte = hstack([csr_matrix(num_te_scaled), word_te, char_te]).tocsr()

    alphas = [0.1,0.3,0.5,1,2,3,5,7.5,10,15,25,50]

    def min_of_raw_and_log(target):
        # Raw-scale model first, then the log1p model mapped back with expm1.
        raw_pred = RidgeCV(alphas=alphas).fit(X, target).predict(Xte)
        log_model = RidgeCV(alphas=alphas).fit(X, np.log1p(np.maximum(target, 0)))
        log_pred = np.expm1(np.clip(log_model.predict(Xte), None, 20))
        return np.minimum(raw_pred, log_pred)

    mid = min_of_raw_and_log(target_mid)
    width = np.maximum(0, min_of_raw_and_log(target_width))
    low = np.maximum(0, mid - width/2)
    high = np.maximum(0, mid + width/2)
    return np.column_stack([low, high])

def predict_test_hgb(train_df, test_df, y_bu, ctx, svd_dim=500):
    """Fit the HistGradientBoosting base model on the full training set and
    predict the test set.

    Midpoint and width are each modelled twice (raw target and log1p target);
    the two predictions are combined with an element-wise minimum. Returns an
    (n_test, 2) array with columns [bottom, up], both clipped at 0.
    """
    bottom, upper = y_bu[:, 0], y_bu[:, 1]
    target_mid = (bottom + upper) / 2
    target_width = upper - bottom

    featurizer, _, _, svd_tr, num_tr = build_all_features(train_df, ctx, fit_text_on_df=True, svd_dim=svd_dim)
    _, _, _, svd_te, num_te = build_all_features(test_df, ctx, tf_word_char=featurizer, svd_dim=svd_dim)
    X = np.concatenate([svd_tr, num_tr.values], axis=1)
    Xte = np.concatenate([svd_te, num_te.values], axis=1)

    def min_of_raw_and_log(target):
        # Raw-scale model first, then the log1p model mapped back with expm1.
        raw_pred = HistGradientBoostingRegressor(random_state=SEED).fit(X, target).predict(Xte)
        log_model = HistGradientBoostingRegressor(random_state=SEED).fit(X, np.log1p(np.maximum(target, 0)))
        log_pred = np.expm1(np.clip(log_model.predict(Xte), None, 20))
        return np.minimum(raw_pred, log_pred)

    mid = min_of_raw_and_log(target_mid)
    width = np.maximum(0, min_of_raw_and_log(target_width))
    low = np.maximum(0, mid - width/2)
    high = np.maximum(0, mid + width/2)
    return np.column_stack([low, high])

def predict_test_lgbm(train_df, test_df, y_bu, ctx, svd_dim=500):
    """Fit the LightGBM base model on the full training set and predict the
    test set (requires lightgbm).

    Midpoint and width are each modelled twice (raw target and log1p target);
    the two predictions are combined with an element-wise minimum. Returns an
    (n_test, 2) array with columns [bottom, up], both clipped at 0.
    """
    assert HAS_LGBM
    bottom, upper = y_bu[:, 0], y_bu[:, 1]
    target_mid = (bottom + upper) / 2
    target_width = upper - bottom

    featurizer, _, _, svd_tr, num_tr = build_all_features(train_df, ctx, fit_text_on_df=True, svd_dim=svd_dim)
    _, _, _, svd_te, num_te = build_all_features(test_df, ctx, tf_word_char=featurizer, svd_dim=svd_dim)
    X = np.concatenate([svd_tr, num_tr.values], axis=1)
    Xte = np.concatenate([svd_te, num_te.values], axis=1)

    params = dict(
        objective="regression", learning_rate=0.05, n_estimators=800,
        num_leaves=63, subsample=0.8, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1.0, random_state=SEED, verbose=-1,
    )

    def min_of_raw_and_log(target):
        # Raw-scale model first, then the log1p model mapped back with expm1.
        raw_pred = lgb.LGBMRegressor(**params).fit(X, target).predict(Xte)
        log_model = lgb.LGBMRegressor(**params).fit(X, np.log1p(np.maximum(target, 0)))
        log_pred = np.expm1(np.clip(log_model.predict(Xte), None, 20))
        return np.minimum(raw_pred, log_pred)

    mid = min_of_raw_and_log(target_mid)
    width = np.maximum(0, min_of_raw_and_log(target_width))
    low = np.maximum(0, mid - width/2)
    high = np.maximum(0, mid + width/2)
    return np.column_stack([low, high])

def predict_test_bert(train_df, test_df, y_bu, ctx, model_name="indobenchmark/indobert-base-p2", max_len=160, batch=32):
    """Fit the embedding + Ridge base model on the full training set and
    predict the test set (requires transformers/torch).

    Features are the mean-pooled transformer embedding of the joined text
    columns followed by the numeric features. Midpoint and width are each
    modelled twice (raw target and log1p target) and combined with an
    element-wise minimum. Returns an (n_test, 2) array [bottom, up], clipped
    at 0.
    """
    assert HAS_TRANSFORMERS
    bottom, upper = y_bu[:, 0], y_bu[:, 1]
    target_mid = (bottom + upper) / 2
    target_width = upper - bottom

    def featurize(frame):
        # Embedding columns first, numeric features after them.
        embedded = embed_texts(join_text_fields(frame, TEXT_COLS).tolist(), model_name, max_len, batch)
        numeric = build_numeric_features(frame, ctx).values
        return np.concatenate([embedded, numeric], axis=1)

    X = featurize(train_df)
    Xte = featurize(test_df)

    alphas = [0.1,0.3,0.5,1,2,3,5,7.5,10,15,25,50]

    def min_of_raw_and_log(target):
        # Raw-scale model first, then the log1p model mapped back with expm1.
        raw_pred = RidgeCV(alphas=alphas).fit(X, target).predict(Xte)
        log_model = RidgeCV(alphas=alphas).fit(X, np.log1p(np.maximum(target, 0)))
        log_pred = np.expm1(np.clip(log_model.predict(Xte), None, 20))
        return np.minimum(raw_pred, log_pred)

    mid = min_of_raw_and_log(target_mid)
    width = np.maximum(0, min_of_raw_and_log(target_width))
    low = np.maximum(0, mid - width/2)
    high = np.maximum(0, mid + width/2)
    return np.column_stack([low, high])



In [38]:
# ----------------- main -----------------
def main(argv=None):
    """Run the full pipeline: CV for every enabled base model, meta-stacking
    with calibration, test prediction, and writing the submission CSV.

    Parameters
    ----------
    argv : list of CLI-style arguments, or ``None``. With ``None`` the
        process arguments are parsed leniently (unknown ones ignored) so the
        function can be called from a notebook kernel.

    Side effects: reads ``--train`` / ``--test``, writes ``--out`` and
    ``--plot``, and prints progress plus a JSON report.

    Each base model's CV result is kept paired with the function that
    produces its test predictions, and both the meta-learner fit and the test
    stacking use that same order. Previously the results were sorted by MAE
    before fitting the meta-learner while test predictions were stacked in
    fixed run order, so the meta coefficients were applied to the wrong
    columns whenever the MAE ranking differed from the run order.
    """
    import argparse, json
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    ap = argparse.ArgumentParser(allow_abbrev=False)

    # 1) Define every argument first
    ap.add_argument("--train", default="train.csv")
    ap.add_argument("--test",  default="test.csv")
    ap.add_argument("--out",   default="submission.csv")
    ap.add_argument("--folds", type=int, default=5)
    ap.add_argument("--svd_dim", type=int, default=500)
    ap.add_argument("--plot", default="model_mae.png")
    ap.add_argument("--top_skills", type=int, default=50)
    ap.add_argument("--top_cities", type=int, default=20)
    ap.add_argument("--umr_boost_pct", type=float, default=0.02)
    ap.add_argument("--umr_cities", type=str,
                    default="kota bekasi,kabupaten karawang,kabupaten bekasi,dki jakarta,kota depok")
    # BERT options
    ap.add_argument("--use_bert", action="store_true")
    ap.add_argument("--bert_model", type=str, default="indobenchmark/indobert-base-p2")
    ap.add_argument("--bert_maxlen", type=int, default=160)
    ap.add_argument("--bert_batch", type=int, default=32)
    ap.add_argument("--include_lgbm", action="store_true")

    # 2) Only then parse
    if argv is None:
        # Inside a notebook, ignore the extra arguments injected by the Jupyter kernel.
        args, _ = ap.parse_known_args()
    else:
        args = ap.parse_args(argv)

    train_df = pd.read_csv(args.train)
    test_df  = pd.read_csv(args.test)
    y = train_df[[TARGET_B, TARGET_U]].values.astype(float)

    # context (skills, cities, UMR)
    top_sk = top_k_skills(train_df.get("Tech Stack", pd.Series([], dtype=str)), k=args.top_skills)
    top_ct = top_k_cities(train_df.get("Domisili", pd.Series([], dtype=str)), k=args.top_cities)
    high_umr_set = set(normalize_city(s.strip()) for s in str(args.umr_cities).split(",") if s.strip())
    ctx = {"top_skills": set(top_sk), "top_cities": top_ct,
           "high_umr_set": high_umr_set, "umr_boost_pct": float(args.umr_boost_pct)}

    # Availability of the optional bases. A base is used only when BOTH its
    # CV function and its test-prediction function exist, so the OOF stack
    # and the test stack always have the same columns.
    g = globals()
    lgbm_ready = bool(g.get("HAS_LGBM", False))
    bert_ready = (bool(g.get("HAS_TRANSFORMERS", False))
                  and "bert_ridge_two_stage_cv" in g and "predict_test_bert" in g)
    use_lgbm = bool(args.include_lgbm) and lgbm_ready
    use_bert = bool(args.use_bert) and bert_ready
    if args.include_lgbm and not use_lgbm:
        print("[WARN] --include_lgbm was requested but lightgbm is not available; skipping LGBM.")
    if args.use_bert and not use_bert:
        print("[WARN] --use_bert was requested but the BERT base is not available; skipping BERT.")

    # (Result, zero-argument callable returning that base's test predictions),
    # kept in run order; this order defines the stacking columns.
    bases = []

    print("CV: HGB two-stage (+domain feats) ...")
    r_h = hgb_two_stage_cv(train_df, y, ctx, folds=args.folds, svd_dim=args.svd_dim)
    print(" ->", r_h.details, "avg:", r_h.mae)
    bases.append((r_h, lambda: predict_test_hgb(train_df, test_df, y, ctx, svd_dim=args.svd_dim)))

    print("CV: Ridge two-stage (+domain feats) ...")
    r_r = ridge_two_stage_cv(train_df, y, ctx, folds=args.folds)
    print(" ->", r_r.details, "avg:", r_r.mae)
    bases.append((r_r, lambda: predict_test_ridge(train_df, test_df, y, ctx)))

    if use_lgbm:
        print("CV: LGBM two-stage (+domain feats) ...")
        r_l = lgbm_two_stage_cv(train_df, y, ctx, folds=args.folds, svd_dim=args.svd_dim)
        print(" ->", r_l.details, "avg:", r_l.mae)
        bases.append((r_l, lambda: predict_test_lgbm(train_df, test_df, y, ctx, svd_dim=args.svd_dim)))

    if use_bert:
        print(f"CV: BERT-Embedding two-stage ({args.bert_model}) ...")
        r_b = bert_ridge_two_stage_cv(train_df, y, ctx, folds=args.folds,
                                      model_name=args.bert_model, max_len=args.bert_maxlen, batch=args.bert_batch)
        print(" ->", r_b.details, "avg:", r_b.mae)
        bases.append((r_b, lambda: predict_test_bert(train_df, test_df, y, ctx,
                                                     model_name=args.bert_model, max_len=args.bert_maxlen,
                                                     batch=args.bert_batch)))

    stack_results = [res for res, _ in bases]            # stacking order
    ranked = sorted(stack_results, key=lambda r: r.mae)  # display order only

    meta = fit_meta_and_calibrate(stack_results, y)
    print(f"[STACK] Calibrated OOF MAE -> avg: {meta['oof_mae']:.0f} (bottom {meta['oof_mb']:.0f} | up {meta['oof_mu']:.0f})")

    names  = [r.name for r in ranked] + ["Meta+Calibrated"]
    scores = [r.mae for r in ranked]  + [meta["oof_mae"]]
    fig = plt.figure(figsize=(9,5))
    plt.barh(names, scores); plt.xlabel("OOF MAE (smaller is better)")
    plt.title("Model Comparison + Meta"); plt.gca().invert_yaxis(); plt.tight_layout()
    plt.savefig(args.plot)
    plt.close(fig)  # release the figure so repeated runs do not accumulate
    print(f"Saved plot: {args.plot}")

    # Full-train each base and predict the test set, in stacking order.
    for res, predict in bases:
        res.test_pred = predict()

    test_stack_b = np.column_stack([res.test_pred[:,0] for res in stack_results])
    test_stack_u = np.column_stack([res.test_pred[:,1] for res in stack_results])
    pred_b = meta["meta_b"].predict(test_stack_b)
    pred_u = meta["meta_u"].predict(test_stack_u)

    # The UMR boost is applied to the blended prediction before calibration.
    umr_mult = compute_umr_multiplier(test_df, ctx)
    pred_b *= umr_mult; pred_u *= umr_mult

    pred_b = meta["cal_b"].transform(pred_b)
    pred_u = meta["cal_u"].transform(pred_u)
    # The two targets are calibrated independently, so enforce bottom <= up.
    sw = pred_b > pred_u
    if sw.any():
        tb, tu = pred_b.copy(), pred_u.copy()
        pred_b[sw] = np.minimum(tb[sw], tu[sw])
        pred_u[sw] = np.maximum(tb[sw], tu[sw])

    sub = pd.DataFrame({
        ID_COL: test_df[ID_COL].values,
        TARGET_B: np.round(pred_b).astype(int),
        TARGET_U: np.round(pred_u).astype(int),
    })
    sub.to_csv(args.out, index=False)

    report = {
        "bases": [{ "name": r.name, "mae": float(r.mae), **r.details } for r in ranked],
        "stack_oof_mae": float(meta["oof_mae"]),
        "umr": {"boost_pct": float(args.umr_boost_pct), "cities": list(high_umr_set)},
        "params": {"folds": args.folds, "svd_dim": args.svd_dim,
                   "use_bert": bool(args.use_bert),
                   "bert_model": args.bert_model if args.use_bert else None}
    }
    print(json.dumps(report, indent=2))
    print(f"Saved submission: {args.out}")


In [39]:
# Run the pipeline with explicit arguments (mirrors the CLI defaults).
main([
  "--train","train.csv",
  "--test","test.csv",
  "--out","submission.csv",
  "--folds","5",
  "--svd_dim","500",
  "--top_skills","50",
  "--top_cities","20",
  "--umr_boost_pct","0.02",
  "--umr_cities","kota bekasi,kabupaten karawang,kabupaten bekasi,dki jakarta,kota depok",
  "--include_lgbm",           # enable the LGBM base model
  # "--use_bert",             # enable only if the BERT functions are actually defined
  # "--bert_model","indobenchmark/indobert-base-p2",
  # "--bert_maxlen","160",
  # "--bert_batch","32",
])


CV: HGB two-stage (+domain feats) ...
 -> {'mae_bottom': 1029867.5890121871, 'mae_up': 1489490.4233241233} avg: 1259679.0061681552
CV: Ridge two-stage (+domain feats) ...
 -> {'mae_bottom': 1062618.568444418, 'mae_up': 1590902.098827598} avg: 1326760.333636008
[STACK] Calibrated OOF MAE -> avg: 1079313 (bottom 860520 | up 1298106)
Saved plot: model_mae.png
{
  "bases": [
    {
      "name": "HGB-2Stage(SVD+num+domain)",
      "mae": 1259679.0061681552,
      "mae_bottom": 1029867.5890121871,
      "mae_up": 1489490.4233241233
    },
    {
      "name": "Ridge-2Stage(TFIDF+num+domain)",
      "mae": 1326760.333636008,
      "mae_bottom": 1062618.568444418,
      "mae_up": 1590902.098827598
    }
  ],
  "stack_oof_mae": 1079313.0185164944,
  "umr": {
    "boost_pct": 0.02,
    "cities": [
      "kabupaten bekasi",
      "depok",
      "kabupaten karawang",
      "bekasi",
      "jakarta"
    ]
  },
  "params": {
    "folds": 5,
    "svd_dim": 500,
    "use_bert": false,
    "bert_model":