In [2]:
# Minimal cleaner (only if catalog_content_clean doesn't exist yet)
if "catalog_content_clean" not in locals() and "catalog_content_clean" not in train.columns if 'train' in locals() else [False]:
    import re
    def clean_text(s) -> str:
        s = "" if s is None else str(s)
        s = s.lower()
        s = re.sub(r"http\S+|www\S+|https\S+", "", s)
        s = re.sub(r"[^a-z0-9 ]+", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()


In [3]:
# Robust project-root + data loader for 02_stage2.ipynb

import re, gc, math, numpy as np, pandas as pd
from pathlib import Path

def find_project_root():
    """Walk upward until we find a folder that actually contains data/train.csv (or common markers)."""
    p = Path.cwd()
    for _ in range(8):
        # Preferred: explicit presence of data files
        if (p / "data" / "train.csv").exists() and (p / "data" / "test.csv").exists():
            return p
        # Fallback: repo markers + data dir exists
        if any((p / m).exists() for m in ("requirements.txt", ".git", "HANDOFF.md")) and (p / "data").exists():
            return p
        p = p.parent
    raise FileNotFoundError("Could not locate project root containing a 'data/' folder.")

ROOT = find_project_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

print("Notebook CWD :", Path.cwd())
print("Project ROOT :", ROOT)
print("DATA exists? :", DATA.exists(), "|", (DATA / "train.csv").exists(), (DATA / "test.csv").exists())
print("ART path     :", ART)

# Load data
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")

# Choose text column (use cleaned if present, else raw)
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Safe string views for modeling
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values


Notebook CWD : d:\amazon ML challenge\notebooks
Project ROOT : d:\amazon ML challenge
DATA exists? : True | True True
ART path     : d:\amazon ML challenge\artifacts


In [8]:
# === Stage-2B minimal: word TF-IDF + cheap meta features → pred_tm + save CSV ===
from pathlib import Path
import re, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Resolve paths and data (works whether you're in notebooks/ or project root)
ROOT = globals().get("ROOT", Path.cwd())
ROOT = ROOT if isinstance(ROOT, Path) else Path(ROOT)
DATA = Path(globals().get("DATA", ROOT / "data"))
ART  = Path(globals().get("ART",  ROOT / "artifacts"))
ART.mkdir(parents=True, exist_ok=True)

# Load data if not already present
if "train" not in globals() or "test" not in globals():
    train = pd.read_csv(DATA / "train.csv")
    test  = pd.read_csv(DATA / "test.csv")

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values

# ---- meta feature extractor ----
def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)

    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)

    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)

    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)

    df = pd.DataFrame({
        "num_count": num_count,
        "max_num": max_num,
        "min_num": min_num,
        "pack_n":  pack_n,
    })
    df = pd.concat([df, unit_df], axis=1).fillna(0.0)
    return df

meta_tr = extract_meta(X_text_tr)
meta_te = extract_meta(X_text_te)

train_aug = pd.DataFrame({TEXT_COL: X_text_tr})
test_aug  = pd.DataFrame({TEXT_COL: X_text_te})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

numeric_cols = meta_tr.columns.tolist()

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

model = Ridge(alpha=1.0, random_state=42)
pipe_tm = Pipeline([("ct", ct), ("ridge", model)])

# Fit full + predict
pipe_tm.fit(train_aug, y)
pred_tm = pipe_tm.predict(test_aug)

# Save submission for blending
out_tm = ART / "submission_word_meta.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_tm}).to_csv(out_tm, index=False)
print("Saved:", out_tm.resolve(), "| rows:", len(pred_tm))


Saved: D:\amazon ML challenge\artifacts\submission_word_meta.csv | rows: 75000


In [9]:
# Simple average of the two Stage-2 submissions
import pandas as pd
from pathlib import Path

ART = Path(ART)  # reuse from earlier if defined
df_wc = pd.read_csv(ART / "submission_wc.csv")
df_tm = pd.read_csv(ART / "submission_word_meta.csv")

df = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc", "_tm"))
df["price"] = 0.5*df["price_wc"] + 0.5*df["price_tm"]
out = ART / "submission_ensemble_v1.csv"
df[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(df))


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v1.csv | rows: 75000


In [13]:
from pathlib import Path
import pandas as pd

# Reuse ART if defined; else resolve it from project root
ART = Path(globals().get("ART", Path.cwd() / "artifacts"))

print("ART =", ART.resolve())
print("Available submissions:", [p.name for p in ART.glob("submission_*.csv")])

df = pd.read_csv(ART / "submission_ensemble_v1.csv")
assert len(df)==75000 and df["sample_id"].is_unique and not df["price"].isna().any()
print(df["price"].min(), df["price"].max(), df["price"].median())


ART = D:\amazon ML challenge\artifacts
Available submissions: ['submission_ensemble_v1.csv', 'submission_final.csv', 'submission_wc.csv', 'submission_word_meta.csv']
-57.24441578355 427.01332224186393 19.36302922582783


In [16]:
from pathlib import Path
import pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge

ART = Path(globals().get("ART", Path.cwd() / "artifacts"))
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Encode
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tr_vec = model.encode(train[TEXT_COL].fillna("").tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
te_vec = model.encode(test[TEXT_COL].fillna("").tolist(),  batch_size=512, show_progress_bar=True, convert_to_numpy=True)

# Fit + predict
ridge_emb = Ridge(alpha=1.0, random_state=42).fit(tr_vec, train["price"].values)
pred_emb = ridge_emb.predict(te_vec)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_emb}).to_csv(ART/"submission_sbert.csv", index=False)

# 3-way blend with existing files
df_wc = pd.read_csv(ART/"submission_wc.csv")
df_tm = pd.read_csv(ART/"submission_word_meta.csv")
df_sb = pd.read_csv(ART/"submission_sbert.csv")

m = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc","_tm")).merge(df_sb, on="sample_id")
m["price"] = 0.4*m["price_wc"] + 0.4*m["price_tm"] + 0.2*m["price"]
out = ART/"submission_ensemble_v2.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(m))


Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]
Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v2.csv | rows: 75000


In [18]:
import numpy as np

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return float(np.mean(np.abs(y_true - y_pred) / denom) * 100.0)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [21]:
from sklearn.linear_model import TweedieRegressor

pipe_wc_gamma = Pipeline([
    ("features", wc_features),
    ("glm", TweedieRegressor(power=2.0, link="log", alpha=1e-3, max_iter=3000, tol=1e-6, warm_start=True)),
])

scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_gamma.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_gamma.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, Gamma GLM): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

pipe_wc_gamma.fit(X_text_tr, y)
pred_wc_gamma = pipe_wc_gamma.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_gamma}).to_csv(ART/"submission_wc_gamma.csv", index=False)
print("Saved:", (ART/"submission_wc_gamma.csv").resolve())


CV SMAPE (word+char, Gamma GLM): 64.09% ± 0.47%
Saved: D:\amazon ML challenge\artifacts\submission_wc_gamma.csv


In [22]:
import pandas as pd

cands = []
for fname in ["submission_wc.csv", "submission_wc_log.csv", "submission_wc_gamma.csv"]:
    p = (ART/fname)
    if p.exists(): cands.append(pd.read_csv(p).rename(columns={"price": fname.replace(".csv","")}))

m = cands[0]
for df in cands[1:]:
    m = m.merge(df, on="sample_id")

# If both log and gamma exist, try a 60/40 toward the lower CV SMAPE
cols = [c for c in m.columns if c!="sample_id"]
if "submission_wc_log" in cols and "submission_wc_gamma" in cols:
    m["price"] = 0.6*m["submission_wc_log"] + 0.4*m["submission_wc_gamma"]
elif "submission_wc_log" in cols:
    m["price"] = m["submission_wc_log"]
else:
    m["price"] = m[cols[0]]

out = ART/"submission_wc_smape_opt.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve())


Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_opt.csv


In [23]:
import numpy as np
bins = np.clip(np.floor(np.log1p(y)), 0, 10).astype(int)
# If you want to keep KFold, keep the seed and ensure each fold has distribution checked
# Or use StratifiedKFold on bins (regression hack):
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [25]:
# Make the final, portal-ready file from the log-target model
from pathlib import Path
import pandas as pd, numpy as np

ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

# align to test.csv order (defensive)
test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [26]:
# === Build OOF predictions for two models, tune weights for SMAPE, blend test preds ===
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

ART, DATA = Path(ART), Path(DATA)

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred)+eps)/2.0
    return float(np.mean(np.abs(y_true-y_pred)/denom)*100.0)

# Rebuild the two models (same configs you used)
identity = FunctionTransformer(lambda s: s, validate=False)

# word+char features (same as Stage-2A)
wc_features = Pipeline([
    ("union", 
     Pipeline(steps=[("id", identity)])),  # placeholder so we can set params next line
])
# We'll attach vectorizers directly via FeatureUnion-like manual mapping:
from sklearn.pipeline import FeatureUnion
wc_features = FeatureUnion([
    ("word", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=350_000, min_df=2))])),
    ("char", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=300_000, min_df=2))]))
], n_jobs=1)

wc_log = Pipeline([
    ("features", wc_features),
    ("ridge_log", TransformedTargetRegressor(
        regressor=Ridge(alpha=1.2, random_state=42),
        func=np.log1p, inverse_func=np.expm1))
])

# word TF-IDF + meta (same as Stage-2B)
numeric_cols = ["num_count","max_num","min_num","pack_n","vol_ml","wt_g"]

def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)
    df = pd.DataFrame({"num_count": num_count, "max_num": max_num, "min_num": min_num, "pack_n": pack_n})
    return pd.concat([df, unit_df], axis=1).fillna(0.0)

train_text = X_text_tr
test_text  = X_text_te
y_vec = y

meta_tr = extract_meta(train_text)
meta_te = extract_meta(test_text)

train_aug = pd.DataFrame({TEXT_COL: train_text})
test_aug  = pd.DataFrame({TEXT_COL: test_text})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

word_meta = Pipeline([("ct", ct), ("ridge", Ridge(alpha=1.0, random_state=42))])

# OOF predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log = np.zeros(len(train_aug))
oof_tm  = np.zeros(len(train_aug))

for tr_idx, va_idx in cv.split(train_aug):
    wc_log.fit(train_text.iloc[tr_idx], y_vec[tr_idx])
    oof_log[va_idx] = wc_log.predict(train_text.iloc[va_idx]).clip(min=1e-6)

    word_meta.fit(train_aug.iloc[tr_idx], y_vec[tr_idx])
    oof_tm[va_idx] = word_meta.predict(train_aug.iloc[va_idx]).clip(min=1e-6)

print("OOF SMAPE — log:", smape(y_vec, oof_log), "| tm:", smape(y_vec, oof_tm))

# Grid search weights to minimize SMAPE on OOF
best_w, best_s = None, 1e9
for w in np.linspace(0, 1, 21):  # 0.00 .. 1.00 step 0.05
    blend = w*oof_log + (1-w)*oof_tm
    s = smape(y_vec, blend)
    if s < best_s:
        best_s, best_w = s, w
print(f"Best OOF SMAPE: {best_s:.2f}% at w_log={best_w:.2f}, w_tm={1-best_w:.2f}")

# Fit both on full train and blend test with tuned weights
wc_log.fit(train_text, y_vec)
p_log = wc_log.predict(test_text).clip(min=1e-6)

word_meta.fit(train_aug, y_vec)
p_tm = word_meta.predict(test_aug).clip(min=1e-6)

p_blend = best_w*p_log + (1-best_w)*p_tm

# Save SMAPE-optimized blend
out = ART/"submission_wc_smape_blend.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": p_blend}).to_csv(out, index=False)
print("Saved:", out.resolve())


OOF SMAPE — log: 52.18425790496487 | tm: 66.27008136434968
Best OOF SMAPE: 52.18% at w_log=1.00, w_tm=0.00
Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_blend.csv


In [27]:
import pandas as pd
from pathlib import Path

ART, DATA = Path(ART), Path(DATA)
best = "submission_wc_smape_blend.csv"  # or "submission_wc_log.csv" if you prefer

df = pd.read_csv(ART/best)[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [28]:
from pathlib import Path
import pandas as pd
ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all(), "Missing predictions"

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [17]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    # Avoid divisions by 0 (when both true and pred are 0)
    mask = denom != 0
    out = np.zeros_like(denom)
    out[mask] = diff[mask] / denom[mask]
    return np.mean(out) * 100.0

# Example: evaluate smape for the word+char pipeline 'pipe_wc'
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc.predict(X_text_tr.iloc[va])
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char ridge): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")


CV SMAPE (word+char ridge): 68.12% ± 0.19%


In [29]:
# Grid search a few strong configs for SMAPE (fast)
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred)+eps)/2.0
    return float(np.mean(np.abs(y_true-y_pred)/denom)*100.0)

cfgs = [
    # (word_max, char_max, alpha)
    (300_000, 250_000, 1.0),
    (350_000, 300_000, 1.2),
    (400_000, 350_000, 1.2),
    (300_000, 300_000, 1.5),
]
best = (9e9, None)

for wmax,cmax,alpha in cfgs:
    feats = FeatureUnion([
        ("word", Pipeline([("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=wmax, min_df=2))])),
        ("char", Pipeline([("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=cmax, min_df=2))]))
    ], n_jobs=1)
    pipe = Pipeline([
        ("features", feats),
        ("reg", TransformedTargetRegressor(
            regressor=Ridge(alpha=alpha, random_state=42),
            func=np.log1p, inverse_func=np.expm1))
    ])
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores=[]
    for tr,va in cv.split(X_text_tr):
        pipe.fit(X_text_tr.iloc[tr], y[tr])
        p = pipe.predict(X_text_tr.iloc[va]).clip(min=1e-6)
        scores.append(smape(y[va], p))
    m = float(np.mean(scores)); s=float(np.std(scores))
    print((wmax,cmax,alpha), f"{m:.2f} ± {s:.2f}")
    if m < best[0]:
        best = (m,(wmax,cmax,alpha))

best


(300000, 250000, 1.0) 52.20 ± 0.38
(350000, 300000, 1.2) 52.18 ± 0.38
(400000, 350000, 1.2) 52.14 ± 0.38
(300000, 300000, 1.5) 52.28 ± 0.39


(52.14107187166095, (400000, 350000, 1.2))

In [30]:
# Split training by price bins; train one log-target model per bin; predict all test rows with all, then soft-blend by bin probs
import numpy as np, pandas as pd
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

# bins on log-price
ylog = np.log1p(y)
cuts = np.quantile(ylog, [0, .2, .4, .6, .8, 1.0])
bin_id = np.digitize(ylog, cuts[1:-1], right=False)

# train KNN on meta signals to estimate bin probs at test time (no labels for test)
def extract_light_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    lens = s.str.len()
    nums = s.str.count(r"(?<![a-zA-Z])\d+(?:\.\d+)?")
    upp = s.str.count(r"\b[A-Z]{2,}\b")
    return pd.DataFrame({"len":lens, "nums":nums, "upp":upp}).astype(float)

meta_tr_l = extract_light_meta(X_text_tr)
meta_te_l = extract_light_meta(X_text_te)

# bin classifier proxy using KNN on meta (unsupervised-ish routing)
knn = KNeighborsRegressor(n_neighbors=25, weights="distance")
knn.fit(meta_tr_l, bin_id.astype(float))
bin_pred = knn.predict(meta_te_l)  # continuous; we’ll convert to soft weights
# Softmax over distances not perfect; approximate soft assignment:
# For stability, build one expert per bin and later weight by proximity to each bin center in ylog space
centers = np.array([np.mean(ylog[bin_id==b]) for b in range(5)])

# Train 5 experts (same architecture as best log-target)
wmax,cmax,alpha = best[1]
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

experts=[]
feats_tpl = lambda: FeatureUnion([
    ("word", Pipeline([("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=wmax, min_df=2))])),
    ("char", Pipeline([("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=cmax, min_df=2))]))
], n_jobs=1)

for b in range(5):
    mask = (bin_id==b)
    reg = TransformedTargetRegressor(Ridge(alpha=alpha, random_state=42), func=np.log1p, inverse_func=np.expm1)
    pipe = Pipeline([("features", feats_tpl()), ("reg", reg)])
    pipe.fit(X_text_tr[mask], y[mask])
    experts.append(pipe)

# Blend experts by proximity in meta space -> use knn output to map to center weights
# Map predicted continuous bin value to weights over 5 centers using RBF-like kernel
def soft_weights(x, centers, tau=0.8):
    # x ~ predicted bin position (0..4); centers -> 0..4
    d = np.abs(centers - x)  # but centers are in ylog; map x to center index scale:
    # better: linearly map x in [0,4] to centers’ indices:
    idx_pos = np.clip(x, 0, 4)
    d = np.abs(np.arange(5) - idx_pos)
    w = np.exp(-d/tau)
    return w / w.sum()

# Predict per expert then soft-blend
preds = []
for pipe in experts:
    preds.append(pipe.predict(X_text_te).clip(min=1e-6))
preds = np.vstack(preds)  # [5, N]

weights = np.vstack([soft_weights(v, centers) for v in np.clip(bin_pred,0,4)])
p_moe = (weights * preds.T).sum(axis=1)

pd.DataFrame({"sample_id": test["sample_id"], "price": p_moe}).to_csv(ART/"submission_wc_log_moe.csv", index=False)
print("Saved:", (ART/"submission_wc_log_moe.csv").resolve())


Saved: D:\amazon ML challenge\artifacts\submission_wc_log_moe.csv


In [31]:
# Better meta features directly from text (brand-ish tokens, pack math, normalized units)
import re

def rich_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    # brand-ish: first token before '-' or '|' or '—' or ':' or '('
    head = s.str.extract(r"^\s*([A-Za-z0-9&'\./]+)", expand=False).fillna("")
    brand_len = head.str.len()
    brand_caps = head.str.contains(r"[A-Z]").astype(int)

    # quantities
    nums = s.str.findall(r"(?<![A-Za-z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else 0.0).astype(float)

    # pack math: “pack of K” or “K pack” and multiply if quantities likely per item
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), 1), axis=1).astype(float)

    # units normalize
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm(pairs):
        ml = g = 0.0
        for val,u in pairs:
            v=float(val)
            if u=="ml": ml+=v
            elif u=="l": ml+=v*1000
            elif u=="oz": g+=v*28.3495
            elif u=="g": g+=v
            elif u=="kg": g+=v*1000
            elif u=="lb": g+=v*453.592
        return pd.Series({"vol_ml": ml, "wt_g": g})
    unit_df = uw.apply(norm)

    # total content per pack (proxy)
    total_ml = unit_df["vol_ml"]*pack_n
    total_g  = unit_df["wt_g"]*pack_n

    return pd.DataFrame({
        "brand_len":brand_len.astype(float),
        "brand_caps":brand_caps.astype(float),
        "num_count":num_count,
        "max_num":max_num,
        "pack_n":pack_n,
        "vol_ml":unit_df["vol_ml"].astype(float),
        "wt_g": unit_df["wt_g"].astype(float),
        "total_ml": total_ml.astype(float),
        "total_g": total_g.astype(float),
    }).fillna(0.0)

meta_tr2 = rich_meta(X_text_tr)
meta_te2 = rich_meta(X_text_te)


In [33]:
import torch
assert torch.cuda.is_available(), "CUDA is not available. (It was earlier—did the venv change?)"
print(torch.__version__, torch.cuda.get_device_name(0))


2.6.0+cu124 NVIDIA GeForce RTX 4070 Laptop GPU


In [None]:
from sentence_transformers import SentenceTransformer
from pathlib import Path
from PIL import Image
import time

device = "cuda"
model = SentenceTransformer("clip-ViT-B-32", device=device)

IMG_DIR = Path("images_dl")
sample_ids = test["sample_id"].head(512).tolist()
imgs = [Image.open(IMG_DIR/f"{sid}.jpg").convert("RGB") for sid in sample_ids]

t0 = time.time()
_ = model.encode(imgs, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
dt = time.time()-t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU")


In [36]:
# GPU + image sanity + timed mini-encode (no guessing)
import sys, time
from pathlib import Path
from PIL import Image, UnidentifiedImageError
import torch
from sentence_transformers import SentenceTransformer

# 0) Confirm GPU
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    raise SystemExit("CUDA not available — stop here.")

device = "cuda"
print("GPU:", torch.cuda.get_device_name(0))
print("Torch:", torch.__version__)

# 1) Point to the ACTUAL images folder used when you downloaded them
#    Change this if you used a different folder (e.g., 'images' or 'data/images')
IMG_DIR = Path("images_dl")  # <-- tweak if needed
print("IMG_DIR:", IMG_DIR.resolve())

# 2) Collect the first 512 test ids and check how many image files exist
ids = test["sample_id"].head(512).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in ids]
exist_mask = [p.exists() for p in paths]
num_exist = sum(exist_mask)
print(f"Found {num_exist}/512 image files")

# If very few exist, you’re pointing to the wrong folder. Show a couple examples:
if num_exist < 32:
    print("Example expected path:", paths[0])
    raise SystemExit("Too few images found in IMG_DIR. Fix IMG_DIR and rerun.")

# 3) Load a small, existing batch (fast) and time it
kept = [p for p, ok in zip(paths, exist_mask) if ok][:128]  # 128 is enough to measure
imgs = []
for p in kept:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError) as e:
        # skip bad files
        pass

print(f"Encoding {len(imgs)} images on GPU …", flush=True)

model = SentenceTransformer("clip-ViT-B-32", device=device)
print("ST target device:", getattr(model, "_target_device", None))

t0 = time.perf_counter()
_ = model.encode(
    imgs,
    batch_size=64,                # increase to 96/128 if VRAM allows
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
dt = time.perf_counter() - t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU (batch=64, n={len(imgs)})", flush=True)


CUDA available: True
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
Torch: 2.6.0+cu124
IMG_DIR: D:\amazon ML challenge\notebooks\images_dl
Found 0/512 image files
Example expected path: images_dl\100179.jpg


SystemExit: Too few images found in IMG_DIR. Fix IMG_DIR and rerun.

In [37]:
from pathlib import Path

# Try to infer project root the same way we've been doing
def find_root(markers=("requirements.txt", ".git", "data", "HANDOFF.md")):
    p = Path.cwd()
    for _ in range(8):
        if any((p / m).exists() for m in markers):
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
candidates = [
    ROOT/"images_dl",
    ROOT/"images",
    ROOT/"data"/"images",
    ROOT/"notebooks"/"images_dl",
    Path("images_dl"),
    Path("images"),
]

def jpg_count(d: Path) -> int:
    try:
        return sum(1 for _ in d.glob("*.jpg"))
    except Exception:
        return 0

found = [(d, jpg_count(d)) for d in candidates if d.exists()]
found = sorted(found, key=lambda x: x[1], reverse=True)

print("ROOT:", ROOT)
print("Candidates (path, .jpg count):")
for d,c in found:
    print(" ", d, c)

IMG_DIR = None
if found and found[0][1] >= 100:  # heuristic: at least 100 jpgs
    IMG_DIR = found[0][0]
    print("\nSelected IMG_DIR:", IMG_DIR.resolve())
else:
    print("\nNo suitable images folder found in common locations.")


ROOT: d:\amazon ML challenge
Candidates (path, .jpg count):
  d:\amazon ML challenge\images_dl 146588
  d:\amazon ML challenge\images 110

Selected IMG_DIR: D:\amazon ML challenge\images_dl


In [38]:
from PIL import Image, UnidentifiedImageError
from sentence_transformers import SentenceTransformer
import torch, time

assert IMG_DIR is not None and IMG_DIR.exists(), "Set IMG_DIR to the correct folder path."

ids = test["sample_id"].head(512).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in ids]
ok = [p.exists() for p in paths]
print(f"Found {sum(ok)}/512 images in:", IMG_DIR)

imgs = []
for p in [pp for pp,flag in zip(paths,ok) if flag][:128]:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError):
        pass

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("clip-ViT-B-32", device=device)
print("Using:", device, "| ST target device:", getattr(model, "_target_device", None))

t0 = time.perf_counter()
_ = model.encode(imgs, batch_size=64, show_progress_bar=True,
                 convert_to_numpy=True, normalize_embeddings=True)
dt = time.perf_counter() - t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU (n={len(imgs)})")


Found 512/512 images in: d:\amazon ML challenge\images_dl


`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


Using: cuda | ST target device: cuda:0


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.23s/it]

OK: 28.7 img/s on GPU (n=128)





In [None]:
# Bin-wise calibration for SMAPE
bins = np.clip(np.floor(np.log1p(y)), 0, 10).astype(int)
pred_oof = o1  # use your best model's OOF (log-tuned)

cal = {}
for b in np.unique(bins):
    mask = (bins==b)
    # simple 1D search over scaling factor
    best_k, best_s = 1.0, 1e9
    for k in np.linspace(0.8, 1.2, 41):
        s = smape(y[mask], np.clip(pred_oof[mask]*k, 1e-6, None))
        if s < best_s: best_s, best_k = s, k
    cal[b] = best_k

# apply to test based on predicted bin (use the KNN-lite bin proxy from step 2 or simpler: use text length proxies)
test_bins = np.clip(np.floor(np.log1p(np.median(y))), 0, 10).astype(int)  # fallback single bin
# If you built 'p_final' above and have no per-row bins, skip or set single multiplier ~1.0

# Example applying a single calibrated multiplier:
# p_final *= cal.get(test_bins, 1.0)


In [None]:
# === GPU image embeddings: root->data->images->throughput->cached full encode ===
import time, sys, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image, UnidentifiedImageError
import torch
from sentence_transformers import SentenceTransformer

# 0) Root + data loader
def find_root(markers=("requirements.txt", ".git", "data", "HANDOFF.md")):
    p = Path.cwd()
    for _ in range(8):
        if any((p / m).exists() for m in markers):
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache"
EMB.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("DATA:", DATA)

# Load train/test
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")
print("Loaded train/test:", train.shape, test.shape)

# 1) Locate images folder (pick the one with most .jpg)
candidates = [
    ROOT/"images_dl",
    ROOT/"images",
    ROOT/"data"/"images",
    ROOT/"notebooks"/"images_dl",
]
def jpg_count(d: Path) -> int:
    return sum(1 for _ in d.glob("*.jpg")) if d.exists() else 0

cand_counts = sorted([(d, jpg_count(d)) for d in candidates], key=lambda x:x[1], reverse=True)
for d,c in cand_counts:
    print("Candidate:", d, "jpg:", c)

if cand_counts and cand_counts[0][1] > 0:
    IMG_DIR = cand_counts[0][0]
else:
    raise FileNotFoundError(
        "Could not find images. Set IMG_DIR to your downloaded images folder (with many .jpg files)."
    )
print("Using IMG_DIR:", IMG_DIR.resolve())

# 2) GPU sanity
assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__)

# 3) Tiny GPU throughput test (fast)
sample_ids = test["sample_id"].head(256).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in sample_ids]
ok = [p.exists() for p in paths]
print(f"Found {sum(ok)}/256 images for quick test in {IMG_DIR.name}")

imgs = []
for p in [pp for pp,flag in zip(paths,ok) if flag][:128]:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError):
        pass
print("Timing encode on", len(imgs), "imgs ...")

model = SentenceTransformer("clip-ViT-B-32", device=device)
t0 = time.perf_counter()
_ = model.encode(
    imgs,
    batch_size=64,  # increase to 96/128 if VRAM allows
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
dt = time.perf_counter()-t0
if len(imgs) > 0:
    print(f"Throughput: {len(imgs)/dt:.1f} img/s (GPU OK)")

# 4) Full encode with caching (runs once; reuses .npy afterwards)
def load_imgs(ids):
    out = []
    blank = Image.new("RGB", (224,224), color=0)
    for sid in ids:
        p = IMG_DIR / f"{sid}.jpg"
        try:
            out.append(Image.open(p).convert("RGB"))
        except Exception:
            out.append(blank)
    return out

def encode_images(ids, save_path, batch_size=96):
    save_path = Path(save_path)
    if save_path.exists():
        arr = np.load(save_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print("Loaded cached:", save_path)
            return np.array(arr)
        print("Cache size mismatch; recomputing:", save_path)
    imgs_all = load_imgs(ids)
    t0 = time.perf_counter()
    vec = model.encode(
        imgs_all, batch_size=batch_size, show_progress_bar=True,
        convert_to_numpy=True, normalize_embeddings=True
    )
    print(f"Full encode throughput: {len(imgs_all)/(time.perf_counter()-t0):.1f} img/s")
    np.save(save_path, vec)
    print("Saved:", save_path, vec.shape)
    return vec

tr_ids = train["sample_id"].tolist()
te_ids = test["sample_id"].tolist()

tr_img_vec = encode_images(tr_ids, EMB/"train_clip_img.npy", batch_size=96)
te_img_vec = encode_images(te_ids, EMB/"test_clip_img.npy",  batch_size=96)
print("Done. train/test img emb shapes:", tr_img_vec.shape, te_img_vec.shape)


ROOT: d:\amazon ML challenge
DATA: d:\amazon ML challenge\data
Loaded train/test: (75000, 4) (75000, 3)
Candidate: d:\amazon ML challenge\images_dl jpg: 146588
Candidate: d:\amazon ML challenge\images jpg: 110
Candidate: d:\amazon ML challenge\data\images jpg: 0
Candidate: d:\amazon ML challenge\notebooks\images_dl jpg: 0
Using IMG_DIR: D:\amazon ML challenge\images_dl
GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
Found 256/256 images for quick test in images_dl
Timing encode on 128 imgs ...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Batches: 100%|██████████| 2/2 [00:05<00:00,  2.55s/it]


Throughput: 25.1 img/s (GPU OK)


In [1]:
# === Encode CLIP image embeddings with live progress + resume ===
import time, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import open_clip

# ---- CONFIG (edit only these if needed) ----
ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART  = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache";  EMB.mkdir(parents=True, exist_ok=True)
BATCH_SIZE   = 256
NUM_WORKERS  = 6
CHUNK_SIZE   = 10_000  # write partial every 10k rows
MODEL_NAME   = "ViT-B-32"
PRETRAINED   = "laion2b_s34b_b79k"
# -------------------------------------------

assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__, flush=True)

# Load ids
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")
tr_ids = train["sample_id"].tolist()
te_ids = test["sample_id"].tolist()
print(f"IDs loaded: train={len(tr_ids)}, test={len(te_ids)}", flush=True)

# Model + preprocess (fp16)
torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED, device=device)
model.eval()
model = model.to(dtype=torch.float16)
print("Model ready:", MODEL_NAME, PRETRAINED, flush=True)

# Dataset
class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]
        self.dir = Path(img_dir)
        self.preprocess = preprocess
        self.blank = Image.new("RGB", (224,224), color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]
        p = self.dir / f"{sid}.jpg"
        try:
            img = Image.open(p).convert("RGB")
        except Exception:
            img = self.blank
        return self.preprocess(img), sid

def encode_split_progress(ids, save_path, model, preprocess, img_dir,
                          batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, chunk_size=CHUNK_SIZE):
    save_path = Path(save_path)
    tmp_path  = save_path.with_suffix(".partial.npy")

    # Resume if partial exists
    start_idx = 0
    partial = None
    if tmp_path.exists():
        partial = np.load(tmp_path, mmap_mode="r")
        start_idx = partial.shape[0]
        print(f"[resume] {tmp_path.name}: {start_idx}/{len(ids)} rows", flush=True)

    # If full exists and matches length, return it
    if save_path.exists():
        arr = np.load(save_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print(f"[cache] Loaded {save_path.name}: {arr.shape}", flush=True)
            return np.array(arr)

    ds = ImgDS(ids, img_dir, preprocess, start=start_idx)
    if len(ds) == 0:
        if partial is not None:
            np.save(save_path, np.array(partial))
            tmp_path.unlink(missing_ok=True)
            print(f"[finalize] Saved {save_path.name} {partial.shape}", flush=True)
            return np.array(partial)
        else:
            raise RuntimeError("No samples to encode.")

    dl = DataLoader(
        ds, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True,
        persistent_workers=True
    )

    embs = []
    processed = start_idx
    t0 = time.perf_counter()
    last = t0
    print(f"[start] {save_path.name} | total={len(ids)} | bs={batch_size} | workers={num_workers}", flush=True)

    with torch.inference_mode():
        for bi, (xb, _) in enumerate(dl, 1):
            xb = xb.to(device, non_blocking=True).to(dtype=torch.float16)
            z  = model.encode_image(xb)
            z  = torch.nn.functional.normalize(z.float(), dim=1)
            embs.append(z.cpu())
            processed += xb.size(0)

            # progress line every ~5 batches
            if bi % 5 == 0:
                now = time.perf_counter()
                inst = (xb.size(0)*5) / (now - last)
                overall = processed / (now - t0)
                last = now
                print(f"[{processed:6d}/{len(ids)}] ~{int(overall)} img/s (inst ~{int(inst)})", flush=True)

            # periodic checkpoint
            if processed % chunk_size == 0:
                E_chunk = torch.cat(embs).numpy()
                if partial is not None:
                    E_chunk = np.vstack([partial, E_chunk])
                np.save(tmp_path, E_chunk)
                print(f"[ckpt] {tmp_path.name} -> {E_chunk.shape}", flush=True)
                embs, partial = [], E_chunk  # reset buffer

    # finalize
    E = torch.cat(embs).numpy() if embs else np.empty((0, 512), np.float32)
    if partial is not None:
        E = np.vstack([partial, E])
    np.save(save_path, E)
    tmp_path.unlink(missing_ok=True)
    dt = time.perf_counter() - t0
    print(f"[done] {save_path.name} {E.shape} | ~{int(len(ids)/dt)} img/s", flush=True)
    return E

# ---- CALL THE FUNCTION (this actually runs it) ----
train_out = EMB / "train_clip_openclip.npy"
test_out  = EMB / "test_clip_openclip.npy"

tr_img_vec = encode_split_progress(tr_ids, train_out, model, preprocess, IMG_DIR)
te_img_vec = encode_split_progress(te_ids,  test_out,  model, preprocess, IMG_DIR)

print("Embeddings ready:", tr_img_vec.shape, te_img_vec.shape, flush=True)


  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
IDs loaded: train=75000, test=75000
Model ready: ViT-B-32 laion2b_s34b_b79k
[cache] Loaded train_clip_openclip.npy: (75000, 512)
[cache] Loaded test_clip_openclip.npy: (75000, 512)
Embeddings ready: (75000, 512) (75000, 512)


In [2]:
# Rebuild dataset + model (safe on Windows)
import torch, time
from pathlib import Path
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import open_clip

ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB = ART / "emb_cache"; EMB.mkdir(parents=True, exist_ok=True)

assert torch.cuda.is_available(), "CUDA not available"
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| torch:", torch.__version__)

train = pd.read_csv(DATA/"train.csv"); test = pd.read_csv(DATA/"test.csv")
tr_ids = train["sample_id"].tolist(); te_ids = test["sample_id"].tolist()
print("IDs:", len(tr_ids), len(te_ids))

torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k", device=device
)
model.eval(); model = model.to(dtype=torch.float16)

class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]; self.dir = Path(img_dir)
        self.preprocess = preprocess; self.blank = Image.new("RGB",(224,224),color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]; p = self.dir / f"{sid}.jpg"
        try: img = Image.open(p).convert("RGB")
        except Exception: img = self.blank
        return self.preprocess(img), sid


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | torch: 2.6.0+cu124
IDs: 75000 75000


In [3]:
from torch.utils.data import DataLoader
ds_dbg = ImgDS(tr_ids, IMG_DIR, preprocess, start=0)
dl_dbg = DataLoader(ds_dbg, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
import time
t0 = time.perf_counter()
xb, _ = next(iter(dl_dbg))
print("Loaded batch:", xb.shape, "in", round(time.perf_counter()-t0,2), "s")
xb = xb.to("cuda", non_blocking=True).to(dtype=torch.float16)
with torch.inference_mode(): z = model.encode_image(xb)
print("Encoded OK, emb:", z.shape)


Loaded batch: torch.Size([64, 3, 224, 224]) in 1.73 s
Encoded OK, emb: torch.Size([64, 512])


In [4]:
# === Reliable OpenCLIP encoder (Windows-safe, resumable, frequent checkpoints) ===
import time, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import open_clip

# ---- Paths (your project) ----
ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART  = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache";  EMB.mkdir(parents=True, exist_ok=True)

# ---- Load data ----
train = pd.read_csv(DATA/"train.csv"); test  = pd.read_csv(DATA/"test.csv")
tr_ids = train["sample_id"].tolist(); te_ids = test["sample_id"].tolist()

# ---- GPU + model ----
assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"; torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k", device=device)
model.eval(); model = model.to(dtype=torch.float16)
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__)

# ---- Dataset ----
class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]; self.dir = Path(img_dir)
        self.preprocess = preprocess; self.blank = Image.new("RGB",(224,224),color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]; p = self.dir / f"{sid}.jpg"
        try: img = Image.open(p).convert("RGB")
        except Exception: img = self.blank
        return self.preprocess(img), sid

# ---- Resumable, frequent-checkpoint encoder (workers=0 to avoid Windows hangs) ----
from torch.utils.data import DataLoader

def encode_split_solo(ids, out_path, batch_size=256, chunk=2_000):
    out_path = Path(out_path); tmp = out_path.with_suffix(".partial.npy")

    # resume if partial exists
    start = 0; part = None
    if tmp.exists():
        part = np.load(tmp, mmap_mode="r"); start = part.shape[0]
        print(f"[resume] {tmp.name}: {start}/{len(ids)}")

    # if full exists, return
    if out_path.exists():
        arr = np.load(out_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print(f"[cache] {out_path.name}: {arr.shape}"); return np.array(arr)

    ds = ImgDS(ids, IMG_DIR, preprocess, start=start)
    if len(ds)==0:
        if part is not None:
            np.save(out_path, np.array(part)); tmp.unlink(missing_ok=True)
            print(f"[finalize] {out_path.name} {part.shape}"); return np.array(part)
        raise RuntimeError("No samples to encode.")

    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    embs=[]; done=start; t0=time.perf_counter(); last=t0
    print(f"[start] {out_path.name} total={len(ids)} bs={batch_size} workers=0")

    with torch.inference_mode():
        for i,(xb,_) in enumerate(dl,1):
            xb = xb.to("cuda", non_blocking=True).to(dtype=torch.float16)
            z = model.encode_image(xb)
            z = torch.nn.functional.normalize(z.float(), dim=1)
            embs.append(z.cpu()); done += xb.size(0)

            if i%5==0:
                now=time.perf_counter()
                inst = (xb.size(0)*5)/(now-last); overall = done/(now-t0); last=now
                print(f"[{done:6d}/{len(ids)}] ~{int(overall)} img/s (inst ~{int(inst)})", flush=True)

            if done%chunk==0:
                E = torch.cat(embs).numpy()
                if part is not None: E = np.vstack([part,E])
                np.save(tmp, E); print(f"[ckpt] {tmp.name} -> {E.shape}", flush=True)
                embs=[]; part=E

    E = torch.cat(embs).numpy() if embs else np.empty((0,512),np.float32)
    if part is not None: E = np.vstack([part,E])
    np.save(out_path, E); tmp.unlink(missing_ok=True)
    dt=time.perf_counter()-t0
    print(f"[done] {out_path.name} {E.shape} ~{int(len(ids)/dt)} img/s"); return E

# ---- RUN (resumable; prints progress quickly) ----
tr_emb = encode_split_solo(tr_ids, EMB/"train_clip_openclip.npy", batch_size=256, chunk=2_000)
te_emb = encode_split_solo(te_ids, EMB/"test_clip_openclip.npy",  batch_size=256, chunk=2_000)
print("Embeddings:", tr_emb.shape, te_emb.shape)


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
[cache] train_clip_openclip.npy: (75000, 512)
[cache] test_clip_openclip.npy: (75000, 512)
Embeddings: (75000, 512) (75000, 512)


In [5]:
from sklearn.linear_model import Ridge
import numpy as np, pandas as pd
from pathlib import Path

ART = Path(r"D:\amazon ML challenge\artifacts")

y = train["price"].to_numpy(float)
ridge_i = Ridge(alpha=1.0, random_state=42).fit(tr_emb, y)
pred_img = np.clip(ridge_i.predict(te_emb), 1e-6, None)

pd.DataFrame({"sample_id": test["sample_id"], "price": pred_img}).to_csv(ART/"submission_img_clip.csv", index=False)
print("Saved:", (ART/"submission_img_clip.csv").resolve())

Saved: D:\amazon ML challenge\artifacts\submission_img_clip.csv


In [6]:
import pandas as pd
from pathlib import Path

ART = Path(r"D:\amazon ML challenge\artifacts")
df_txt = pd.read_csv(ART/"submission_wc_log.csv")      # your best log-target text model
df_img = pd.read_csv(ART/"submission_img_clip.csv")    # just created

m = df_txt.merge(df_img, on="sample_id", suffixes=("_txt","_img"))
m["price"] = 0.9*m["price_txt"] + 0.1*m["price_img"]   # conservative blend
m[["sample_id","price"]].to_csv(ART/"submission_ensemble_img_v1.csv", index=False)
print("Saved:", (ART/"submission_ensemble_img_v1.csv").resolve())


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_img_v1.csv


In [7]:
import pandas as pd
from pathlib import Path

ROOT = Path(r"D:\amazon ML challenge")
ART, DATA = ROOT/"artifacts", ROOT/"data"

base = "submission_ensemble_img_v1.csv"  # swap to "submission_wc_log.csv" if you skip images
df = pd.read_csv(ART/base)[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

order = pd.read_csv(DATA/"test.csv")["sample_id"]
final = order.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()
final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [8]:
for w in (0.95, 0.9, 0.85, 0.8):
    out = ART/f"submission_blend_text{int(w*100)}_img{int((1-w)*100)}.csv"
    tmp = m.copy()
    tmp["price"] = w*tmp["price_txt"] + (1-w)*tmp["price_img"]
    tmp[["sample_id","price"]].to_csv(out, index=False)
    print("Saved:", out.name)


Saved: submission_blend_text95_img5.csv
Saved: submission_blend_text90_img9.csv
Saved: submission_blend_text85_img15.csv
Saved: submission_blend_text80_img19.csv


In [9]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path(r"D:\amazon ML challenge")
ART, DATA = ROOT/"artifacts", ROOT/"data"

# Load predictions (choose one)
pred_path = ART/"submission_ensemble_img_v1.csv"   # or ART/"submission_wc_log.csv"
pred = pd.read_csv(pred_path, dtype={"sample_id": int, "price": float})

# Load official test IDs and align EXACTLY
test_ids = pd.read_csv(DATA/"test.csv", usecols=["sample_id"], dtype={"sample_id": int})
out = test_ids.merge(pred, on="sample_id", how="left")

# Coerce numeric, clip, and format
out["price"] = pd.to_numeric(out["price"], errors="coerce").fillna(0).clip(lower=1e-6)

# Final write: EXACT TWO COLUMNS, NO INDEX, COMMA delimiter, UTF-8, fixed decimals
final_path = ART/"test_out.csv"
out[["sample_id","price"]].to_csv(final_path, index=False, encoding="utf-8", float_format="%.6f")
print("WROTE:", final_path.resolve(), "| rows:", len(out))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [10]:
from pathlib import Path
import pandas as pd
import numpy as np

ROOT = Path(r"D:\amazon ML challenge")
ART, DATA = ROOT/"artifacts", ROOT/"data"

# Load predictions (choose one)
pred_path = ART/"submission_ensemble_img_v1.csv"   # or ART/"submission_wc_log.csv"
pred = pd.read_csv(pred_path, dtype={"sample_id": int, "price": float})

# Load official test IDs and align EXACTLY
test_ids = pd.read_csv(DATA/"test.csv", usecols=["sample_id"], dtype={"sample_id": int})
out = test_ids.merge(pred, on="sample_id", how="left")

# Coerce numeric, clip, and format
out["price"] = pd.to_numeric(out["price"], errors="coerce").fillna(0).clip(lower=1e-6)

# Final write: EXACT TWO COLUMNS, NO INDEX, COMMA delimiter, UTF-8, fixed decimals
final_path = ART/"test_out.csv"
out[["sample_id","price"]].to_csv(final_path, index=False, encoding="utf-8", float_format="%.6f")
print("WROTE:", final_path.resolve(), "| rows:", len(out))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [11]:
import pandas as pd
import numpy as np

df = pd.read_csv(final_path, dtype={"sample_id": int, "price": float})

assert list(df.columns)==["sample_id","price"], "Headers must be exactly: sample_id,price"
assert len(df)==75000, f"Expected 75,000 rows, got {len(df)}"
assert df["sample_id"].is_unique, "sample_id must be unique"
assert df["price"].notna().all(), "price contains NaNs"
assert (df["price"]>0).all(), "price must be positive"
# Optional: ensure IDs match test exactly (order doesn’t matter)
ids_expected = set(test_ids["sample_id"])
ids_actual   = set(df["sample_id"])
assert ids_actual==ids_expected, f"sample_id mismatch: diff={len(ids_expected.symmetric_difference(ids_actual))}"
print("Validation ✅ — good to upload.")


Validation ✅ — good to upload.


In [13]:
# ==== fit_all ====
import numpy as np, pandas as pd, re, os, warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import Ridge
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

# ---------- Utils ----------
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_true - y_pred) / np.clip(denom, 1e-9, None)
    return np.mean(diff) * 100.0

def to_log(y):  # safe log for positive prices
    return np.log1p(np.maximum(y, 1e-6))

def from_log(yhat):
    return np.expm1(yhat)

rng = np.random.RandomState(42)

# ---------- Inputs / paths ----------
ART, DATA = Path("artifacts"), Path("data")
ART.mkdir(exist_ok=True, parents=True)

# Basic checks
assert {"sample_id","price"}.issubset(set(train.columns)), "train must have sample_id, price"
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
assert TEXT_COL in train.columns and TEXT_COL in test.columns, f"Missing {TEXT_COL} in train/test"

# ---------- Text meta features ----------
_UNIT_MAP = {"kg": 1000.0, "g": 1.0, "l": 1000.0, "ml": 1.0, "litre": 1000.0, "liter": 1000.0}
_QTY_RE = re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|liter|litre|g|kg)\b', flags=re.I)
_PACK_RE = re.compile(r'(?:pack\s*of|x|×)\s*(\d{1,3})', flags=re.I)
_NUMTOK_RE = re.compile(r'\d+')

def extract_meta(s: str):
    if not isinstance(s, str): s = ""
    s_l = s.lower()

    # pack size
    pack = None
    m = _PACK_RE.search(s_l)
    if m:
        try: pack = int(m.group(1))
        except: pack = None

    # quantity (convert to base: g or ml)
    qty = 0.0
    for q, u in _QTY_RE.findall(s_l):
        q = float(q); u = u.lower()
        if u in ("kg","g"):
            qty += q * (1000.0 if u == "kg" else 1.0)   # grams base
        else:
            qty += q * (1000.0 if u in ("l","liter","litre") else 1.0)  # ml base

    # simple stats
    n_chars = len(s_l)
    n_spaces = s_l.count(" ")
    n_digits = len(_NUMTOK_RE.findall(s_l))
    avg_tok_len = n_chars / (n_spaces + 1)

    # naive brand
    brand = re.sub(r'[^a-z0-9]+', '', s_l.strip().split(' ')[0])[:20]
    has_percent = ('%' in s_l)

    return pd.Series({
        "meta_pack": pack if pack is not None else 1,
        "meta_qty": qty,
        "meta_len": n_chars,
        "meta_digits": n_digits,
        "meta_avg_tok_len": avg_tok_len,
        "meta_has_pct": int(has_percent),
        "meta_brand": brand
    })

def build_meta(df, text_col=TEXT_COL):
    M = df[text_col].apply(extract_meta)
    topb = M["meta_brand"].value_counts().head(100).index
    M["meta_brand_top"] = np.where(M["meta_brand"].isin(topb), M["meta_brand"], "other")
    M = pd.get_dummies(M, columns=["meta_brand_top"], drop_first=True)
    for c in ["meta_qty","meta_len","meta_digits","meta_avg_tok_len","meta_pack"]:
        M[c] = np.log1p(M[c].astype(float))
    return M

# ---------- 1) Strong TF-IDF Ridge on log(price) ----------
vec_word = TfidfVectorizer(
    ngram_range=(1,2), max_features=300_000,
    min_df=3, max_df=0.9, sublinear_tf=True
)
vec_char = TfidfVectorizer(
    analyzer="char_wb", ngram_range=(3,6),
    max_features=300_000, min_df=3, sublinear_tf=True
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
y_log = to_log(train["price"].values)
oof_log_text = np.zeros(len(train))
preds_text_log_test = np.zeros(len(test))

for tr, va in kf.split(train):
    Xtr = train.iloc[tr][TEXT_COL].fillna("")
    Xva = train.iloc[va][TEXT_COL].fillna("")
    ytr, yva = y_log[tr], y_log[va]

    uni = FeatureUnion([("w", vec_word), ("c", vec_char)])
    Xtrv = uni.fit_transform(Xtr)
    Xvav = uni.transform(Xva)

    model = Ridge(alpha=1.1, random_state=42)
    model.fit(Xtrv, ytr)
    oof_log_text[va] = model.predict(Xvav)

    preds_text_log_test += model.predict(uni.transform(test[TEXT_COL].fillna("")))

preds_text_log_test /= kf.get_n_splits()
text_test_pred = from_log(preds_text_log_test)
print("OOF SMAPE (Text Ridge):", smape(train["price"].values, from_log(oof_log_text)))

# ---------- 2) Dense model: TF-IDF→SVD + Meta + (optional) Image → LightGBM ----------
# 2.1 Word TF-IDF for SVD
vec_word_svd = TfidfVectorizer(ngram_range=(1,2), max_features=300_000, min_df=3, max_df=0.95, sublinear_tf=True)
Xw_tr = vec_word_svd.fit_transform(train[TEXT_COL].fillna(""))
Xw_te = vec_word_svd.transform(test[TEXT_COL].fillna(""))

svd = TruncatedSVD(n_components=256, random_state=42)
Xw_tr_svd = svd.fit_transform(Xw_tr)
Xw_te_svd = svd.transform(Xw_te)

# 2.2 Meta features
M_tr = build_meta(train, TEXT_COL)
M_te = build_meta(test, TEXT_COL)

# 2.3 Image embeddings (if provided) → PCA to 64
use_image = ("tr_emb" in globals()) and ("te_emb" in globals()) and (tr_emb is not None) and (te_emb is not None)
if use_image:
    pca = PCA(n_components=64, random_state=42)
    img_tr_64 = pca.fit_transform(tr_emb)
    img_te_64 = pca.transform(te_emb)
else:
    img_tr_64 = np.zeros((len(train), 0))
    img_te_64 = np.zeros((len(test), 0))

# 2.4 Assemble dense matrices
Xtr_dense = np.hstack([Xw_tr_svd, M_tr.to_numpy(), img_tr_64])
Xte_dense = np.hstack([Xw_te_svd, M_te.to_numpy(), img_te_64])

# 2.5 LightGBM on log(price)
params = dict(
    objective="regression",
    metric="mae",
    learning_rate=0.05,
    num_leaves=64,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    n_estimators=4000,
    verbose=-1
)

oof_log_gbm = np.zeros(len(train))
gbm_test_log = np.zeros(len(test))

for tr, va in kf.split(Xtr_dense):
    dtr = lgb.Dataset(Xtr_dense[tr], label=y_log[tr])
    dva = lgb.Dataset(Xtr_dense[va], label=y_log[va])
    model = lgb.train(
        params, dtr, valid_sets=[dva], num_boost_round=4000,
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    oof_log_gbm[va] = model.predict(Xtr_dense[va])
    gbm_test_log += model.predict(Xte_dense)

gbm_test_log /= kf.get_n_splits()
dense_test_pred = from_log(gbm_test_log)
print("OOF SMAPE (Dense LightGBM):", smape(train["price"].values, from_log(oof_log_gbm)))

# ---------- 3) Price-band experts (test-time only unless you add OOF) ----------
lp = y_log
q1, q2 = np.quantile(lp, [0.33, 0.66])
m_lo = lp <= q1
m_md = (lp > q1) & (lp <= q2)
m_hi = lp > q2

def fit_text_ridge_subset(X, y, alpha=1.1):
    v_w = TfidfVectorizer(ngram_range=(1,2), max_features=250_000, min_df=3, sublinear_tf=True)
    v_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=250_000, min_df=3, sublinear_tf=True)
    uni = FeatureUnion([("w", v_w), ("c", v_c)])
    Xv = uni.fit_transform(X)
    mdl = Ridge(alpha=alpha, random_state=42).fit(Xv, y)
    return uni, mdl

Xtxt = train[TEXT_COL].fillna("")
uni_lo, mdl_lo = fit_text_ridge_subset(Xtxt[m_lo], lp[m_lo])
uni_md, mdl_md = fit_text_ridge_subset(Xtxt[m_md], lp[m_md])
uni_hi, mdl_hi = fit_text_ridge_subset(Xtxt[m_hi], lp[m_hi])

# soft gating by distance to cluster centers (log-space)
c_lo, c_md, c_hi = lp[m_lo].mean(), lp[m_md].mean(), lp[m_hi].mean()
def soft_weights(lv):
    d = np.stack([np.abs(lv - c_lo), np.abs(lv - c_md), np.abs(lv - c_hi)], 1)
    w = 1/(d+1e-6); w = w / w.sum(1, keepdims=True)
    return w

Xtest_txt = test[TEXT_COL].fillna("")
p_lo = mdl_lo.predict(uni_lo.transform(Xtest_txt))
p_md = mdl_md.predict(uni_md.transform(Xtest_txt))
p_hi = mdl_hi.predict(uni_hi.transform(Xtest_txt))

log_anchor = np.log1p(np.maximum(text_test_pred, 1e-6))
W = soft_weights(log_anchor)
moe_log = W[:,0]*p_lo + W[:,1]*p_md + W[:,2]*p_hi
experts_test_pred = from_log(moe_log)

print("fit_all done. Saved vars: oof_log_text, oof_log_gbm, text_test_pred, dense_test_pred, experts_test_pred, y_log")


KeyboardInterrupt: 

In [14]:
# ---------- 2.5 CatBoost (GPU) on log(price) ----------
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold

# CatBoost params tuned for fast, strong GPU training
cb_params = dict(
    loss_function="MAE",        # training on log(price), MAE behaves close to SMAPE in log-space
    eval_metric="MAE",
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3.0,
    iterations=6000,            # early stopping will cut this
    random_seed=42,
    task_type="GPU",            # <-- USE GPU
    devices="0",                # your 4070
    bootstrap_type="Bayesian",
    subsample=0.8,
    rsm=0.8,                    # feature subsampling
    verbose=False
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_gbm = np.zeros(len(train))   # keep the same variable name so the rest of your notebook works
gbm_test_log = np.zeros(len(test))

for tr_idx, va_idx in kf.split(Xtr_dense):
    tr_pool = Pool(Xtr_dense[tr_idx], label=y_log[tr_idx])
    va_pool = Pool(Xtr_dense[va_idx], label=y_log[va_idx])

    model = CatBoostRegressor(**cb_params)
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True, early_stopping_rounds=200, verbose=False)

    oof_log_gbm[va_idx] = model.predict(va_pool)
    gbm_test_log += model.predict(Pool(Xte_dense))

gbm_test_log /= kf.get_n_splits()
dense_test_pred = from_log(gbm_test_log)
print("OOF SMAPE (Dense CatBoost GPU):", smape(train['price'].values, from_log(oof_log_gbm)))


ImportError: DLL load failed while importing _catboost: The specified module could not be found.

In [15]:
# ---------- 2.5 XGBoost (GPU) on log(price) ----------
import xgboost as xgb
from sklearn.model_selection import KFold

xgb_params = dict(
    objective="reg:squarederror",   # we'll optimize early-stopping with MAE metric
    tree_method="gpu_hist",         # <-- GPU
    predictor="gpu_predictor",
    eval_metric="mae",
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_estimators=6000
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_gbm = np.zeros(len(train))
gbm_test_log = np.zeros(len(test))

for tr_idx, va_idx in kf.split(Xtr_dense):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_log[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_log[va_idx])
    dte = xgb.DMatrix(Xte_dense)

    model = xgb.train(
        xgb_params, dtr,
        num_boost_round=6000,
        evals=[(dva, "valid")],
        early_stopping_rounds=200,
        verbose_eval=False
    )
    oof_log_gbm[va_idx] = model.predict(dva, iteration_range=(0, model.best_iteration+1))
    gbm_test_log += model.predict(dte, iteration_range=(0, model.best_iteration+1))

gbm_test_log /= kf.get_n_splits()
dense_test_pred = from_log(gbm_test_log)
print("OOF SMAPE (Dense XGBoost GPU):", smape(train["price"].values, from_log(oof_log_gbm)))


NameError: name 'Xtr_dense' is not defined

In [28]:
# ==== Price-band experts with OOF (word+char TF-IDF Ridge on log price) ====
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from scipy.sparse import hstack

def to_log(y): return np.log1p(np.maximum(y, 1e-6))
def from_log(yhat): return np.expm1(yhat)
def smape(y_true,y_pred):
    y_true=np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str)
te_txt = test[TEXT_COL].fillna("").astype(str)

# shared TF-IDF space for all experts (saves time)
vec_w = TfidfVectorizer(ngram_range=(1,2), max_features=250_000, min_df=3, max_df=0.9, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=250_000, min_df=3, sublinear_tf=True)
Xw = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
Xc = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)
X  = hstack([Xw, Xc], format="csr"); Xte = hstack([Xw_te, Xc_te], format="csr")

lp = to_log(train["price"].values)
q1, q2 = np.quantile(lp, [0.33, 0.66])

def mask_bins(y):
    return (y<=q1), ((y>q1)&(y<=q2)), (y>q2)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_experts = np.zeros(len(train))
experts_log_test = np.zeros(len(test))

for tr_idx, va_idx in kf.split(X):
    ytr = lp[tr_idx]
    m_lo, m_md, m_hi = mask_bins(ytr)

    # train three subset Ridges
    r_lo = Ridge(alpha=1.1).fit(X[tr_idx][m_lo], ytr[m_lo])
    r_md = Ridge(alpha=1.1).fit(X[tr_idx][m_md], ytr[m_md])
    r_hi = Ridge(alpha=1.1).fit(X[tr_idx][m_hi], ytr[m_hi])

    # gating by distance to bin means (using a quick anchor from global text oof/train)
    c_lo, c_md, c_hi = ytr[m_lo].mean(), ytr[m_md].mean(), ytr[m_hi].mean()

    # predict on fold valid & test
    p_lo_va = r_lo.predict(X[va_idx]); p_md_va = r_md.predict(X[va_idx]); p_hi_va = r_hi.predict(X[va_idx])
    p_lo_te = r_lo.predict(Xte);      p_md_te = r_md.predict(Xte);      p_hi_te = r_hi.predict(Xte)

    # anchor: use global text oof for va; for test use text_test_pred (already computed)
    anchor_va = np.log1p(np.maximum(from_log(oof_log_text[va_idx]),1e-6))
    anchor_te = np.log1p(np.maximum(text_test_pred,1e-6))

    def soft_weights(anchor, c_lo, c_md, c_hi):
        d = np.stack([np.abs(anchor-c_lo), np.abs(anchor-c_md), np.abs(anchor-c_hi)], 1)
        w = 1/(d+1e-6); w /= w.sum(1, keepdims=True); return w

    W_va = soft_weights(anchor_va, c_lo, c_md, c_hi)
    W_te = soft_weights(anchor_te, c_lo, c_md, c_hi)

    oof_log_experts[va_idx] = W_va[:,0]*p_lo_va + W_va[:,1]*p_md_va + W_va[:,2]*p_hi_va
    experts_log_test       += W_te[:,0]*p_lo_te + W_te[:,1]*p_md_te + W_te[:,2]*p_hi_te

experts_log_test /= kf.get_n_splits()
experts_test_pred = from_log(experts_log_test)
print("OOF SMAPE (Experts):", smape(train["price"].values, from_log(oof_log_experts)))

# Re-blend with experts OOF included
oof_text  = from_log(oof_log_text)
oof_dense = from_log(oof_log_gbm)
oof_expt  = from_log(oof_log_experts)

cands=[]
for wt in np.linspace(0.40,0.80,9):
    for wd in np.linspace(0.10,0.45,8):
        we = 1.0 - wt - wd
        if we>=0:
            pred = wt*oof_text + wd*oof_dense + we*oof_expt
            cands.append((smape(train["price"].values, pred), wt, wd, we))
best = min(cands, key=lambda x: x[0])
bsm, WT, WD, WE = best
print(f"New OOF blend with experts → SMAPE={bsm:.4f}, WT={WT:.3f}, WD={WD:.3f}, WE={WE:.3f}")

final_test_pred = WT*text_test_pred + WD*dense_test_pred + WE*experts_test_pred


OOF SMAPE (Experts): 54.651830986604956
New OOF blend with experts → SMAPE=50.0979, WT=0.500, WD=0.450, WE=0.050


In [30]:
# ==== Rebuild dense features with word+char TF-IDF -> SVD(320) ====
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import numpy as np, re, pandas as pd

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str); te_txt = test[TEXT_COL].fillna("").astype(str)

# word + char TF-IDF
vec_w = TfidfVectorizer(ngram_range=(1,2), max_features=200_000, min_df=3, max_df=0.95, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=100_000, min_df=3, sublinear_tf=True)
Xw_tr = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
Xc_tr = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)

from scipy.sparse import hstack
X_tr = hstack([Xw_tr, Xc_tr], format="csr")
X_te = hstack([Xw_te, Xc_te], format="csr")

svd = TruncatedSVD(n_components=320, random_state=42)  # bump to 320
Xw_tr_svd = svd.fit_transform(X_tr)
Xw_te_svd = svd.transform(X_te)

# reuse your meta + (optional) image PCA from earlier run if in memory:
# if not, rebuild quickly (same as before)
def _meta_extract(df):
    _Q=re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|liter|litre|g|kg)\b', re.I)
    _P=re.compile(r'(?:pack\s*of|x|×)\s*(\d{1,3})', re.I)
    _D=re.compile(r'\d+')
    out=[]
    for s in df[TEXT_COL].fillna("").astype(str):
        s_l=s.lower(); pack=1; m=_P.search(s_l)
        if m:
            try: pack=int(m.group(1))
            except: pack=1
        qty=0.0
        for q,u in _Q.findall(s_l):
            q=float(q); u=u.lower()
            qty+= q*(1000.0 if (u in ("kg","l","liter","litre")) else 1.0)
        n_chars=len(s_l); n_spaces=s_l.count(" "); n_digits=len(_D.findall(s_l))
        avg_tok = n_chars/(n_spaces+1)
        brand = re.sub(r'[^a-z0-9]+','', s_l.strip().split(' ')[0])[:20]
        out.append((pack,qty,n_chars,n_digits,avg_tok,int('%' in s_l),brand))
    M=pd.DataFrame(out, columns=["meta_pack","meta_qty","meta_len","meta_digits","meta_avg_tok_len","meta_has_pct","meta_brand"])
    topb=M["meta_brand"].value_counts().head(100).index
    M["meta_brand_top"]=np.where(M["meta_brand"].isin(topb),M["meta_brand"],"other")
    M=M.drop(columns=["meta_brand"])
    M=pd.get_dummies(M, columns=["meta_brand_top"], drop_first=True)
    for c in ["meta_pack","meta_qty","meta_len","meta_digits","meta_avg_tok_len"]: M[c]=np.log1p(M[c].astype(float))
    return M

M_tr = _meta_extract(train); M_te = _meta_extract(test)
all_cols = sorted(set(M_tr.columns)|set(M_te.columns))
M_tr = M_tr.reindex(columns=all_cols, fill_value=0).to_numpy(np.float32)
M_te = M_te.reindex(columns=all_cols, fill_value=0).to_numpy(np.float32)

use_img = ('tr_emb' in globals()) and (tr_emb is not None) and ('te_emb' in globals()) and (te_emb is not None)
if use_img:
    pca = PCA(n_components=64, random_state=42)
    img_tr_64 = pca.fit_transform(tr_emb)
    img_te_64 = pca.transform(te_emb)
else:
    img_tr_64 = np.zeros((len(train),0), np.float32)
    img_te_64 = np.zeros((len(test),0),  np.float32)

Xtr_dense = np.hstack([Xw_tr_svd, M_tr, img_tr_64]).astype(np.float32)
Xte_dense = np.hstack([Xw_te_svd, M_te, img_te_64]).astype(np.float32)
print("Rebuilt dense:", Xtr_dense.shape, Xte_dense.shape)


Rebuilt dense: (75000, 390) (75000, 390)


In [29]:
# ==== Robust blend + submit (no dependency on data/test.csv) ====
import numpy as np, pandas as pd
from pathlib import Path

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0

def from_log(yhat): return np.expm1(yhat)

# sanity: required arrays from earlier steps
assert 'oof_log_text' in globals() and 'oof_log_gbm' in globals(), "Missing OOF arrays"
assert 'text_test_pred' in globals() and 'dense_test_pred' in globals(), "Missing test preds"

oof_text  = from_log(oof_log_text)
oof_dense = from_log(oof_log_gbm)

# OOF-tuned weights (recompute here so WT/WD exist even if you cleared state)
cands=[]
for wt in np.linspace(0.55, 0.95, 9):
    for wd in np.linspace(0.05, 0.45, 9):
        if wt+wd <= 1.0:
            pred = wt*oof_text + wd*oof_dense
            cands.append((smape(train["price"].values, pred), wt, wd))
best_smape, WT, WD = min(cands, key=lambda x: x[0])
WE = max(0.0, 1.0 - WT - WD)  # experts (if present)
WE = min(0.05, WE)            # cap to 5% without OOF

experts_part = (experts_test_pred if 'experts_test_pred' in globals() else 0.0)
final_test_pred = WT*text_test_pred + WD*dense_test_pred + WE*experts_part

# Write submission using in-memory test order when data/test.csv doesn't exist
ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
order_path = Path("data")/"test.csv"
if order_path.exists():
    order = pd.read_csv(order_path, usecols=["sample_id"])
    print("Preserving order from data/test.csv")
else:
    order = test[["sample_id"]].copy()
    print("data/test.csv not found — using in-memory test order")

sub = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final_test_pred, 1e-6, None)})
out = order.merge(sub, on="sample_id", how="left")

# validations
assert list(out.columns)==["sample_id","price"]
assert len(out)==len(order)==len(test)
assert out["price"].notna().all() and (out["price"]>0).all()

out_path = ART/"test_out.csv"
out.to_csv(out_path, index=False, float_format="%.6f")
print(f"✅ test_out.csv ready: {out_path.resolve()}")
print(f"Blend weights → Text:{WT:.3f} Dense:{WD:.3f} Experts:{WE:.3f} | OOF SMAPE≈{best_smape:.4f}")


data/test.csv not found — using in-memory test order
✅ test_out.csv ready: D:\amazon ML challenge\notebooks\artifacts\test_out.csv
Blend weights → Text:0.550 Dense:0.400 Experts:0.050 | OOF SMAPE≈49.8933


In [31]:
# ==== GPU ASSERT CELL: verify GPU + xgboost build ====
import os, sys, subprocess, json, importlib
import numpy as np

# Show GPU
try:
    print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout.splitlines()[0])
except Exception as e:
    print("nvidia-smi not found or not working:", e)

# XGBoost version
import xgboost as xgb
print("xgboost version:", xgb.__version__)

# Force this Python to use GPU 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Build version-aware GPU params
from packaging import version
xgb_params = dict(
    objective="reg:squarederror",
    eval_metric="mae",
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_estimators=6000,
    verbosity=2,   # show logs stating CUDA usage
)

if version.parse(xgb.__version__) >= version.parse("2.0.0"):
    # v2+ prefers device='cuda' and tree_method='hist'
    xgb_params.update(device="cuda", tree_method="hist")
else:
    # v1.x uses gpu_hist/predictor
    xgb_params.update(tree_method="gpu_hist", predictor="gpu_predictor")

print("Using params:", xgb_params)


Mon Oct 13 19:53:00 2025       
xgboost version: 3.0.5
Using params: {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'learning_rate': 0.05, 'max_depth': 8, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_lambda': 1.0, 'n_estimators': 6000, 'verbosity': 2, 'device': 'cuda', 'tree_method': 'hist'}


In [32]:
# ==== Retrain XGBoost on GPU with verbose CUDA logs ====
import xgboost as xgb
import numpy as np
from sklearn.model_selection import KFold

def from_log(yhat): return np.expm1(yhat)
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0

# Safety: make sure dense matrices exist
assert "Xtr_dense" in globals() and "Xte_dense" in globals(), "Run the dense feature build cell first."
assert "y_log" in globals(), "y_log not found (log(price) target)."

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_gbm = np.zeros(len(train))
gbm_test_log = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr_dense), 1):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_log[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_log[va_idx])
    dte = xgb.DMatrix(Xte_dense)

    print(f"\n=== Fold {fold} training (expect CUDA / GPU logs below) ===")
    model = xgb.train(
        params=xgb_params,
        dtrain=dtr,
        num_boost_round=6000,
        evals=[(dva, "valid")],
        early_stopping_rounds=200,
        verbose_eval=True  # show logs; should mention GPU/CUDA
    )

    oof_log_gbm[va_idx] = model.predict(dva, iteration_range=(0, model.best_iteration + 1))
    gbm_test_log += model.predict(dte, iteration_range=(0, model.best_iteration + 1))

gbm_test_log /= kf.get_n_splits()
dense_test_pred = from_log(gbm_test_log)

print("\nOOF SMAPE (Dense XGBoost GPU):", smape(train["price"].values, from_log(oof_log_gbm)))
print("✅ Dense XGBoost GPU done.")



=== Fold 1 training (expect CUDA / GPU logs below) ===
[19:53:50] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\simple_dmatrix.cc:142: Generating new Ellpack page.
[19:53:50] INFO: C:\actions-runner\_work\xgboost\xgboost\src\data\ellpack_page.cu:167: Ellpack is dense.
[0]	valid-mae:0.76099
[1]	valid-mae:0.75204
[2]	valid-mae:0.74385
[3]	valid-mae:0.73662
[4]	valid-mae:0.72934
[5]	valid-mae:0.72275
[6]	valid-mae:0.71629
[7]	valid-mae:0.71019
[8]	valid-mae:0.70467
[9]	valid-mae:0.69928
[10]	valid-mae:0.69381
[11]	valid-mae:0.68896
[12]	valid-mae:0.68374
[13]	valid-mae:0.67909
[14]	valid-mae:0.67477
[15]	valid-mae:0.67066
[16]	valid-mae:0.66700
[17]	valid-mae:0.66331
[18]	valid-mae:0.65942
[19]	valid-mae:0.65622
[20]	valid-mae:0.65294
[21]	valid-mae:0.64956
[22]	valid-mae:0.64659
[23]	valid-mae:0.64380
[24]	valid-mae:0.64105
[25]	valid-mae:0.63875
[26]	valid-mae:0.63633
[27]	valid-mae:0.63427
[28]	valid-mae:0.63234
[29]	valid-mae:0.63007
[30]	valid-mae:0.62791
[31]	valid-mae:0.6

In [33]:
# ==== Price-band experts with OOF (word+char TF-IDF Ridge on log price) ====
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from scipy.sparse import hstack

def to_log(y): return np.log1p(np.maximum(y, 1e-6))
def from_log(yhat): return np.expm1(yhat)
def smape(y_true,y_pred):
    y_true=np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str)
te_txt = test[TEXT_COL].fillna("").astype(str)

# shared TF-IDF space for all experts (saves time)
vec_w = TfidfVectorizer(ngram_range=(1,2), max_features=250_000, min_df=3, max_df=0.9, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=250_000, min_df=3, sublinear_tf=True)
Xw = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
Xc = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)
X  = hstack([Xw, Xc], format="csr"); Xte = hstack([Xw_te, Xc_te], format="csr")

lp = to_log(train["price"].values)
q1, q2 = np.quantile(lp, [0.33, 0.66])

def mask_bins(y):
    return (y<=q1), ((y>q1)&(y<=q2)), (y>q2)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_experts = np.zeros(len(train))
experts_log_test = np.zeros(len(test))

for tr_idx, va_idx in kf.split(X):
    ytr = lp[tr_idx]
    m_lo, m_md, m_hi = mask_bins(ytr)

    # train three subset Ridges
    r_lo = Ridge(alpha=1.1).fit(X[tr_idx][m_lo], ytr[m_lo])
    r_md = Ridge(alpha=1.1).fit(X[tr_idx][m_md], ytr[m_md])
    r_hi = Ridge(alpha=1.1).fit(X[tr_idx][m_hi], ytr[m_hi])

    # gating by distance to bin means (using a quick anchor from global text oof/train)
    c_lo, c_md, c_hi = ytr[m_lo].mean(), ytr[m_md].mean(), ytr[m_hi].mean()

    # predict on fold valid & test
    p_lo_va = r_lo.predict(X[va_idx]); p_md_va = r_md.predict(X[va_idx]); p_hi_va = r_hi.predict(X[va_idx])
    p_lo_te = r_lo.predict(Xte);      p_md_te = r_md.predict(Xte);      p_hi_te = r_hi.predict(Xte)

    # anchor: use global text oof for va; for test use text_test_pred (already computed)
    anchor_va = np.log1p(np.maximum(from_log(oof_log_text[va_idx]),1e-6))
    anchor_te = np.log1p(np.maximum(text_test_pred,1e-6))

    def soft_weights(anchor, c_lo, c_md, c_hi):
        d = np.stack([np.abs(anchor-c_lo), np.abs(anchor-c_md), np.abs(anchor-c_hi)], 1)
        w = 1/(d+1e-6); w /= w.sum(1, keepdims=True); return w

    W_va = soft_weights(anchor_va, c_lo, c_md, c_hi)
    W_te = soft_weights(anchor_te, c_lo, c_md, c_hi)

    oof_log_experts[va_idx] = W_va[:,0]*p_lo_va + W_va[:,1]*p_md_va + W_va[:,2]*p_hi_va
    experts_log_test       += W_te[:,0]*p_lo_te + W_te[:,1]*p_md_te + W_te[:,2]*p_hi_te

experts_log_test /= kf.get_n_splits()
experts_test_pred = from_log(experts_log_test)
print("OOF SMAPE (Experts):", smape(train["price"].values, from_log(oof_log_experts)))

# Re-blend with experts OOF included
oof_text  = from_log(oof_log_text)
oof_dense = from_log(oof_log_gbm)
oof_expt  = from_log(oof_log_experts)

cands=[]
for wt in np.linspace(0.40,0.80,9):
    for wd in np.linspace(0.10,0.45,8):
        we = 1.0 - wt - wd
        if we>=0:
            pred = wt*oof_text + wd*oof_dense + we*oof_expt
            cands.append((smape(train["price"].values, pred), wt, wd, we))
best = min(cands, key=lambda x: x[0])
bsm, WT, WD, WE = best
print(f"New OOF blend with experts → SMAPE={bsm:.4f}, WT={WT:.3f}, WD={WD:.3f}, WE={WE:.3f}")

final_test_pred = WT*text_test_pred + WD*dense_test_pred + WE*experts_test_pred


OOF SMAPE (Experts): 54.651830986604956
New OOF blend with experts → SMAPE=50.4938, WT=0.650, WD=0.350, WE=0.000


In [34]:
# ==== Try price-space XGB (reg:absoluteerror) vs log-space; auto-pick best, blend, and write ====
import numpy as np, pandas as pd, xgboost as xgb
from sklearn.model_selection import KFold
from packaging import version
from pathlib import Path

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0
def from_log(yhat): return np.expm1(yhat)

assert "Xtr_dense" in globals() and "Xte_dense" in globals(), "Build dense features first."
assert "y_log" in globals(), "y_log missing (log(price))."
assert "oof_log_text" in globals() and "text_test_pred" in globals(), "Run the text ridge cell first."

y_price = train["price"].values.copy()

# --- version-aware GPU config ---
def make_params(obj, lr=0.045, depth=10, subs=0.85, col=0.85, l2=1.5, n_estim=8000, verbosity=1):
    params = dict(
        objective=obj,
        eval_metric="mae",
        learning_rate=lr,
        max_depth=depth,
        subsample=subs,
        colsample_bytree=col,
        reg_alpha=0.0,
        reg_lambda=l2,
        n_estimators=n_estim,
        verbosity=verbosity
    )
    if version.parse(xgb.__version__) >= version.parse("2.0.0"):
        params.update(device="cuda", tree_method="hist")
    else:
        params.update(tree_method="gpu_hist", predictor="gpu_predictor")
    return params

# -------------------------
# Model A: log(price) + MAE (your current style)
# -------------------------
params_A = make_params("reg:squarederror", lr=0.045, depth=10, subs=0.85, col=0.85, l2=1.5, n_estim=8000)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_A = np.zeros(len(train))
test_log_A = np.zeros(len(test))

for tr_idx, va_idx in kf.split(Xtr_dense):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_log[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_log[va_idx])
    dte = xgb.DMatrix(Xte_dense)

    mdl = xgb.train(params_A, dtr, num_boost_round=params_A["n_estimators"],
                    evals=[(dva, "valid")], early_stopping_rounds=300, verbose_eval=False)
    oof_log_A[va_idx] = mdl.predict(dva, iteration_range=(0, mdl.best_iteration+1))
    test_log_A += mdl.predict(dte, iteration_range=(0, mdl.best_iteration+1))

test_log_A /= kf.get_n_splits()
A_oof_price = from_log(oof_log_A)
A_test_price = from_log(test_log_A)
A_smape = smape(y_price, A_oof_price)
print(f"[A] log-price XGB → OOF SMAPE: {A_smape:.4f}")

# -------------------------
# Model B: price-space + absolute error (aligns better with SMAPE)
# -------------------------
# XGBoost supports reg:absoluteerror objective; we train on raw price.
params_B = make_params("reg:absoluteerror", lr=0.04, depth=10, subs=0.9, col=0.9, l2=1.2, n_estim=10000)
oof_B = np.zeros(len(train))
test_B = np.zeros(len(test))

for tr_idx, va_idx in kf.split(Xtr_dense):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_price[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_price[va_idx])
    dte = xgb.DMatrix(Xte_dense)

    mdl = xgb.train(params_B, dtr, num_boost_round=params_B["n_estimators"],
                    evals=[(dva, "valid")], early_stopping_rounds=400, verbose_eval=False)
    oof_B[va_idx] = mdl.predict(dva, iteration_range=(0, mdl.best_iteration+1))
    test_B += mdl.predict(dte, iteration_range=(0, mdl.best_iteration+1))

test_B /= kf.get_n_splits()
B_oof_price = oof_B
B_test_price = test_B
B_smape = smape(y_price, B_oof_price)
print(f"[B] price-space XGB (reg:absoluteerror) → OOF SMAPE: {B_smape:.4f}")

# --- pick best dense model for blend ---
use_B = B_smape < A_smape
oof_dense_price = B_oof_price if use_B else A_oof_price
dense_test_pred = B_test_price if use_B else A_test_price
print("Chosen dense model:", "B (abs-error on price)" if use_B else "A (MAE on log-price)")

# --- OOF-tuned blend with text (optionally experts if you have OOF for them) ---
oof_text_price = from_log(oof_log_text)
cands=[]
if 'oof_log_experts' in globals():
    oof_expt_price = from_log(oof_log_experts)
    for wt in np.linspace(0.40,0.80,9):
        for wd in np.linspace(0.10,0.50,9):
            we = 1.0 - wt - wd
            if we >= 0:
                pred = wt*oof_text_price + wd*oof_dense_price + we*oof_expt_price
                cands.append((smape(y_price, pred), wt, wd, we))
    best = min(cands, key=lambda x: x[0])
    best_smape, WT, WD, WE = best
else:
    for wt in np.linspace(0.55,0.95,9):
        for wd in np.linspace(0.05,0.45,9):
            if wt + wd <= 1.0:
                pred = wt*oof_text_price + wd*oof_dense_price
                cands.append((smape(y_price, pred), wt, wd))
    best_smape, WT, WD = min(cands, key=lambda x: x[0])
    WE = 0.0

print(f"Best OOF blend → SMAPE={best_smape:.4f} | Weights: Text={WT:.3f} Dense={WD:.3f} Experts={WE:.3f}")

# --- final test preds & write CSV (robust writer) ---
experts_part = (experts_test_pred if 'experts_test_pred' in globals() else 0.0)
final_pred = WT*text_test_pred + WD*dense_test_pred + WE*experts_part

ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
order = test[["sample_id"]].copy()
out = order.merge(pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final_pred, 1e-6, None)}),
                  on="sample_id", how="left")
out.to_csv(ART/"test_out.csv", index=False, float_format="%.6f")
print("✅ test_out.csv:", (ART/"test_out.csv").resolve())


[A] log-price XGB → OOF SMAPE: 53.1686
[B] price-space XGB (reg:absoluteerror) → OOF SMAPE: 57.2977
Chosen dense model: A (MAE on log-price)
Best OOF blend → SMAPE=50.6896 | Weights: Text=0.650 Dense=0.350 Experts=0.000
✅ test_out.csv: D:\amazon ML challenge\notebooks\artifacts\test_out.csv


In [35]:
# ==== Multi-text Ridge ensemble + SMAPE-optimized blend with dense (XGB) ====
import numpy as np, pandas as pd, re, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from scipy.sparse import hstack
from pathlib import Path

# --- helpers ---
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0
def to_log(y):  return np.log1p(np.maximum(y, 1e-6))
def from_log(y): return np.expm1(y)
rng = np.random.RandomState(42)

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# --- light normalization to help n-grams generalize (FAST) ---
_unit_norm = [(r'\bkilograms?\b',' kg '),(r'\bgrams?\b',' g '),(r'\blit(res?|ers?)\b',' l ')]
def clean_text(s):
    if not isinstance(s,str): s=""
    s = s.lower()
    # normalize units
    for pat,rep in _unit_norm: s = re.sub(pat, rep, s)
    # normalize x sign & numbers
    s = s.replace('×',' x ')
    s = re.sub(r'\d+(\.\d+)?',' <num> ', s)
    s = re.sub(r'\s+',' ', s).strip()
    return s

tr_txt = train[TEXT_COL].fillna("").astype(str).map(clean_text)
te_txt = test[TEXT_COL].fillna("").astype(str).map(clean_text)

y = train["price"].values
y_log = to_log(y)

# --- define several text featurizers (diversity helps) ---
vecs = {
    "w12":  TfidfVectorizer(ngram_range=(1,2), max_features=350_000, min_df=3, max_df=0.95, sublinear_tf=True),
    "w13":  TfidfVectorizer(ngram_range=(1,3), max_features=400_000, min_df=3, max_df=0.97, sublinear_tf=True),
    "c36":  TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=400_000, min_df=3, sublinear_tf=True),
    # union word+char for a strong baseline
}

# Fit individual spaces once
X_w12_tr = vecs["w12"].fit_transform(tr_txt);   X_w12_te = vecs["w12"].transform(te_txt)
X_w13_tr = vecs["w13"].fit_transform(tr_txt);   X_w13_te = vecs["w13"].transform(te_txt)
X_c36_tr = vecs["c36"].fit_transform(tr_txt);   X_c36_te = vecs["c36"].transform(te_txt)

# Unions
from scipy.sparse import csr_matrix
def U(*mats): return hstack(mats, format="csr")
spaces = {
    "T_w12": X_w12_tr,                     "T_w12_te": X_w12_te,
    "T_w13": X_w13_tr,                     "T_w13_te": X_w13_te,
    "T_c36": X_c36_tr,                     "T_c36_te": X_c36_te,
    "U_w12_c36": U(X_w12_tr, X_c36_tr),    "U_w12_c36_te": U(X_w12_te, X_c36_te),
    "U_w13_c36": U(X_w13_tr, X_c36_tr),    "U_w13_c36_te": U(X_w13_te, X_c36_te),
}

# --- small alpha sweep per space, collect OOF and test preds in price-space ---
alphas = [0.6, 1.1, 2.0]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_bank = {}   # name -> OOF (price)
test_bank = {}  # name -> test preds (price)

for key in ["T_w12","T_w13","T_c36","U_w12_c36","U_w13_c36"]:
    Xtr = spaces[key]; Xte = spaces[key+"_te"]
    for a in alphas:
        tag = f"{key}_a{a}"
        oof_log = np.zeros(len(train)); test_log = np.zeros(len(test))
        for tr_idx, va_idx in kf.split(Xtr):
            mdl = Ridge(alpha=a, random_state=42)
            mdl.fit(Xtr[tr_idx], y_log[tr_idx])
            oof_log[va_idx] = mdl.predict(Xtr[va_idx])
            test_log += mdl.predict(Xte)
        test_log /= kf.get_n_splits()
        oof_bank[tag]  = from_log(oof_log)
        test_bank[tag] = from_log(test_log)
        print(f"{tag}: OOF SMAPE={smape(y, oof_bank[tag]):.4f}")

# --- include your existing dense model OOF + test ---
assert "oof_log_gbm" in globals() and "dense_test_pred" in globals(), "Run the dense XGB block first."
oof_bank["DENSE"]  = from_log(oof_log_gbm)
test_bank["DENSE"] = dense_test_pred
print("DENSE: OOF SMAPE=", smape(y, oof_bank["DENSE"]))

# --- Optional: include your earlier single text model if different config existed
if "oof_log_text" in globals():
    oof_bank["TEXT_BASE"]  = from_log(oof_log_text)
    test_bank["TEXT_BASE"] = from_log(np.mean(oof_log_text)*0 + text_test_pred)  # test preds already computed
    print("TEXT_BASE: OOF SMAPE=", smape(y, oof_bank["TEXT_BASE"]))

# --- SMAPE-optimal nonnegative blend via Dirichlet sampling on the simplex ---
keys = list(oof_bank.keys())
OOFs = np.column_stack([oof_bank[k] for k in keys])
TESTs = np.column_stack([test_bank[k] for k in keys])

def search_weights(OOFs, tries=4000, temperature=0.7, seed=42):
    rng = np.random.RandomState(seed)
    best = (1e9, None)
    for _ in range(tries):
        w = rng.dirichlet(np.ones(OOFs.shape[1]) * temperature)
        pred = OOFs @ w
        s = smape(y, pred)
        if s < best[0]:
            best = (s, w)
    return best

best_smape, w = search_weights(OOFs, tries=5000, temperature=0.6, seed=42)
print(f"\nBest OOF SMAPE via blend = {best_smape:.4f}")
blend_contrib = pd.Series(w, index=keys).sort_values(ascending=False)
print("Blend weights (sum=1):\n", blend_contrib.round(4))

# --- Build final prediction and write ---
final_test_pred = TESTs @ w
ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
out = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final_test_pred, 1e-6, None)})
out.to_csv(ART/"test_out.csv", index=False, float_format="%.6f")
print("✅ test_out.csv:", (ART/"test_out.csv").resolve())


T_w12_a0.6: OOF SMAPE=53.2249
T_w12_a1.1: OOF SMAPE=53.2427
T_w12_a2.0: OOF SMAPE=53.7595
T_w13_a0.6: OOF SMAPE=52.6265
T_w13_a1.1: OOF SMAPE=52.6536
T_w13_a2.0: OOF SMAPE=53.2138
T_c36_a0.6: OOF SMAPE=55.2122
T_c36_a1.1: OOF SMAPE=55.1824
T_c36_a2.0: OOF SMAPE=55.4689
U_w12_c36_a0.6: OOF SMAPE=53.4815
U_w12_c36_a1.1: OOF SMAPE=53.1223
U_w12_c36_a2.0: OOF SMAPE=53.2258
U_w13_c36_a0.6: OOF SMAPE=52.9275
U_w13_c36_a1.1: OOF SMAPE=52.5782
U_w13_c36_a2.0: OOF SMAPE=52.7235
DENSE: OOF SMAPE= 52.7401417475817
TEXT_BASE: OOF SMAPE= 51.6343230851525

Best OOF SMAPE via blend = 51.0554
Blend weights (sum=1):
 DENSE             0.3762
TEXT_BASE         0.1942
T_w13_a1.1        0.1126
T_w12_a0.6        0.0684
T_c36_a2.0        0.0556
T_w13_a0.6        0.0527
T_w13_a2.0        0.0201
U_w13_c36_a0.6    0.0195
U_w12_c36_a0.6    0.0191
U_w12_c36_a1.1    0.0188
U_w12_c36_a2.0    0.0144
T_c36_a0.6        0.0132
U_w13_c36_a2.0    0.0123
U_w13_c36_a1.1    0.0101
T_w12_a2.0        0.0073
T_c36_a1.1       

In [36]:
# ==== EMERGENCY SMAPE DROP KIT (no retrain): segmented isotonic + meta-bias corrector ====
import numpy as np, pandas as pd, re
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import HuberRegressor
from pathlib import Path

# ---------- helpers ----------
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0
def from_log(yhat): return np.expm1(yhat)

assert 'oof_log_text' in globals() and 'text_test_pred' in globals(), "Need text OOF/preds."
assert 'oof_log_gbm'  in globals() and 'dense_test_pred' in globals(), "Need dense OOF/preds."

y_true = train["price"].values.astype(float)

# ---------- 0) Build the base blended predictions (OOF + test) ----------
# Option A: If you ran the Dirichlet blend earlier, reuse it.
reuse_dirichlet = 'w' in globals() and isinstance(w, np.ndarray)

if reuse_dirichlet and 'oof_bank' in globals() and 'test_bank' in globals():
    keys = list(oof_bank.keys())
    OOFs = np.column_stack([oof_bank[k] for k in keys])
    TESTs= np.column_stack([test_bank[k] for k in keys])
    oof_blend = OOFs @ w
    test_blend= TESTs @ w
else:
    # Option B: fall back to 2-model blend (text + dense) with quick OOF grid
    oof_text  = from_log(oof_log_text)
    oof_dense = from_log(oof_log_gbm)
    cands=[]
    for wt in np.linspace(0.55,0.95,9):
        for wd in np.linspace(0.05,0.45,9):
            if wt+wd<=1.0:
                pred = wt*oof_text + wd*oof_dense
                cands.append((smape(y_true, pred), wt, wd))
    best_smape, WT, WD = min(cands, key=lambda x: x[0])
    oof_blend = WT*oof_text + WD*oof_dense
    # test-side
    test_blend = WT*text_test_pred + WD*dense_test_pred

print("Base blend OOF SMAPE:", smape(y_true, oof_blend))

# ---------- 1) Segmented isotonic calibration (3 bins by blended OOF quantiles) ----------
q1, q2 = np.quantile(oof_blend, [0.33, 0.66])

def fit_seg_iso(oof_pred, y):
    bins = np.digitize(oof_pred, [q1, q2])  # 0,1,2
    isos = {}
    for b in (0,1,2):
        m = bins==b
        # guard: if bin tiny, merge to neighbor
        if m.sum() < 100:
            # merge with closest bin by index
            m = (bins==max(0, min(2,b)))
        iso = IsotonicRegression(y_min=1e-6, increasing=True, out_of_bounds='clip')
        iso.fit(oof_pred[m], y[m])
        isos[b] = iso
    return isos

isos = fit_seg_iso(oof_blend, y_true)

def apply_seg_iso(pred):
    out = np.empty_like(pred, dtype=float)
    bins = np.digitize(pred, [q1, q2])
    for b in (0,1,2):
        m = bins==b
        out[m] = isos[b].transform(pred[m])
    return out

oof_iso = apply_seg_iso(oof_blend)
test_iso = apply_seg_iso(test_blend)
print("After segmented isotonic, OOF SMAPE:", smape(y_true, oof_iso))

# ---------- 2) Tiny meta bias corrector (Huber): price ~ a*blend + b + c*log_pack + d*log_qty ----------
_QTY_RE   = re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|liter|litre|g|kg)\b', re.I)
_PACK_RE  = re.compile(r'(?:pack\s*of|x|×)\s*(\d{1,3})', re.I)

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
def fast_meta_vec(s):
    if not isinstance(s,str): s=""
    s=s.lower()
    # pack
    pack=1
    m=_PACK_RE.search(s)
    if m:
        try: pack=int(m.group(1))
        except: pack=1
    # qty in base units
    qty=0.0
    for q,u in _QTY_RE.findall(s):
        q=float(q); u=u.lower()
        if u in ("kg","g"): qty += q*(1000.0 if u=="kg" else 1.0)
        else:               qty += q*(1000.0 if u in ("l","liter","litre") else 1.0)
    return pack, qty

def build_meta_cols(df):
    P=[]; Q=[]
    for s in df[TEXT_COL].fillna("").astype(str):
        p, q = fast_meta_vec(s)
        P.append(p); Q.append(q)
    P = np.log1p(np.array(P, dtype=float))
    Q = np.log1p(np.array(Q, dtype=float))
    return P, Q

P_tr, Q_tr = build_meta_cols(train)
P_te, Q_te = build_meta_cols(test)

# Huber on OOF: features = [blend, log_pack, log_qty] (fit on isotonic-corrected blend for stability)
X_oof = np.column_stack([oof_iso, P_tr, Q_tr])
X_te  = np.column_stack([test_iso, P_te, Q_te])

hub = HuberRegressor(alpha=1e-4, epsilon=1.35, max_iter=500)
hub.fit(X_oof, y_true)
oof_hub = hub.predict(X_oof)
test_hub= hub.predict(X_te)
print("After Huber correction, OOF SMAPE:", smape(y_true, oof_hub))

# ---------- 3) Pick best of (blend, iso, hub) or mix iso+hub by small grid ----------
cands = {
    "blend" : oof_blend,
    "iso"   : oof_iso,
    "hub"   : oof_hub,
}

best_key, best_oof, best_sm = None, None, 1e9
for k, arr in cands.items():
    s = smape(y_true, arr)
    if s < best_sm:
        best_key, best_oof, best_sm = k, arr, s

# Try mixing iso & hub (often wins a bit)
mixes=[]
for w in np.linspace(0.2, 0.8, 7):
    mixes.append((w, smape(y_true, w*oof_hub + (1-w)*oof_iso)))
if mixes:
    w_best, s_best = min(mixes, key=lambda x: x[1])
    if s_best < best_sm:
        best_key = f"mix_iso_hub_{w_best:.2f}"
        best_oof = w_best*oof_hub + (1-w_best)*oof_iso
        best_sm  = s_best
        test_best= w_best*test_hub + (1-w_best)*test_iso
    else:
        test_best = {"blend":test_blend, "iso":test_iso, "hub":test_hub}[best_key]
else:
    test_best = {"blend":test_blend, "iso":test_iso, "hub":test_hub}[best_key]

print(f"Chosen post-proc: {best_key} | OOF SMAPE={best_sm:.4f}")

# ---------- 4) Safety clipping to train price range (winsorize 0.5–99.5%) ----------
lo = np.percentile(y_true, 0.5)
hi = np.percentile(y_true, 99.5)
final = np.clip(test_best, lo, hi)

# ---------- 5) Write submission ----------
ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
sub = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final, 1e-6, None)})
sub.to_csv(ART/"test_out_fastfix.csv", index=False, float_format="%.6f")
print("✅ Wrote:", (ART/"test_out_fastfix.csv").resolve())
print("Pre/post OOF (for reference) → base:", smape(y_true, oof_blend), 
      "| iso:", smape(y_true, oof_iso), "| hub:", smape(y_true, oof_hub), "| chosen:", best_sm)


Base blend OOF SMAPE: 51.05536471285264
After segmented isotonic, OOF SMAPE: 55.14905760870356
After Huber correction, OOF SMAPE: 50.704719403334956
Chosen post-proc: hub | OOF SMAPE=50.7047
✅ Wrote: D:\amazon ML challenge\notebooks\artifacts\test_out_fastfix.csv
Pre/post OOF (for reference) → base: 51.05536471285264 | iso: 55.14905760870356 | hub: 50.704719403334956 | chosen: 50.704719403334956


In [37]:
# ==== Price-band experts (with OOF) + re-blend + write ====
import numpy as np, pandas as pd, re
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack
from pathlib import Path

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0

def to_log(y):  return np.log1p(np.maximum(y, 1e-6))
def from_log(y): return np.expm1(y)

# --- sanity: need base OOF/test preds you already computed ---
assert 'oof_log_text' in globals() and 'text_test_pred' in globals(), "Run Text Ridge cell first."
assert 'oof_log_gbm'  in globals() and 'dense_test_pred' in globals(), "Run dense XGB cell first."

y = train["price"].values.astype(float)
lp = to_log(y)
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str)
te_txt = test[TEXT_COL].fillna("").astype(str)

# --- Use a strong shared TF-IDF space (re-use if already built) ---
if 'X_w13_tr' in globals() and 'X_c36_tr' in globals():
    from scipy.sparse import csr_matrix
    def U(a,b): return hstack([a,b], format="csr")
    Xtr = U(X_w13_tr, X_c36_tr); Xte = U(X_w13_te, X_c36_te)
else:
    vec_w = TfidfVectorizer(ngram_range=(1,3), max_features=250_000, min_df=3, max_df=0.97, sublinear_tf=True)
    vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=200_000, min_df=3, sublinear_tf=True)
    Xw_tr = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
    Xc_tr = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)
    from scipy.sparse import hstack
    Xtr = hstack([Xw_tr, Xc_tr], format="csr"); Xte = hstack([Xw_te, Xc_te], format="csr")

# --- Stratified folds in log-price deciles (more stable OOF) ---
bins = pd.qcut(lp, q=10, labels=False, duplicates='drop')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Price-band definitions in log-space (within each training fold) ---
def bin_masks(lv):
    q1, q2 = np.quantile(lv, [0.33, 0.66])
    return (lv<=q1), ((lv>q1)&(lv<=q2)), (lv>q2)

def soft_weights(anchor, c_lo, c_md, c_hi):
    d = np.stack([np.abs(anchor-c_lo), np.abs(anchor-c_md), np.abs(anchor-c_hi)], 1)
    w = 1/(d+1e-6); w /= w.sum(1, keepdims=True); return w

# --- Train experts with OOF ---
oof_log_experts = np.zeros(len(train))
experts_log_test_accum = np.zeros((len(test), 5))  # store per-fold test preds then average

for fold, (tr_idx, va_idx) in enumerate(skf.split(Xtr, bins), 1):
    ytr_log = lp[tr_idx]
    m_lo, m_md, m_hi = bin_masks(ytr_log)

    # three ridge experts trained on TRAIN fold subsets
    r_lo = Ridge(alpha=1.1, random_state=42).fit(Xtr[tr_idx][m_lo], ytr_log[m_lo])
    r_md = Ridge(alpha=1.1, random_state=42).fit(Xtr[tr_idx][m_md], ytr_log[m_md])
    r_hi = Ridge(alpha=1.1, random_state=42).fit(Xtr[tr_idx][m_hi], ytr_log[m_hi])

    # centers for soft gating
    c_lo, c_md, c_hi = ytr_log[m_lo].mean(), ytr_log[m_md].mean(), ytr_log[m_hi].mean()

    # validation predictions (log-space)
    p_lo_va = r_lo.predict(Xtr[va_idx]); p_md_va = r_md.predict(Xtr[va_idx]); p_hi_va = r_hi.predict(Xtr[va_idx])

    # anchor for gating = OOF text (on val fold) in log-space (no leakage)
    anchor_va = np.log1p(np.maximum(from_log(oof_log_text[va_idx]), 1e-6))
    W_va = soft_weights(anchor_va, c_lo, c_md, c_hi)
    oof_log_experts[va_idx] = W_va[:,0]*p_lo_va + W_va[:,1]*p_md_va + W_va[:,2]*p_hi_va

    # test predictions gated by text test anchor
    anchor_te = np.log1p(np.maximum(text_test_pred, 1e-6))
    p_lo_te = r_lo.predict(Xte); p_md_te = r_md.predict(Xte); p_hi_te = r_hi.predict(Xte)
    W_te = soft_weights(anchor_te, c_lo, c_md, c_hi)
    experts_log_test_accum[:, fold-1] = W_te[:,0]*p_lo_te + W_te[:,1]*p_md_te + W_te[:,2]*p_hi_te

experts_log_test = experts_log_test_accum.mean(1)
experts_test_pred = from_log(experts_log_test)

print("Experts OOF SMAPE:", smape(y, from_log(oof_log_experts)))

# --- Re-blend with experts OOF included (grid over simplex) ---
oof_text  = from_log(oof_log_text)
oof_dense = from_log(oof_log_gbm)
oof_expt  = from_log(oof_log_experts)

best = (1e9, None)
for wt in np.linspace(0.40, 0.80, 9):
    for wd in np.linspace(0.10, 0.50, 9):
        we = 1.0 - wt - wd
        if we >= 0:
            pred = wt*oof_text + wd*oof_dense + we*oof_expt
            s = smape(y, pred)
            if s < best[0]:
                best = (s, (wt, wd, we))
best_smape, (WT, WD, WE) = best
print(f"Re-blend OOF SMAPE: {best_smape:.4f} | Weights  Text={WT:.3f} Dense={WD:.3f} Experts={WE:.3f}")

# --- Final preds + write (no dependency on data/test.csv) ---
final = WT*text_test_pred + WD*dense_test_pred + WE*experts_test_pred
ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
out = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final, 1e-6, None)})
out.to_csv(ART/"test_out_experts_blend.csv", index=False, float_format="%.6f")
print("✅ Wrote:", (ART/"test_out_experts_blend.csv").resolve())


Experts OOF SMAPE: 54.829658586827094
Re-blend OOF SMAPE: 50.4938 | Weights  Text=0.650 Dense=0.350 Experts=0.000
✅ Wrote: D:\amazon ML challenge\notebooks\artifacts\test_out_experts_blend.csv


In [38]:
# ==== Stage-2 stacker (GPU) on OOF signals + meta → reblend and write ====
import numpy as np, pandas as pd, re, xgboost as xgb
from sklearn.model_selection import KFold
from packaging import version
from pathlib import Path

def smape(y_true, y_pred):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom, 1e-9, None))*100.0

def from_log(yhat): return np.expm1(yhat)

# --- sanity: need base OOF/test predictions ---
assert 'oof_log_text' in globals() and 'text_test_pred' in globals()
assert 'oof_log_gbm'  in globals() and 'dense_test_pred' in globals()

y = train["price"].values.astype(float)

# Base OOF/test signals (price space)
oof_text  = from_log(oof_log_text)
oof_dense = from_log(oof_log_gbm)
oof_feats = [oof_text, oof_dense]
te_feats  = [text_test_pred, dense_test_pred]
feat_names = ["oof_text", "oof_dense"]

use_experts = 'oof_log_experts' in globals() and 'experts_test_pred' in globals()
if use_experts:
    oof_expt = from_log(oof_log_experts)
    oof_feats.append(oof_expt)
    te_feats.append(experts_test_pred)
    feat_names.append("oof_experts")

# --- tiny meta features (fast)
_QTY_RE   = re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|liter|litre|g|kg)\b', re.I)
_PACK_RE  = re.compile(r'(?:pack\s*of|x|×)\s*(\d{1,3})', re.I)
_NUMTOK_RE= re.compile(r'\d+')
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

def quick_meta_vecs(series):
    P=[]; Q=[]; L=[]; D=[]; ATL=[]
    for s in series.fillna("").astype(str):
        s_l=s.lower()
        # pack
        pack=1
        m=_PACK_RE.search(s_l)
        if m:
            try: pack=int(m.group(1))
            except: pack=1
        # qty base units
        qty=0.0
        for q,u in _QTY_RE.findall(s_l):
            q=float(q); u=u.lower()
            if u in ("kg","g"): qty += q*(1000.0 if u=="kg" else 1.0)
            else:               qty += q*(1000.0 if u in ("l","liter","litre") else 1.0)
        n_chars=len(s_l); n_spaces=s_l.count(" "); n_digits=len(_NUMTOK_RE.findall(s_l))
        P.append(np.log1p(pack)); Q.append(np.log1p(qty))
        L.append(np.log1p(n_chars)); D.append(np.log1p(n_digits)); ATL.append(np.log1p(n_chars/(n_spaces+1)))
    return np.array(P), np.array(Q), np.array(L), np.array(D), np.array(ATL)

P_tr,Q_tr,L_tr,D_tr,ATL_tr = quick_meta_vecs(train[TEXT_COL])
P_te,Q_te,L_te,D_te,ATL_te = quick_meta_vecs(test[TEXT_COL])

# --- build Stage-2 features
X2_tr = np.column_stack(oof_feats + [P_tr,Q_tr,L_tr,D_tr,ATL_tr])
X2_te = np.column_stack(te_feats  + [P_te,Q_te,L_te,D_te,ATL_te])
x2_cols = feat_names + ["meta_pack","meta_qty","meta_len","meta_digits","meta_avg_tok_len"]

# --- Stage-2 XGB on GPU: train on price directly with MAE metric (robust)
params = dict(
    objective="reg:squarederror",
    eval_metric="mae",
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_estimators=8000,
    verbosity=0
)
if version.parse(xgb.__version__) >= version.parse("2.0.0"):
    params.update(device="cuda", tree_method="hist")
else:
    params.update(tree_method="gpu_hist", predictor="gpu_predictor")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof2 = np.zeros(len(train))
te2_accum = np.zeros(len(test))
for tr_idx, va_idx in kf.split(X2_tr):
    dtr = xgb.DMatrix(X2_tr[tr_idx], label=y[tr_idx])
    dva = xgb.DMatrix(X2_tr[va_idx], label=y[va_idx])
    dte = xgb.DMatrix(X2_te)
    mdl = xgb.train(params, dtr, num_boost_round=params["n_estimators"],
                    evals=[(dva,"valid")], early_stopping_rounds=300, verbose_eval=False)
    oof2[va_idx] = mdl.predict(dva, iteration_range=(0, mdl.best_iteration+1))
    te2_accum   += mdl.predict(dte, iteration_range=(0, mdl.best_iteration+1))
te2 = te2_accum / kf.get_n_splits()
print("Stage-2 OOF SMAPE:", smape(y, oof2))

# --- Blend base blend with stage-2 using OOF SMAPE-optimal weight
# base_blend OOF: best current (choose what you used last; here we form a simple text+dense(+(experts)) blend via OOF search)
base_oofs = np.column_stack([oof_text, oof_dense] + ([oof_expt] if use_experts else []))
base_tests= np.column_stack([text_test_pred, dense_test_pred] + ([experts_test_pred] if use_experts else []))

# Search simplex for base weights first
best = (1e9, None)
if use_experts:
    for wt in np.linspace(0.40,0.80,9):
        for wd in np.linspace(0.10,0.50,9):
            we = 1.0 - wt - wd
            if we>=0:
                pred = wt*oof_text + wd*oof_dense + we*oof_expt
                s = smape(y, pred)
                if s < best[0]:
                    best = (s, (wt,wd,we))
    base_smape, (bWT,bWD,bWE) = best
    base_oof  = bWT*oof_text + bWD*oof_dense + bWE*oof_expt
    base_test = bWT*text_test_pred + bWD*dense_test_pred + bWE*experts_test_pred
else:
    for wt in np.linspace(0.55,0.95,9):
        for wd in np.linspace(0.05,0.45,9):
            if wt+wd<=1.0:
                pred = wt*oof_text + wd*oof_dense
                s = smape(y, pred)
                if s < best[0]:
                    best = (s, (wt,wd))
    base_smape, (bWT,bWD) = best
    bWE=0.0
    base_oof  = bWT*oof_text + bWD*oof_dense
    base_test = bWT*text_test_pred + bWD*dense_test_pred

print("Base OOF SMAPE (recomputed):", base_smape)

# Now OOF-optimal mix of base vs stage-2
best_mix = (1e9, None)
for w in np.linspace(0.0, 1.0, 41):  # 0..1 step 0.025
    pred = w*oof2 + (1-w)*base_oof
    s = smape(y, pred)
    if s < best_mix[0]:
        best_mix = (s, w)
best_smape, WM = best_mix
final_test_pred = WM*te2 + (1-WM)*base_test

print(f"Chosen mix weight (Stage-2 vs Base): {WM:.3f} | New OOF SMAPE: {best_smape:.4f}")

# --- Write submission (robust writer) ---
ART = Path("artifacts"); ART.mkdir(parents=True, exist_ok=True)
sub = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final_test_pred, 1e-6, None)})
sub.to_csv(ART/"test_out_stack_blend.csv", index=False, float_format="%.6f")
print("✅ Wrote:", (ART/'test_out_stack_blend.csv').resolve())


Stage-2 OOF SMAPE: 53.67757269774788
Base OOF SMAPE (recomputed): 50.49384516885357
Chosen mix weight (Stage-2 vs Base): 0.000 | New OOF SMAPE: 50.4938
✅ Wrote: D:\amazon ML challenge\notebooks\artifacts\test_out_stack_blend.csv


In [39]:
# ==== Dense XGB on GPU with StratifiedKFold (log-price deciles) + extra regularization ====
import numpy as np, pandas as pd, xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from packaging import version

def smape(y_true, y_pred):
    y_true = np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100

assert "Xtr_dense" in globals() and "Xte_dense" in globals() and "y_log" in globals()
y_price = train["price"].values.astype(float)
bins = pd.qcut(y_log, q=10, labels=False, duplicates="drop")  # stratify in log space
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_params = dict(
    objective="reg:squarederror", eval_metric="mae",
    learning_rate=0.04, max_depth=9,
    subsample=0.9, colsample_bytree=0.9,
    reg_alpha=0.0, reg_lambda=2.0,      # a touch more L2
    min_child_weight=5,                 # stabilize leaves
    gamma=1.0,                          # discourage tiny splits
    n_estimators=10000, verbosity=0
)
if version.parse(xgb.__version__) >= version.parse("2.0.0"):
    xgb_params.update(device="cuda", tree_method="hist")
else:
    xgb_params.update(tree_method="gpu_hist", predictor="gpu_predictor")

oof_log_gbm = np.zeros(len(train)); gbm_test_log = np.zeros(len(test))
for fold, (tr_idx, va_idx) in enumerate(skf.split(Xtr_dense, bins), 1):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_log[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_log[va_idx])
    dte = xgb.DMatrix(Xte_dense)
    mdl = xgb.train(xgb_params, dtr, num_boost_round=xgb_params["n_estimators"],
                    evals=[(dva,"valid")], early_stopping_rounds=300, verbose_eval=False)
    oof_log_gbm[va_idx] = mdl.predict(dva, iteration_range=(0, mdl.best_iteration+1))
    gbm_test_log += mdl.predict(dte, iteration_range=(0, mdl.best_iteration+1))

gbm_test_log /= 5
dense_test_pred = np.expm1(gbm_test_log)
print("Dense(Stratified) OOF SMAPE:", smape(y_price, np.expm1(oof_log_gbm)))


Dense(Stratified) OOF SMAPE: 53.72016514649805


In [None]:
# ==== Rebuild dense WITHOUT image features, retrain XGB (GPU) ====
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
from sklearn.model_selection import KFold
from packaging import version

def smape(y_true,y_pred):
    y_true=np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str)
te_txt = test[TEXT_COL].fillna("").astype(str)

vec_w = TfidfVectorizer(ngram_range=(1,2), max_features=250_000, min_df=3, max_df=0.95, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=100_000, min_df=3, sublinear_tf=True)
Xw_tr = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
Xc_tr = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)
from scipy.sparse import hstack
X_tr = hstack([Xw_tr, Xc_tr], format="csr")
X_te = hstack([Xw_te, Xc_te], format="csr")

svd = TruncatedSVD(n_components=320, random_state=42)
Xtr_svd = svd.fit_transform(X_tr).astype(np.float32)
Xte_svd = svd.transform(X_te).astype(np.float32)

# reuse your meta (if not present, rebuild quickly)
if 'M_tr' not in globals() or 'M_te' not in globals():
    import re
    _Q=re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|liter|litre|g|kg)\b', re.I)
    _P=re.compile(r'(?:pack\s*of|x|×)\s*(\d{1,3})', re.I)
    def build_meta(df):
        P,Q,L,D,ATL,B=[],[],[],[],[],[]
        for s in df[TEXT_COL].fillna("").astype(str):
            s_l=s.lower()
            pack=1; m=_P.search(s_l)
            if m:
                try: pack=int(m.group(1))
                except: pack=1
            qty=0.0
            for q,u in _Q.findall(s_l):
                q=float(q); u=u.lower()
                qty += q*(1000.0 if u in ("kg","l","liter","litre") else 1.0)
            n=len(s_l); sp=s_l.count(" "); dg=len(re.findall(r'\d+', s_l))
            P.append(np.log1p(pack)); Q.append(np.log1p(qty)); L.append(np.log1p(n)); D.append(np.log1p(dg)); ATL.append(np.log1p(n/(sp+1)))
        return np.column_stack([P,Q,L,D,ATL]).astype(np.float32)
    M_tr = build_meta(train); M_te = build_meta(test)

Xtr_dense = np.hstack([Xtr_svd, M_tr])
Xte_dense = np.hstack([Xte_svd, M_te])

y_price = train["price"].values.astype(float)
y_log = np.log1p(np.maximum(y_price, 1e-6))

params = dict(
    objective="reg:squarederror", eval_metric="mae",
    learning_rate=0.045, max_depth=10, subsample=0.9, colsample_bytree=0.9,
    reg_lambda=1.5, min_child_weight=5, gamma=1.0, n_estimators=8000, verbosity=0
)
if version.parse(xgb.__version__) >= version.parse("2.0.0"):
    params.update(device="cuda", tree_method="hist")
else:
    params.update(tree_method="gpu_hist", predictor="gpu_predictor")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log_gbm = np.zeros(len(train)); gbm_test_log = np.zeros(len(test))
for tr_idx, va_idx in kf.split(Xtr_dense):
    dtr = xgb.DMatrix(Xtr_dense[tr_idx], label=y_log[tr_idx])
    dva = xgb.DMatrix(Xtr_dense[va_idx], label=y_log[va_idx])
    dte = xgb.DMatrix(Xte_dense)
    mdl = xgb.train(params, dtr, num_boost_round=params["n_estimators"],
                    evals=[(dva,"valid")], early_stopping_rounds=300, verbose_eval=False)
    oof_log_gbm[va_idx] = mdl.predict(dva, iteration_range=(0, mdl.best_iteration+1))
    gbm_test_log += mdl.predict(dte, iteration_range=(0, mdl.best_iteration+1))
gbm_test_log /= 5
dense_test_pred = np.expm1(gbm_test_log)
print("Dense(no-img) OOF SMAPE:", smape(y_price, np.expm1(oof_log_gbm)))


In [None]:
# ==== Text model: TF-IDF (word+char) + SGDRegressor(loss='huber') on log(price) ====
import numpy as np, pandas as pd, re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold
from scipy.sparse import hstack

def smape(y_true,y_pred):
    y_true=np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100
def to_log(y):  return np.log1p(np.maximum(y, 1e-6))
def from_log(y): return np.expm1(y)

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
tr_txt = train[TEXT_COL].fillna("").astype(str)
te_txt = test[TEXT_COL].fillna("").astype(str)

vec_w = TfidfVectorizer(ngram_range=(1,3), max_features=350_000, min_df=3, max_df=0.97, sublinear_tf=True)
vec_c = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=300_000, min_df=3, sublinear_tf=True)
Xw_tr = vec_w.fit_transform(tr_txt); Xw_te = vec_w.transform(te_txt)
Xc_tr = vec_c.fit_transform(tr_txt); Xc_te = vec_c.transform(te_txt)
Xtr = hstack([Xw_tr, Xc_tr], format="csr"); Xte = hstack([Xw_te, Xc_te], format="csr")

y_log = to_log(train["price"].values)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_log_text = np.zeros(len(train)); text_test_log = np.zeros(len(test))
for tr_idx, va_idx in kf.split(Xtr):
    mdl = SGDRegressor(
        loss="huber", epsilon=0.1, alpha=1e-5, penalty="l2",
        learning_rate="optimal", max_iter=2000, tol=1e-4, random_state=42
    )
    mdl.fit(Xtr[tr_idx], y_log[tr_idx])
    oof_log_text[va_idx] = mdl.predict(Xtr[va_idx])
    text_test_log += mdl.predict(Xte)
text_test_log /= 5
text_test_pred = from_log(text_test_log)
print("Text(Huber-SGD) OOF SMAPE:", smape(train["price"].values, from_log(oof_log_text)))


In [None]:
# ==== Simple re-blend (OOF-tuned) + write ====
import numpy as np, pandas as pd
from pathlib import Path

def smape(y_true,y_pred):
    y_true=np.asarray(y_true,float); y_pred=np.asarray(y_pred,float)
    denom=(np.abs(y_true)+np.abs(y_pred))/2
    return np.mean(np.abs(y_true-y_pred)/np.clip(denom,1e-9,None))*100

oof_text  = np.expm1(oof_log_text)
oof_dense = np.expm1(oof_log_gbm)
cands=[]; 
for wt in np.linspace(0.55,0.95,9):
    for wd in np.linspace(0.05,0.45,9):
        if wt+wd<=1.0:
            pred = wt*oof_text + wd*oof_dense
            cands.append((smape(train["price"].values, pred), wt, wd))
best = min(cands, key=lambda x: x[0])
best_smape, WT, WD = best
print(f"Blend OOF SMAPE={best_smape:.4f} | Wt={WT:.3f}, Wd={WD:.3f}")

final = WT*text_test_pred + WD*dense_test_pred
out = pd.DataFrame({"sample_id": test["sample_id"], "price": np.clip(final, 1e-6, None)})
from pathlib import Path; ART=Path("artifacts"); ART.mkdir(exist_ok=True, parents=True)
out.to_csv(ART/"test_out_final.csv", index=False, float_format="%.6f")
print("✅ test_out_final.csv:", (ART/"test_out_final.csv").resolve())
