In [8]:
# Minimal cleaner (only if catalog_content_clean doesn't exist yet)
if "catalog_content_clean" not in locals() and "catalog_content_clean" not in train.columns if 'train' in locals() else [False]:
    import re
    def clean_text(s) -> str:
        s = "" if s is None else str(s)
        s = s.lower()
        s = re.sub(r"http\S+|www\S+|https\S+", "", s)
        s = re.sub(r"[^a-z0-9 ]+", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()


In [2]:
# Robust project-root + data loader for 02_stage2.ipynb

import re, gc, math, numpy as np, pandas as pd
from pathlib import Path

def find_project_root():
    """Walk upward until we find a folder that actually contains data/train.csv (or common markers)."""
    p = Path.cwd()
    for _ in range(8):
        # Preferred: explicit presence of data files
        if (p / "data" / "train.csv").exists() and (p / "data" / "test.csv").exists():
            return p
        # Fallback: repo markers + data dir exists
        if any((p / m).exists() for m in ("requirements.txt", ".git", "HANDOFF.md")) and (p / "data").exists():
            return p
        p = p.parent
    raise FileNotFoundError("Could not locate project root containing a 'data/' folder.")

ROOT = find_project_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

print("Notebook CWD :", Path.cwd())
print("Project ROOT :", ROOT)
print("DATA exists? :", DATA.exists(), "|", (DATA / "train.csv").exists(), (DATA / "test.csv").exists())
print("ART path     :", ART)

# Load data
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")

# Choose text column (use cleaned if present, else raw)
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Safe string views for modeling
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values


Notebook CWD : d:\amazon ML challenge\notebooks
Project ROOT : d:\amazon ML challenge
DATA exists? : True | True True
ART path     : d:\amazon ML challenge\artifacts


In [3]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import pandas as pd

identity = FunctionTransformer(lambda s: s, validate=False)

features = FeatureUnion([
    ("word", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="word", ngram_range=(1,2),
            max_features=350_000, min_df=2
        ))
    ])),
    ("char", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="char_wb", ngram_range=(3,6),
            max_features=300_000, min_df=2
        ))
    ])),
], n_jobs=1)

ridge = Ridge(alpha=1.2, random_state=42)

pipe_wc = Pipeline([
    ("features", features),
    ("ridge", ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
maes = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    pred = pipe_wc.predict(X_text_tr.iloc[va])
    maes.append(mean_absolute_error(y[va], pred))
print("Word+Char TF-IDF Ridge 5-fold MAE:", np.mean(maes), "±", np.std(maes))

# Fit full and write submission
pipe_wc.fit(X_text_tr, y)
pred_wc = pipe_wc.predict(X_text_te)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc}).to_csv(ART / "submission_wc.csv", index=False)
print("Saved:", (ART / "submission_wc.csv").resolve())

Word+Char TF-IDF Ridge 5-fold MAE: 13.855923438301284 ± 0.11941516491554557
Saved: D:\amazon ML challenge\artifacts\submission_wc.csv


In [8]:
# === Stage-2B minimal: word TF-IDF + cheap meta features → pred_tm + save CSV ===
from pathlib import Path
import re, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Resolve paths and data (works whether you're in notebooks/ or project root)
ROOT = globals().get("ROOT", Path.cwd())
ROOT = ROOT if isinstance(ROOT, Path) else Path(ROOT)
DATA = Path(globals().get("DATA", ROOT / "data"))
ART  = Path(globals().get("ART",  ROOT / "artifacts"))
ART.mkdir(parents=True, exist_ok=True)

# Load data if not already present
if "train" not in globals() or "test" not in globals():
    train = pd.read_csv(DATA / "train.csv")
    test  = pd.read_csv(DATA / "test.csv")

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values

# ---- meta feature extractor ----
def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)

    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)

    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)

    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)

    df = pd.DataFrame({
        "num_count": num_count,
        "max_num": max_num,
        "min_num": min_num,
        "pack_n":  pack_n,
    })
    df = pd.concat([df, unit_df], axis=1).fillna(0.0)
    return df

meta_tr = extract_meta(X_text_tr)
meta_te = extract_meta(X_text_te)

train_aug = pd.DataFrame({TEXT_COL: X_text_tr})
test_aug  = pd.DataFrame({TEXT_COL: X_text_te})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

numeric_cols = meta_tr.columns.tolist()

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

model = Ridge(alpha=1.0, random_state=42)
pipe_tm = Pipeline([("ct", ct), ("ridge", model)])

# Fit full + predict
pipe_tm.fit(train_aug, y)
pred_tm = pipe_tm.predict(test_aug)

# Save submission for blending
out_tm = ART / "submission_word_meta.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_tm}).to_csv(out_tm, index=False)
print("Saved:", out_tm.resolve(), "| rows:", len(pred_tm))


Saved: D:\amazon ML challenge\artifacts\submission_word_meta.csv | rows: 75000


In [9]:
# Simple average of the two Stage-2 submissions
import pandas as pd
from pathlib import Path

ART = Path(ART)  # reuse from earlier if defined
df_wc = pd.read_csv(ART / "submission_wc.csv")
df_tm = pd.read_csv(ART / "submission_word_meta.csv")

df = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc", "_tm"))
df["price"] = 0.5*df["price_wc"] + 0.5*df["price_tm"]
out = ART / "submission_ensemble_v1.csv"
df[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(df))


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v1.csv | rows: 75000


In [13]:
from pathlib import Path
import pandas as pd

# Reuse ART if defined; else resolve it from project root
ART = Path(globals().get("ART", Path.cwd() / "artifacts"))

print("ART =", ART.resolve())
print("Available submissions:", [p.name for p in ART.glob("submission_*.csv")])

df = pd.read_csv(ART / "submission_ensemble_v1.csv")
assert len(df)==75000 and df["sample_id"].is_unique and not df["price"].isna().any()
print(df["price"].min(), df["price"].max(), df["price"].median())


ART = D:\amazon ML challenge\artifacts
Available submissions: ['submission_ensemble_v1.csv', 'submission_final.csv', 'submission_wc.csv', 'submission_word_meta.csv']
-57.24441578355 427.01332224186393 19.36302922582783


In [16]:
from pathlib import Path
import pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge

ART = Path(globals().get("ART", Path.cwd() / "artifacts"))
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Encode
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tr_vec = model.encode(train[TEXT_COL].fillna("").tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
te_vec = model.encode(test[TEXT_COL].fillna("").tolist(),  batch_size=512, show_progress_bar=True, convert_to_numpy=True)

# Fit + predict
ridge_emb = Ridge(alpha=1.0, random_state=42).fit(tr_vec, train["price"].values)
pred_emb = ridge_emb.predict(te_vec)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_emb}).to_csv(ART/"submission_sbert.csv", index=False)

# 3-way blend with existing files
df_wc = pd.read_csv(ART/"submission_wc.csv")
df_tm = pd.read_csv(ART/"submission_word_meta.csv")
df_sb = pd.read_csv(ART/"submission_sbert.csv")

m = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc","_tm")).merge(df_sb, on="sample_id")
m["price"] = 0.4*m["price_wc"] + 0.4*m["price_tm"] + 0.2*m["price"]
out = ART/"submission_ensemble_v2.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(m))


Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]
Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v2.csv | rows: 75000


In [18]:
import numpy as np

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return float(np.mean(np.abs(y_true - y_pred) / denom) * 100.0)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [21]:
from sklearn.linear_model import TweedieRegressor

pipe_wc_gamma = Pipeline([
    ("features", wc_features),
    ("glm", TweedieRegressor(power=2.0, link="log", alpha=1e-3, max_iter=3000, tol=1e-6, warm_start=True)),
])

scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_gamma.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_gamma.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, Gamma GLM): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

pipe_wc_gamma.fit(X_text_tr, y)
pred_wc_gamma = pipe_wc_gamma.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_gamma}).to_csv(ART/"submission_wc_gamma.csv", index=False)
print("Saved:", (ART/"submission_wc_gamma.csv").resolve())


CV SMAPE (word+char, Gamma GLM): 64.09% ± 0.47%
Saved: D:\amazon ML challenge\artifacts\submission_wc_gamma.csv


In [22]:
import pandas as pd

cands = []
for fname in ["submission_wc.csv", "submission_wc_log.csv", "submission_wc_gamma.csv"]:
    p = (ART/fname)
    if p.exists(): cands.append(pd.read_csv(p).rename(columns={"price": fname.replace(".csv","")}))

m = cands[0]
for df in cands[1:]:
    m = m.merge(df, on="sample_id")

# If both log and gamma exist, try a 60/40 toward the lower CV SMAPE
cols = [c for c in m.columns if c!="sample_id"]
if "submission_wc_log" in cols and "submission_wc_gamma" in cols:
    m["price"] = 0.6*m["submission_wc_log"] + 0.4*m["submission_wc_gamma"]
elif "submission_wc_log" in cols:
    m["price"] = m["submission_wc_log"]
else:
    m["price"] = m[cols[0]]

out = ART/"submission_wc_smape_opt.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve())


Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_opt.csv


In [23]:
import numpy as np
bins = np.clip(np.floor(np.log1p(y)), 0, 10).astype(int)
# If you want to keep KFold, keep the seed and ensure each fold has distribution checked
# Or use StratifiedKFold on bins (regression hack):
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [25]:
# Make the final, portal-ready file from the log-target model
from pathlib import Path
import pandas as pd, numpy as np

ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

# align to test.csv order (defensive)
test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [26]:
# === Build OOF predictions for two models, tune weights for SMAPE, blend test preds ===
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

ART, DATA = Path(ART), Path(DATA)

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred)+eps)/2.0
    return float(np.mean(np.abs(y_true-y_pred)/denom)*100.0)

# Rebuild the two models (same configs you used)
identity = FunctionTransformer(lambda s: s, validate=False)

# word+char features (same as Stage-2A)
wc_features = Pipeline([
    ("union", 
     Pipeline(steps=[("id", identity)])),  # placeholder so we can set params next line
])
# We'll attach vectorizers directly via FeatureUnion-like manual mapping:
from sklearn.pipeline import FeatureUnion
wc_features = FeatureUnion([
    ("word", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=350_000, min_df=2))])),
    ("char", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=300_000, min_df=2))]))
], n_jobs=1)

wc_log = Pipeline([
    ("features", wc_features),
    ("ridge_log", TransformedTargetRegressor(
        regressor=Ridge(alpha=1.2, random_state=42),
        func=np.log1p, inverse_func=np.expm1))
])

# word TF-IDF + meta (same as Stage-2B)
numeric_cols = ["num_count","max_num","min_num","pack_n","vol_ml","wt_g"]

def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)
    df = pd.DataFrame({"num_count": num_count, "max_num": max_num, "min_num": min_num, "pack_n": pack_n})
    return pd.concat([df, unit_df], axis=1).fillna(0.0)

train_text = X_text_tr
test_text  = X_text_te
y_vec = y

meta_tr = extract_meta(train_text)
meta_te = extract_meta(test_text)

train_aug = pd.DataFrame({TEXT_COL: train_text})
test_aug  = pd.DataFrame({TEXT_COL: test_text})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

word_meta = Pipeline([("ct", ct), ("ridge", Ridge(alpha=1.0, random_state=42))])

# OOF predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log = np.zeros(len(train_aug))
oof_tm  = np.zeros(len(train_aug))

for tr_idx, va_idx in cv.split(train_aug):
    wc_log.fit(train_text.iloc[tr_idx], y_vec[tr_idx])
    oof_log[va_idx] = wc_log.predict(train_text.iloc[va_idx]).clip(min=1e-6)

    word_meta.fit(train_aug.iloc[tr_idx], y_vec[tr_idx])
    oof_tm[va_idx] = word_meta.predict(train_aug.iloc[va_idx]).clip(min=1e-6)

print("OOF SMAPE — log:", smape(y_vec, oof_log), "| tm:", smape(y_vec, oof_tm))

# Grid search weights to minimize SMAPE on OOF
best_w, best_s = None, 1e9
for w in np.linspace(0, 1, 21):  # 0.00 .. 1.00 step 0.05
    blend = w*oof_log + (1-w)*oof_tm
    s = smape(y_vec, blend)
    if s < best_s:
        best_s, best_w = s, w
print(f"Best OOF SMAPE: {best_s:.2f}% at w_log={best_w:.2f}, w_tm={1-best_w:.2f}")

# Fit both on full train and blend test with tuned weights
wc_log.fit(train_text, y_vec)
p_log = wc_log.predict(test_text).clip(min=1e-6)

word_meta.fit(train_aug, y_vec)
p_tm = word_meta.predict(test_aug).clip(min=1e-6)

p_blend = best_w*p_log + (1-best_w)*p_tm

# Save SMAPE-optimized blend
out = ART/"submission_wc_smape_blend.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": p_blend}).to_csv(out, index=False)
print("Saved:", out.resolve())


OOF SMAPE — log: 52.18425790496487 | tm: 66.27008136434968
Best OOF SMAPE: 52.18% at w_log=1.00, w_tm=0.00
Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_blend.csv


In [27]:
import pandas as pd
from pathlib import Path

ART, DATA = Path(ART), Path(DATA)
best = "submission_wc_smape_blend.csv"  # or "submission_wc_log.csv" if you prefer

df = pd.read_csv(ART/best)[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [28]:
from pathlib import Path
import pandas as pd
ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all(), "Missing predictions"

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [17]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    # Avoid divisions by 0 (when both true and pred are 0)
    mask = denom != 0
    out = np.zeros_like(denom)
    out[mask] = diff[mask] / denom[mask]
    return np.mean(out) * 100.0

# Example: evaluate smape for the word+char pipeline 'pipe_wc'
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc.predict(X_text_tr.iloc[va])
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char ridge): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")


CV SMAPE (word+char ridge): 68.12% ± 0.19%
