In [2]:
# Minimal cleaner (only if catalog_content_clean doesn't exist yet)
if "catalog_content_clean" not in locals() and "catalog_content_clean" not in train.columns if 'train' in locals() else [False]:
    import re
    def clean_text(s) -> str:
        s = "" if s is None else str(s)
        s = s.lower()
        s = re.sub(r"http\S+|www\S+|https\S+", "", s)
        s = re.sub(r"[^a-z0-9 ]+", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()


In [3]:
# Robust project-root + data loader for 02_stage2.ipynb

import re, gc, math, numpy as np, pandas as pd
from pathlib import Path

def find_project_root():
    """Walk upward until we find a folder that actually contains data/train.csv (or common markers)."""
    p = Path.cwd()
    for _ in range(8):
        # Preferred: explicit presence of data files
        if (p / "data" / "train.csv").exists() and (p / "data" / "test.csv").exists():
            return p
        # Fallback: repo markers + data dir exists
        if any((p / m).exists() for m in ("requirements.txt", ".git", "HANDOFF.md")) and (p / "data").exists():
            return p
        p = p.parent
    raise FileNotFoundError("Could not locate project root containing a 'data/' folder.")

ROOT = find_project_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

print("Notebook CWD :", Path.cwd())
print("Project ROOT :", ROOT)
print("DATA exists? :", DATA.exists(), "|", (DATA / "train.csv").exists(), (DATA / "test.csv").exists())
print("ART path     :", ART)

# Load data
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")

# Choose text column (use cleaned if present, else raw)
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Safe string views for modeling
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values


Notebook CWD : d:\amazon ML challenge\notebooks
Project ROOT : d:\amazon ML challenge
DATA exists? : True | True True
ART path     : d:\amazon ML challenge\artifacts


In [8]:
# === Stage-2B minimal: word TF-IDF + cheap meta features → pred_tm + save CSV ===
from pathlib import Path
import re, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Resolve paths and data (works whether you're in notebooks/ or project root)
ROOT = globals().get("ROOT", Path.cwd())
ROOT = ROOT if isinstance(ROOT, Path) else Path(ROOT)
DATA = Path(globals().get("DATA", ROOT / "data"))
ART  = Path(globals().get("ART",  ROOT / "artifacts"))
ART.mkdir(parents=True, exist_ok=True)

# Load data if not already present
if "train" not in globals() or "test" not in globals():
    train = pd.read_csv(DATA / "train.csv")
    test  = pd.read_csv(DATA / "test.csv")

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values

# ---- meta feature extractor ----
def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)

    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)

    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)

    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)

    df = pd.DataFrame({
        "num_count": num_count,
        "max_num": max_num,
        "min_num": min_num,
        "pack_n":  pack_n,
    })
    df = pd.concat([df, unit_df], axis=1).fillna(0.0)
    return df

meta_tr = extract_meta(X_text_tr)
meta_te = extract_meta(X_text_te)

train_aug = pd.DataFrame({TEXT_COL: X_text_tr})
test_aug  = pd.DataFrame({TEXT_COL: X_text_te})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

numeric_cols = meta_tr.columns.tolist()

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

model = Ridge(alpha=1.0, random_state=42)
pipe_tm = Pipeline([("ct", ct), ("ridge", model)])

# Fit full + predict
pipe_tm.fit(train_aug, y)
pred_tm = pipe_tm.predict(test_aug)

# Save submission for blending
out_tm = ART / "submission_word_meta.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_tm}).to_csv(out_tm, index=False)
print("Saved:", out_tm.resolve(), "| rows:", len(pred_tm))


Saved: D:\amazon ML challenge\artifacts\submission_word_meta.csv | rows: 75000


In [9]:
# Simple average of the two Stage-2 submissions
import pandas as pd
from pathlib import Path

ART = Path(ART)  # reuse from earlier if defined
df_wc = pd.read_csv(ART / "submission_wc.csv")
df_tm = pd.read_csv(ART / "submission_word_meta.csv")

df = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc", "_tm"))
df["price"] = 0.5*df["price_wc"] + 0.5*df["price_tm"]
out = ART / "submission_ensemble_v1.csv"
df[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(df))


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v1.csv | rows: 75000


In [13]:
from pathlib import Path
import pandas as pd

# Reuse ART if defined; else resolve it from project root
ART = Path(globals().get("ART", Path.cwd() / "artifacts"))

print("ART =", ART.resolve())
print("Available submissions:", [p.name for p in ART.glob("submission_*.csv")])

df = pd.read_csv(ART / "submission_ensemble_v1.csv")
assert len(df)==75000 and df["sample_id"].is_unique and not df["price"].isna().any()
print(df["price"].min(), df["price"].max(), df["price"].median())


ART = D:\amazon ML challenge\artifacts
Available submissions: ['submission_ensemble_v1.csv', 'submission_final.csv', 'submission_wc.csv', 'submission_word_meta.csv']
-57.24441578355 427.01332224186393 19.36302922582783


In [16]:
from pathlib import Path
import pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge

ART = Path(globals().get("ART", Path.cwd() / "artifacts"))
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Encode
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tr_vec = model.encode(train[TEXT_COL].fillna("").tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
te_vec = model.encode(test[TEXT_COL].fillna("").tolist(),  batch_size=512, show_progress_bar=True, convert_to_numpy=True)

# Fit + predict
ridge_emb = Ridge(alpha=1.0, random_state=42).fit(tr_vec, train["price"].values)
pred_emb = ridge_emb.predict(te_vec)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_emb}).to_csv(ART/"submission_sbert.csv", index=False)

# 3-way blend with existing files
df_wc = pd.read_csv(ART/"submission_wc.csv")
df_tm = pd.read_csv(ART/"submission_word_meta.csv")
df_sb = pd.read_csv(ART/"submission_sbert.csv")

m = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc","_tm")).merge(df_sb, on="sample_id")
m["price"] = 0.4*m["price_wc"] + 0.4*m["price_tm"] + 0.2*m["price"]
out = ART/"submission_ensemble_v2.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(m))


Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]
Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v2.csv | rows: 75000


In [18]:
import numpy as np

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return float(np.mean(np.abs(y_true - y_pred) / denom) * 100.0)

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
import numpy as np
import pandas as pd

# Reuse your word+char FeatureUnion pipe from Stage-2A
wc_features = pipe_wc.named_steps["features"]  # same vectorizers
base = Ridge(alpha=1.2, random_state=42)

log_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1.2, random_state=42),
    func=np.log1p,
    inverse_func=np.expm1
)

pipe_wc_log = Pipeline([
    ("features", wc_features),
    ("ridge", log_ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_log.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_log.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, log target): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

# Fit full + save submission
pipe_wc_log.fit(X_text_tr, y)
pred_wc_log = pipe_wc_log.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_log}).to_csv(ART/"submission_wc_log.csv", index=False)
print("Saved:", (ART/"submission_wc_log.csv").resolve())


CV SMAPE (word+char, log target): 52.18% ± 0.38%
Saved: D:\amazon ML challenge\artifacts\submission_wc_log.csv


In [21]:
from sklearn.linear_model import TweedieRegressor

pipe_wc_gamma = Pipeline([
    ("features", wc_features),
    ("glm", TweedieRegressor(power=2.0, link="log", alpha=1e-3, max_iter=3000, tol=1e-6, warm_start=True)),
])

scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc_gamma.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc_gamma.predict(X_text_tr.iloc[va]).clip(min=1e-6)
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char, Gamma GLM): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")

pipe_wc_gamma.fit(X_text_tr, y)
pred_wc_gamma = pipe_wc_gamma.predict(X_text_te).clip(min=1e-6)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc_gamma}).to_csv(ART/"submission_wc_gamma.csv", index=False)
print("Saved:", (ART/"submission_wc_gamma.csv").resolve())


CV SMAPE (word+char, Gamma GLM): 64.09% ± 0.47%
Saved: D:\amazon ML challenge\artifacts\submission_wc_gamma.csv


In [22]:
import pandas as pd

cands = []
for fname in ["submission_wc.csv", "submission_wc_log.csv", "submission_wc_gamma.csv"]:
    p = (ART/fname)
    if p.exists(): cands.append(pd.read_csv(p).rename(columns={"price": fname.replace(".csv","")}))

m = cands[0]
for df in cands[1:]:
    m = m.merge(df, on="sample_id")

# If both log and gamma exist, try a 60/40 toward the lower CV SMAPE
cols = [c for c in m.columns if c!="sample_id"]
if "submission_wc_log" in cols and "submission_wc_gamma" in cols:
    m["price"] = 0.6*m["submission_wc_log"] + 0.4*m["submission_wc_gamma"]
elif "submission_wc_log" in cols:
    m["price"] = m["submission_wc_log"]
else:
    m["price"] = m[cols[0]]

out = ART/"submission_wc_smape_opt.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve())


Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_opt.csv


In [23]:
import numpy as np
bins = np.clip(np.floor(np.log1p(y)), 0, 10).astype(int)
# If you want to keep KFold, keep the seed and ensure each fold has distribution checked
# Or use StratifiedKFold on bins (regression hack):
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [25]:
# Make the final, portal-ready file from the log-target model
from pathlib import Path
import pandas as pd, numpy as np

ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

# align to test.csv order (defensive)
test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [26]:
# === Build OOF predictions for two models, tune weights for SMAPE, blend test preds ===
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer, StandardScaler

ART, DATA = Path(ART), Path(DATA)

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred)+eps)/2.0
    return float(np.mean(np.abs(y_true-y_pred)/denom)*100.0)

# Rebuild the two models (same configs you used)
identity = FunctionTransformer(lambda s: s, validate=False)

# word+char features (same as Stage-2A)
wc_features = Pipeline([
    ("union", 
     Pipeline(steps=[("id", identity)])),  # placeholder so we can set params next line
])
# We'll attach vectorizers directly via FeatureUnion-like manual mapping:
from sklearn.pipeline import FeatureUnion
wc_features = FeatureUnion([
    ("word", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=350_000, min_df=2))])),
    ("char", Pipeline([("id", identity), ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=300_000, min_df=2))]))
], n_jobs=1)

wc_log = Pipeline([
    ("features", wc_features),
    ("ridge_log", TransformedTargetRegressor(
        regressor=Ridge(alpha=1.2, random_state=42),
        func=np.log1p, inverse_func=np.expm1))
])

# word TF-IDF + meta (same as Stage-2B)
numeric_cols = ["num_count","max_num","min_num","pack_n","vol_ml","wt_g"]

def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)
    df = pd.DataFrame({"num_count": num_count, "max_num": max_num, "min_num": min_num, "pack_n": pack_n})
    return pd.concat([df, unit_df], axis=1).fillna(0.0)

train_text = X_text_tr
test_text  = X_text_te
y_vec = y

meta_tr = extract_meta(train_text)
meta_te = extract_meta(test_text)

train_aug = pd.DataFrame({TEXT_COL: train_text})
test_aug  = pd.DataFrame({TEXT_COL: test_text})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

word_meta = Pipeline([("ct", ct), ("ridge", Ridge(alpha=1.0, random_state=42))])

# OOF predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
oof_log = np.zeros(len(train_aug))
oof_tm  = np.zeros(len(train_aug))

for tr_idx, va_idx in cv.split(train_aug):
    wc_log.fit(train_text.iloc[tr_idx], y_vec[tr_idx])
    oof_log[va_idx] = wc_log.predict(train_text.iloc[va_idx]).clip(min=1e-6)

    word_meta.fit(train_aug.iloc[tr_idx], y_vec[tr_idx])
    oof_tm[va_idx] = word_meta.predict(train_aug.iloc[va_idx]).clip(min=1e-6)

print("OOF SMAPE — log:", smape(y_vec, oof_log), "| tm:", smape(y_vec, oof_tm))

# Grid search weights to minimize SMAPE on OOF
best_w, best_s = None, 1e9
for w in np.linspace(0, 1, 21):  # 0.00 .. 1.00 step 0.05
    blend = w*oof_log + (1-w)*oof_tm
    s = smape(y_vec, blend)
    if s < best_s:
        best_s, best_w = s, w
print(f"Best OOF SMAPE: {best_s:.2f}% at w_log={best_w:.2f}, w_tm={1-best_w:.2f}")

# Fit both on full train and blend test with tuned weights
wc_log.fit(train_text, y_vec)
p_log = wc_log.predict(test_text).clip(min=1e-6)

word_meta.fit(train_aug, y_vec)
p_tm = word_meta.predict(test_aug).clip(min=1e-6)

p_blend = best_w*p_log + (1-best_w)*p_tm

# Save SMAPE-optimized blend
out = ART/"submission_wc_smape_blend.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": p_blend}).to_csv(out, index=False)
print("Saved:", out.resolve())


OOF SMAPE — log: 52.18425790496487 | tm: 66.27008136434968
Best OOF SMAPE: 52.18% at w_log=1.00, w_tm=0.00
Saved: D:\amazon ML challenge\artifacts\submission_wc_smape_blend.csv


In [27]:
import pandas as pd
from pathlib import Path

ART, DATA = Path(ART), Path(DATA)
best = "submission_wc_smape_blend.csv"  # or "submission_wc_log.csv" if you prefer

df = pd.read_csv(ART/best)[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [28]:
from pathlib import Path
import pandas as pd
ART, DATA = Path(ART), Path(DATA)

df = pd.read_csv(ART/"submission_wc_log.csv")[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

test_ids = pd.read_csv(DATA/"test.csv")["sample_id"]
final = test_ids.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all(), "Missing predictions"

final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [17]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    # Avoid divisions by 0 (when both true and pred are 0)
    mask = denom != 0
    out = np.zeros_like(denom)
    out[mask] = diff[mask] / denom[mask]
    return np.mean(out) * 100.0

# Example: evaluate smape for the word+char pipeline 'pipe_wc'
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    p = pipe_wc.predict(X_text_tr.iloc[va])
    scores.append(smape(y[va], p))
print(f"CV SMAPE (word+char ridge): {np.mean(scores):.2f}% ± {np.std(scores):.2f}%")


CV SMAPE (word+char ridge): 68.12% ± 0.19%


In [29]:
# Grid search a few strong configs for SMAPE (fast)
import numpy as np
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true, float); y_pred = np.asarray(y_pred, float)
    denom = (np.abs(y_true)+np.abs(y_pred)+eps)/2.0
    return float(np.mean(np.abs(y_true-y_pred)/denom)*100.0)

cfgs = [
    # (word_max, char_max, alpha)
    (300_000, 250_000, 1.0),
    (350_000, 300_000, 1.2),
    (400_000, 350_000, 1.2),
    (300_000, 300_000, 1.5),
]
best = (9e9, None)

for wmax,cmax,alpha in cfgs:
    feats = FeatureUnion([
        ("word", Pipeline([("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=wmax, min_df=2))])),
        ("char", Pipeline([("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=cmax, min_df=2))]))
    ], n_jobs=1)
    pipe = Pipeline([
        ("features", feats),
        ("reg", TransformedTargetRegressor(
            regressor=Ridge(alpha=alpha, random_state=42),
            func=np.log1p, inverse_func=np.expm1))
    ])
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores=[]
    for tr,va in cv.split(X_text_tr):
        pipe.fit(X_text_tr.iloc[tr], y[tr])
        p = pipe.predict(X_text_tr.iloc[va]).clip(min=1e-6)
        scores.append(smape(y[va], p))
    m = float(np.mean(scores)); s=float(np.std(scores))
    print((wmax,cmax,alpha), f"{m:.2f} ± {s:.2f}")
    if m < best[0]:
        best = (m,(wmax,cmax,alpha))

best


(300000, 250000, 1.0) 52.20 ± 0.38
(350000, 300000, 1.2) 52.18 ± 0.38
(400000, 350000, 1.2) 52.14 ± 0.38
(300000, 300000, 1.5) 52.28 ± 0.39


(52.14107187166095, (400000, 350000, 1.2))

In [30]:
# Split training by price bins; train one log-target model per bin; predict all test rows with all, then soft-blend by bin probs
import numpy as np, pandas as pd
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

# bins on log-price
ylog = np.log1p(y)
cuts = np.quantile(ylog, [0, .2, .4, .6, .8, 1.0])
bin_id = np.digitize(ylog, cuts[1:-1], right=False)

# train KNN on meta signals to estimate bin probs at test time (no labels for test)
def extract_light_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    lens = s.str.len()
    nums = s.str.count(r"(?<![a-zA-Z])\d+(?:\.\d+)?")
    upp = s.str.count(r"\b[A-Z]{2,}\b")
    return pd.DataFrame({"len":lens, "nums":nums, "upp":upp}).astype(float)

meta_tr_l = extract_light_meta(X_text_tr)
meta_te_l = extract_light_meta(X_text_te)

# bin classifier proxy using KNN on meta (unsupervised-ish routing)
knn = KNeighborsRegressor(n_neighbors=25, weights="distance")
knn.fit(meta_tr_l, bin_id.astype(float))
bin_pred = knn.predict(meta_te_l)  # continuous; we’ll convert to soft weights
# Softmax over distances not perfect; approximate soft assignment:
# For stability, build one expert per bin and later weight by proximity to each bin center in ylog space
centers = np.array([np.mean(ylog[bin_id==b]) for b in range(5)])

# Train 5 experts (same architecture as best log-target)
wmax,cmax,alpha = best[1]
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor

experts=[]
feats_tpl = lambda: FeatureUnion([
    ("word", Pipeline([("tfidf", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=wmax, min_df=2))])),
    ("char", Pipeline([("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6), max_features=cmax, min_df=2))]))
], n_jobs=1)

for b in range(5):
    mask = (bin_id==b)
    reg = TransformedTargetRegressor(Ridge(alpha=alpha, random_state=42), func=np.log1p, inverse_func=np.expm1)
    pipe = Pipeline([("features", feats_tpl()), ("reg", reg)])
    pipe.fit(X_text_tr[mask], y[mask])
    experts.append(pipe)

# Blend experts by proximity in meta space -> use knn output to map to center weights
# Map predicted continuous bin value to weights over 5 centers using RBF-like kernel
def soft_weights(x, centers, tau=0.8):
    # x ~ predicted bin position (0..4); centers -> 0..4
    d = np.abs(centers - x)  # but centers are in ylog; map x to center index scale:
    # better: linearly map x in [0,4] to centers’ indices:
    idx_pos = np.clip(x, 0, 4)
    d = np.abs(np.arange(5) - idx_pos)
    w = np.exp(-d/tau)
    return w / w.sum()

# Predict per expert then soft-blend
preds = []
for pipe in experts:
    preds.append(pipe.predict(X_text_te).clip(min=1e-6))
preds = np.vstack(preds)  # [5, N]

weights = np.vstack([soft_weights(v, centers) for v in np.clip(bin_pred,0,4)])
p_moe = (weights * preds.T).sum(axis=1)

pd.DataFrame({"sample_id": test["sample_id"], "price": p_moe}).to_csv(ART/"submission_wc_log_moe.csv", index=False)
print("Saved:", (ART/"submission_wc_log_moe.csv").resolve())


Saved: D:\amazon ML challenge\artifacts\submission_wc_log_moe.csv


In [31]:
# Better meta features directly from text (brand-ish tokens, pack math, normalized units)
import re

def rich_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    # brand-ish: first token before '-' or '|' or '—' or ':' or '('
    head = s.str.extract(r"^\s*([A-Za-z0-9&'\./]+)", expand=False).fillna("")
    brand_len = head.str.len()
    brand_caps = head.str.contains(r"[A-Z]").astype(int)

    # quantities
    nums = s.str.findall(r"(?<![A-Za-z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else 0.0).astype(float)

    # pack math: “pack of K” or “K pack” and multiply if quantities likely per item
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), 1), axis=1).astype(float)

    # units normalize
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm(pairs):
        ml = g = 0.0
        for val,u in pairs:
            v=float(val)
            if u=="ml": ml+=v
            elif u=="l": ml+=v*1000
            elif u=="oz": g+=v*28.3495
            elif u=="g": g+=v
            elif u=="kg": g+=v*1000
            elif u=="lb": g+=v*453.592
        return pd.Series({"vol_ml": ml, "wt_g": g})
    unit_df = uw.apply(norm)

    # total content per pack (proxy)
    total_ml = unit_df["vol_ml"]*pack_n
    total_g  = unit_df["wt_g"]*pack_n

    return pd.DataFrame({
        "brand_len":brand_len.astype(float),
        "brand_caps":brand_caps.astype(float),
        "num_count":num_count,
        "max_num":max_num,
        "pack_n":pack_n,
        "vol_ml":unit_df["vol_ml"].astype(float),
        "wt_g": unit_df["wt_g"].astype(float),
        "total_ml": total_ml.astype(float),
        "total_g": total_g.astype(float),
    }).fillna(0.0)

meta_tr2 = rich_meta(X_text_tr)
meta_te2 = rich_meta(X_text_te)


In [33]:
import torch
assert torch.cuda.is_available(), "CUDA is not available. (It was earlier—did the venv change?)"
print(torch.__version__, torch.cuda.get_device_name(0))


2.6.0+cu124 NVIDIA GeForce RTX 4070 Laptop GPU


In [None]:
from sentence_transformers import SentenceTransformer
from pathlib import Path
from PIL import Image
import time

device = "cuda"
model = SentenceTransformer("clip-ViT-B-32", device=device)

IMG_DIR = Path("images_dl")
sample_ids = test["sample_id"].head(512).tolist()
imgs = [Image.open(IMG_DIR/f"{sid}.jpg").convert("RGB") for sid in sample_ids]

t0 = time.time()
_ = model.encode(imgs, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
dt = time.time()-t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU")


In [36]:
# GPU + image sanity + timed mini-encode (no guessing)
import sys, time
from pathlib import Path
from PIL import Image, UnidentifiedImageError
import torch
from sentence_transformers import SentenceTransformer

# 0) Confirm GPU
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    raise SystemExit("CUDA not available — stop here.")

device = "cuda"
print("GPU:", torch.cuda.get_device_name(0))
print("Torch:", torch.__version__)

# 1) Point to the ACTUAL images folder used when you downloaded them
#    Change this if you used a different folder (e.g., 'images' or 'data/images')
IMG_DIR = Path("images_dl")  # <-- tweak if needed
print("IMG_DIR:", IMG_DIR.resolve())

# 2) Collect the first 512 test ids and check how many image files exist
ids = test["sample_id"].head(512).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in ids]
exist_mask = [p.exists() for p in paths]
num_exist = sum(exist_mask)
print(f"Found {num_exist}/512 image files")

# If very few exist, you’re pointing to the wrong folder. Show a couple examples:
if num_exist < 32:
    print("Example expected path:", paths[0])
    raise SystemExit("Too few images found in IMG_DIR. Fix IMG_DIR and rerun.")

# 3) Load a small, existing batch (fast) and time it
kept = [p for p, ok in zip(paths, exist_mask) if ok][:128]  # 128 is enough to measure
imgs = []
for p in kept:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError) as e:
        # skip bad files
        pass

print(f"Encoding {len(imgs)} images on GPU …", flush=True)

model = SentenceTransformer("clip-ViT-B-32", device=device)
print("ST target device:", getattr(model, "_target_device", None))

t0 = time.perf_counter()
_ = model.encode(
    imgs,
    batch_size=64,                # increase to 96/128 if VRAM allows
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
dt = time.perf_counter() - t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU (batch=64, n={len(imgs)})", flush=True)


CUDA available: True
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
Torch: 2.6.0+cu124
IMG_DIR: D:\amazon ML challenge\notebooks\images_dl
Found 0/512 image files
Example expected path: images_dl\100179.jpg


SystemExit: Too few images found in IMG_DIR. Fix IMG_DIR and rerun.

In [37]:
from pathlib import Path

# Try to infer project root the same way we've been doing
def find_root(markers=("requirements.txt", ".git", "data", "HANDOFF.md")):
    p = Path.cwd()
    for _ in range(8):
        if any((p / m).exists() for m in markers):
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
candidates = [
    ROOT/"images_dl",
    ROOT/"images",
    ROOT/"data"/"images",
    ROOT/"notebooks"/"images_dl",
    Path("images_dl"),
    Path("images"),
]

def jpg_count(d: Path) -> int:
    try:
        return sum(1 for _ in d.glob("*.jpg"))
    except Exception:
        return 0

found = [(d, jpg_count(d)) for d in candidates if d.exists()]
found = sorted(found, key=lambda x: x[1], reverse=True)

print("ROOT:", ROOT)
print("Candidates (path, .jpg count):")
for d,c in found:
    print(" ", d, c)

IMG_DIR = None
if found and found[0][1] >= 100:  # heuristic: at least 100 jpgs
    IMG_DIR = found[0][0]
    print("\nSelected IMG_DIR:", IMG_DIR.resolve())
else:
    print("\nNo suitable images folder found in common locations.")


ROOT: d:\amazon ML challenge
Candidates (path, .jpg count):
  d:\amazon ML challenge\images_dl 146588
  d:\amazon ML challenge\images 110

Selected IMG_DIR: D:\amazon ML challenge\images_dl


In [38]:
from PIL import Image, UnidentifiedImageError
from sentence_transformers import SentenceTransformer
import torch, time

assert IMG_DIR is not None and IMG_DIR.exists(), "Set IMG_DIR to the correct folder path."

ids = test["sample_id"].head(512).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in ids]
ok = [p.exists() for p in paths]
print(f"Found {sum(ok)}/512 images in:", IMG_DIR)

imgs = []
for p in [pp for pp,flag in zip(paths,ok) if flag][:128]:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError):
        pass

device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("clip-ViT-B-32", device=device)
print("Using:", device, "| ST target device:", getattr(model, "_target_device", None))

t0 = time.perf_counter()
_ = model.encode(imgs, batch_size=64, show_progress_bar=True,
                 convert_to_numpy=True, normalize_embeddings=True)
dt = time.perf_counter() - t0
print(f"OK: {len(imgs)/dt:.1f} img/s on GPU (n={len(imgs)})")


Found 512/512 images in: d:\amazon ML challenge\images_dl


`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


Using: cuda | ST target device: cuda:0


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.23s/it]

OK: 28.7 img/s on GPU (n=128)





In [None]:
# Bin-wise calibration for SMAPE
bins = np.clip(np.floor(np.log1p(y)), 0, 10).astype(int)
pred_oof = o1  # use your best model's OOF (log-tuned)

cal = {}
for b in np.unique(bins):
    mask = (bins==b)
    # simple 1D search over scaling factor
    best_k, best_s = 1.0, 1e9
    for k in np.linspace(0.8, 1.2, 41):
        s = smape(y[mask], np.clip(pred_oof[mask]*k, 1e-6, None))
        if s < best_s: best_s, best_k = s, k
    cal[b] = best_k

# apply to test based on predicted bin (use the KNN-lite bin proxy from step 2 or simpler: use text length proxies)
test_bins = np.clip(np.floor(np.log1p(np.median(y))), 0, 10).astype(int)  # fallback single bin
# If you built 'p_final' above and have no per-row bins, skip or set single multiplier ~1.0

# Example applying a single calibrated multiplier:
# p_final *= cal.get(test_bins, 1.0)


In [None]:
# === GPU image embeddings: root->data->images->throughput->cached full encode ===
import time, sys, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image, UnidentifiedImageError
import torch
from sentence_transformers import SentenceTransformer

# 0) Root + data loader
def find_root(markers=("requirements.txt", ".git", "data", "HANDOFF.md")):
    p = Path.cwd()
    for _ in range(8):
        if any((p / m).exists() for m in markers):
            return p
        p = p.parent
    return Path.cwd()

ROOT = find_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache"
EMB.mkdir(parents=True, exist_ok=True)

print("ROOT:", ROOT)
print("DATA:", DATA)

# Load train/test
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")
print("Loaded train/test:", train.shape, test.shape)

# 1) Locate images folder (pick the one with most .jpg)
candidates = [
    ROOT/"images_dl",
    ROOT/"images",
    ROOT/"data"/"images",
    ROOT/"notebooks"/"images_dl",
]
def jpg_count(d: Path) -> int:
    return sum(1 for _ in d.glob("*.jpg")) if d.exists() else 0

cand_counts = sorted([(d, jpg_count(d)) for d in candidates], key=lambda x:x[1], reverse=True)
for d,c in cand_counts:
    print("Candidate:", d, "jpg:", c)

if cand_counts and cand_counts[0][1] > 0:
    IMG_DIR = cand_counts[0][0]
else:
    raise FileNotFoundError(
        "Could not find images. Set IMG_DIR to your downloaded images folder (with many .jpg files)."
    )
print("Using IMG_DIR:", IMG_DIR.resolve())

# 2) GPU sanity
assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__)

# 3) Tiny GPU throughput test (fast)
sample_ids = test["sample_id"].head(256).tolist()
paths = [IMG_DIR / f"{sid}.jpg" for sid in sample_ids]
ok = [p.exists() for p in paths]
print(f"Found {sum(ok)}/256 images for quick test in {IMG_DIR.name}")

imgs = []
for p in [pp for pp,flag in zip(paths,ok) if flag][:128]:
    try:
        imgs.append(Image.open(p).convert("RGB"))
    except (UnidentifiedImageError, FileNotFoundError):
        pass
print("Timing encode on", len(imgs), "imgs ...")

model = SentenceTransformer("clip-ViT-B-32", device=device)
t0 = time.perf_counter()
_ = model.encode(
    imgs,
    batch_size=64,  # increase to 96/128 if VRAM allows
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)
dt = time.perf_counter()-t0
if len(imgs) > 0:
    print(f"Throughput: {len(imgs)/dt:.1f} img/s (GPU OK)")

# 4) Full encode with caching (runs once; reuses .npy afterwards)
def load_imgs(ids):
    out = []
    blank = Image.new("RGB", (224,224), color=0)
    for sid in ids:
        p = IMG_DIR / f"{sid}.jpg"
        try:
            out.append(Image.open(p).convert("RGB"))
        except Exception:
            out.append(blank)
    return out

def encode_images(ids, save_path, batch_size=96):
    save_path = Path(save_path)
    if save_path.exists():
        arr = np.load(save_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print("Loaded cached:", save_path)
            return np.array(arr)
        print("Cache size mismatch; recomputing:", save_path)
    imgs_all = load_imgs(ids)
    t0 = time.perf_counter()
    vec = model.encode(
        imgs_all, batch_size=batch_size, show_progress_bar=True,
        convert_to_numpy=True, normalize_embeddings=True
    )
    print(f"Full encode throughput: {len(imgs_all)/(time.perf_counter()-t0):.1f} img/s")
    np.save(save_path, vec)
    print("Saved:", save_path, vec.shape)
    return vec

tr_ids = train["sample_id"].tolist()
te_ids = test["sample_id"].tolist()

tr_img_vec = encode_images(tr_ids, EMB/"train_clip_img.npy", batch_size=96)
te_img_vec = encode_images(te_ids, EMB/"test_clip_img.npy",  batch_size=96)
print("Done. train/test img emb shapes:", tr_img_vec.shape, te_img_vec.shape)


ROOT: d:\amazon ML challenge
DATA: d:\amazon ML challenge\data
Loaded train/test: (75000, 4) (75000, 3)
Candidate: d:\amazon ML challenge\images_dl jpg: 146588
Candidate: d:\amazon ML challenge\images jpg: 110
Candidate: d:\amazon ML challenge\data\images jpg: 0
Candidate: d:\amazon ML challenge\notebooks\images_dl jpg: 0
Using IMG_DIR: D:\amazon ML challenge\images_dl
GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
Found 256/256 images for quick test in images_dl
Timing encode on 128 imgs ...


Batches: 100%|██████████| 2/2 [00:04<00:00,  2.12s/it]


Throughput: 30.2 img/s (GPU OK)


In [None]:
# === Encode CLIP image embeddings with live progress + resume ===
import time, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import open_clip

# ---- CONFIG (edit only these if needed) ----
ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART  = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache";  EMB.mkdir(parents=True, exist_ok=True)
BATCH_SIZE   = 256
NUM_WORKERS  = 6
CHUNK_SIZE   = 10_000  # write partial every 10k rows
MODEL_NAME   = "ViT-B-32"
PRETRAINED   = "laion2b_s34b_b79k"
# -------------------------------------------

assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__, flush=True)

# Load ids
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")
tr_ids = train["sample_id"].tolist()
te_ids = test["sample_id"].tolist()
print(f"IDs loaded: train={len(tr_ids)}, test={len(te_ids)}", flush=True)

# Model + preprocess (fp16)
torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED, device=device)
model.eval()
model = model.to(dtype=torch.float16)
print("Model ready:", MODEL_NAME, PRETRAINED, flush=True)

# Dataset
class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]
        self.dir = Path(img_dir)
        self.preprocess = preprocess
        self.blank = Image.new("RGB", (224,224), color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]
        p = self.dir / f"{sid}.jpg"
        try:
            img = Image.open(p).convert("RGB")
        except Exception:
            img = self.blank
        return self.preprocess(img), sid

def encode_split_progress(ids, save_path, model, preprocess, img_dir,
                          batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, chunk_size=CHUNK_SIZE):
    save_path = Path(save_path)
    tmp_path  = save_path.with_suffix(".partial.npy")

    # Resume if partial exists
    start_idx = 0
    partial = None
    if tmp_path.exists():
        partial = np.load(tmp_path, mmap_mode="r")
        start_idx = partial.shape[0]
        print(f"[resume] {tmp_path.name}: {start_idx}/{len(ids)} rows", flush=True)

    # If full exists and matches length, return it
    if save_path.exists():
        arr = np.load(save_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print(f"[cache] Loaded {save_path.name}: {arr.shape}", flush=True)
            return np.array(arr)

    ds = ImgDS(ids, img_dir, preprocess, start=start_idx)
    if len(ds) == 0:
        if partial is not None:
            np.save(save_path, np.array(partial))
            tmp_path.unlink(missing_ok=True)
            print(f"[finalize] Saved {save_path.name} {partial.shape}", flush=True)
            return np.array(partial)
        else:
            raise RuntimeError("No samples to encode.")

    dl = DataLoader(
        ds, batch_size=batch_size, shuffle=False,
        num_workers=num_workers, pin_memory=True,
        persistent_workers=True
    )

    embs = []
    processed = start_idx
    t0 = time.perf_counter()
    last = t0
    print(f"[start] {save_path.name} | total={len(ids)} | bs={batch_size} | workers={num_workers}", flush=True)

    with torch.inference_mode():
        for bi, (xb, _) in enumerate(dl, 1):
            xb = xb.to(device, non_blocking=True).to(dtype=torch.float16)
            z  = model.encode_image(xb)
            z  = torch.nn.functional.normalize(z.float(), dim=1)
            embs.append(z.cpu())
            processed += xb.size(0)

            # progress line every ~5 batches
            if bi % 5 == 0:
                now = time.perf_counter()
                inst = (xb.size(0)*5) / (now - last)
                overall = processed / (now - t0)
                last = now
                print(f"[{processed:6d}/{len(ids)}] ~{int(overall)} img/s (inst ~{int(inst)})", flush=True)

            # periodic checkpoint
            if processed % chunk_size == 0:
                E_chunk = torch.cat(embs).numpy()
                if partial is not None:
                    E_chunk = np.vstack([partial, E_chunk])
                np.save(tmp_path, E_chunk)
                print(f"[ckpt] {tmp_path.name} -> {E_chunk.shape}", flush=True)
                embs, partial = [], E_chunk  # reset buffer

    # finalize
    E = torch.cat(embs).numpy() if embs else np.empty((0, 512), np.float32)
    if partial is not None:
        E = np.vstack([partial, E])
    np.save(save_path, E)
    tmp_path.unlink(missing_ok=True)
    dt = time.perf_counter() - t0
    print(f"[done] {save_path.name} {E.shape} | ~{int(len(ids)/dt)} img/s", flush=True)
    return E

# ---- CALL THE FUNCTION (this actually runs it) ----
train_out = EMB / "train_clip_openclip.npy"
test_out  = EMB / "test_clip_openclip.npy"

tr_img_vec = encode_split_progress(tr_ids, train_out, model, preprocess, IMG_DIR)
te_img_vec = encode_split_progress(te_ids,  test_out,  model, preprocess, IMG_DIR)

print("Embeddings ready:", tr_img_vec.shape, te_img_vec.shape, flush=True)


  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
IDs loaded: train=75000, test=75000
Model ready: ViT-B-32 laion2b_s34b_b79k
[start] train_clip_openclip.npy | total=75000 | bs=256 | workers=6


In [2]:
# Rebuild dataset + model (safe on Windows)
import torch, time
from pathlib import Path
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import open_clip

ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB = ART / "emb_cache"; EMB.mkdir(parents=True, exist_ok=True)

assert torch.cuda.is_available(), "CUDA not available"
device = "cuda"
print("GPU:", torch.cuda.get_device_name(0), "| torch:", torch.__version__)

train = pd.read_csv(DATA/"train.csv"); test = pd.read_csv(DATA/"test.csv")
tr_ids = train["sample_id"].tolist(); te_ids = test["sample_id"].tolist()
print("IDs:", len(tr_ids), len(te_ids))

torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k", device=device
)
model.eval(); model = model.to(dtype=torch.float16)

class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]; self.dir = Path(img_dir)
        self.preprocess = preprocess; self.blank = Image.new("RGB",(224,224),color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]; p = self.dir / f"{sid}.jpg"
        try: img = Image.open(p).convert("RGB")
        except Exception: img = self.blank
        return self.preprocess(img), sid


  from .autonotebook import tqdm as notebook_tqdm


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | torch: 2.6.0+cu124
IDs: 75000 75000


In [3]:
from torch.utils.data import DataLoader
ds_dbg = ImgDS(tr_ids, IMG_DIR, preprocess, start=0)
dl_dbg = DataLoader(ds_dbg, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
import time
t0 = time.perf_counter()
xb, _ = next(iter(dl_dbg))
print("Loaded batch:", xb.shape, "in", round(time.perf_counter()-t0,2), "s")
xb = xb.to("cuda", non_blocking=True).to(dtype=torch.float16)
with torch.inference_mode(): z = model.encode_image(xb)
print("Encoded OK, emb:", z.shape)


Loaded batch: torch.Size([64, 3, 224, 224]) in 1.83 s
Encoded OK, emb: torch.Size([64, 512])


In [5]:
# === Reliable OpenCLIP encoder (Windows-safe, resumable, frequent checkpoints) ===
import time, numpy as np, pandas as pd
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import open_clip

# ---- Paths (your project) ----
ROOT = Path(r"D:\amazon ML challenge")
DATA = ROOT / "data"
IMG_DIR = Path(r"D:\amazon ML challenge\images_dl")
ART  = ROOT / "artifacts"; ART.mkdir(parents=True, exist_ok=True)
EMB  = ART / "emb_cache";  EMB.mkdir(parents=True, exist_ok=True)

# ---- Load data ----
train = pd.read_csv(DATA/"train.csv"); test  = pd.read_csv(DATA/"test.csv")
tr_ids = train["sample_id"].tolist(); te_ids = test["sample_id"].tolist()

# ---- GPU + model ----
assert torch.cuda.is_available(), "CUDA not available in this kernel/venv."
device = "cuda"; torch.backends.cudnn.benchmark = True
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k", device=device)
model.eval(); model = model.to(dtype=torch.float16)
print("GPU:", torch.cuda.get_device_name(0), "| Torch:", torch.__version__)

# ---- Dataset ----
class ImgDS(Dataset):
    def __init__(self, ids, img_dir, preprocess, start=0):
        self.ids = ids[start:]; self.dir = Path(img_dir)
        self.preprocess = preprocess; self.blank = Image.new("RGB",(224,224),color=0)
    def __len__(self): return len(self.ids)
    def __getitem__(self, i):
        sid = self.ids[i]; p = self.dir / f"{sid}.jpg"
        try: img = Image.open(p).convert("RGB")
        except Exception: img = self.blank
        return self.preprocess(img), sid

# ---- Resumable, frequent-checkpoint encoder (workers=0 to avoid Windows hangs) ----
from torch.utils.data import DataLoader

def encode_split_solo(ids, out_path, batch_size=256, chunk=2_000):
    out_path = Path(out_path); tmp = out_path.with_suffix(".partial.npy")

    # resume if partial exists
    start = 0; part = None
    if tmp.exists():
        part = np.load(tmp, mmap_mode="r"); start = part.shape[0]
        print(f"[resume] {tmp.name}: {start}/{len(ids)}")

    # if full exists, return
    if out_path.exists():
        arr = np.load(out_path, mmap_mode="r")
        if arr.shape[0] == len(ids):
            print(f"[cache] {out_path.name}: {arr.shape}"); return np.array(arr)

    ds = ImgDS(ids, IMG_DIR, preprocess, start=start)
    if len(ds)==0:
        if part is not None:
            np.save(out_path, np.array(part)); tmp.unlink(missing_ok=True)
            print(f"[finalize] {out_path.name} {part.shape}"); return np.array(part)
        raise RuntimeError("No samples to encode.")

    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    embs=[]; done=start; t0=time.perf_counter(); last=t0
    print(f"[start] {out_path.name} total={len(ids)} bs={batch_size} workers=0")

    with torch.inference_mode():
        for i,(xb,_) in enumerate(dl,1):
            xb = xb.to("cuda", non_blocking=True).to(dtype=torch.float16)
            z = model.encode_image(xb)
            z = torch.nn.functional.normalize(z.float(), dim=1)
            embs.append(z.cpu()); done += xb.size(0)

            if i%5==0:
                now=time.perf_counter()
                inst = (xb.size(0)*5)/(now-last); overall = done/(now-t0); last=now
                print(f"[{done:6d}/{len(ids)}] ~{int(overall)} img/s (inst ~{int(inst)})", flush=True)

            if done%chunk==0:
                E = torch.cat(embs).numpy()
                if part is not None: E = np.vstack([part,E])
                np.save(tmp, E); print(f"[ckpt] {tmp.name} -> {E.shape}", flush=True)
                embs=[]; part=E

    E = torch.cat(embs).numpy() if embs else np.empty((0,512),np.float32)
    if part is not None: E = np.vstack([part,E])
    np.save(out_path, E); tmp.unlink(missing_ok=True)
    dt=time.perf_counter()-t0
    print(f"[done] {out_path.name} {E.shape} ~{int(len(ids)/dt)} img/s"); return E

# ---- RUN (resumable; prints progress quickly) ----
tr_emb = encode_split_solo(tr_ids, EMB/"train_clip_openclip.npy", batch_size=256, chunk=2_000)
te_emb = encode_split_solo(te_ids, EMB/"test_clip_openclip.npy",  batch_size=256, chunk=2_000)
print("Embeddings:", tr_emb.shape, te_emb.shape)


GPU: NVIDIA GeForce RTX 4070 Laptop GPU | Torch: 2.6.0+cu124
[start] train_clip_openclip.npy total=75000 bs=256 workers=0
[  1280/75000] ~43 img/s (inst ~43)
[  2560/75000] ~41 img/s (inst ~39)
[  3840/75000] ~39 img/s (inst ~36)
[  5120/75000] ~38 img/s (inst ~34)
[  6400/75000] ~36 img/s (inst ~32)
[  7680/75000] ~34 img/s (inst ~27)
[  8960/75000] ~33 img/s (inst ~28)
[ 10240/75000] ~32 img/s (inst ~27)
[ 11520/75000] ~32 img/s (inst ~27)
[ 12800/75000] ~31 img/s (inst ~28)
[ 14080/75000] ~31 img/s (inst ~27)
[ 15360/75000] ~30 img/s (inst ~26)
[ 16640/75000] ~30 img/s (inst ~24)
[ 17920/75000] ~29 img/s (inst ~24)
[ 19200/75000] ~29 img/s (inst ~25)
[ 20480/75000] ~28 img/s (inst ~24)
[ 21760/75000] ~28 img/s (inst ~24)
[ 23040/75000] ~28 img/s (inst ~24)
[ 24320/75000] ~28 img/s (inst ~24)
[ 25600/75000] ~28 img/s (inst ~24)
[ 26880/75000] ~27 img/s (inst ~24)
[ 28160/75000] ~27 img/s (inst ~24)
[ 29440/75000] ~27 img/s (inst ~24)
[ 30720/75000] ~27 img/s (inst ~25)
[ 32000/75000]

In [6]:
from sklearn.linear_model import Ridge
import numpy as np, pandas as pd
from pathlib import Path

ART = Path(r"D:\amazon ML challenge\artifacts")

y = train["price"].to_numpy(float)
ridge_i = Ridge(alpha=1.0, random_state=42).fit(tr_emb, y)
pred_img = np.clip(ridge_i.predict(te_emb), 1e-6, None)

pd.DataFrame({"sample_id": test["sample_id"], "price": pred_img}).to_csv(ART/"submission_img_clip.csv", index=False)
print("Saved:", (ART/"submission_img_clip.csv").resolve())

Saved: D:\amazon ML challenge\artifacts\submission_img_clip.csv


In [7]:
import pandas as pd
from pathlib import Path

ART = Path(r"D:\amazon ML challenge\artifacts")
df_txt = pd.read_csv(ART/"submission_wc_log.csv")      # your best log-target text model
df_img = pd.read_csv(ART/"submission_img_clip.csv")    # just created

m = df_txt.merge(df_img, on="sample_id", suffixes=("_txt","_img"))
m["price"] = 0.9*m["price_txt"] + 0.1*m["price_img"]   # conservative blend
m[["sample_id","price"]].to_csv(ART/"submission_ensemble_img_v1.csv", index=False)
print("Saved:", (ART/"submission_ensemble_img_v1.csv").resolve())


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_img_v1.csv


In [8]:
import pandas as pd
from pathlib import Path

ROOT = Path(r"D:\amazon ML challenge")
ART, DATA = ROOT/"artifacts", ROOT/"data"

base = "submission_ensemble_img_v1.csv"  # swap to "submission_wc_log.csv" if you skip images
df = pd.read_csv(ART/base)[["sample_id","price"]]
df["price"] = df["price"].astype(float).clip(lower=1e-6)

order = pd.read_csv(DATA/"test.csv")["sample_id"]
final = order.to_frame().merge(df, on="sample_id", how="left")
assert final["price"].notna().all()
final.to_csv(ART/"test_out.csv", index=False)
print("WROTE:", (ART/"test_out.csv").resolve(), "| rows:", len(final))


WROTE: D:\amazon ML challenge\artifacts\test_out.csv | rows: 75000


In [9]:
for w in (0.95, 0.9, 0.85, 0.8):
    out = ART/f"submission_blend_text{int(w*100)}_img{int((1-w)*100)}.csv"
    tmp = m.copy()
    tmp["price"] = w*tmp["price_txt"] + (1-w)*tmp["price_img"]
    tmp[["sample_id","price"]].to_csv(out, index=False)
    print("Saved:", out.name)


Saved: submission_blend_text95_img5.csv
Saved: submission_blend_text90_img9.csv
Saved: submission_blend_text85_img15.csv
Saved: submission_blend_text80_img19.csv
