In [8]:
# Minimal cleaner (only if catalog_content_clean doesn't exist yet)
if "catalog_content_clean" not in locals() and "catalog_content_clean" not in train.columns if 'train' in locals() else [False]:
    import re
    def clean_text(s) -> str:
        s = "" if s is None else str(s)
        s = s.lower()
        s = re.sub(r"http\S+|www\S+|https\S+", "", s)
        s = re.sub(r"[^a-z0-9 ]+", " ", s)
        s = re.sub(r"\s+", " ", s)
        return s.strip()


In [2]:
# Robust project-root + data loader for 02_stage2.ipynb

import re, gc, math, numpy as np, pandas as pd
from pathlib import Path

def find_project_root():
    """Walk upward until we find a folder that actually contains data/train.csv (or common markers)."""
    p = Path.cwd()
    for _ in range(8):
        # Preferred: explicit presence of data files
        if (p / "data" / "train.csv").exists() and (p / "data" / "test.csv").exists():
            return p
        # Fallback: repo markers + data dir exists
        if any((p / m).exists() for m in ("requirements.txt", ".git", "HANDOFF.md")) and (p / "data").exists():
            return p
        p = p.parent
    raise FileNotFoundError("Could not locate project root containing a 'data/' folder.")

ROOT = find_project_root()
DATA = ROOT / "data"
ART  = ROOT / "artifacts"
ART.mkdir(parents=True, exist_ok=True)

print("Notebook CWD :", Path.cwd())
print("Project ROOT :", ROOT)
print("DATA exists? :", DATA.exists(), "|", (DATA / "train.csv").exists(), (DATA / "test.csv").exists())
print("ART path     :", ART)

# Load data
train = pd.read_csv(DATA / "train.csv")
test  = pd.read_csv(DATA / "test.csv")

# Choose text column (use cleaned if present, else raw)
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Safe string views for modeling
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values


Notebook CWD : d:\amazon ML challenge\notebooks
Project ROOT : d:\amazon ML challenge
DATA exists? : True | True True
ART path     : d:\amazon ML challenge\artifacts


In [3]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import pandas as pd

identity = FunctionTransformer(lambda s: s, validate=False)

features = FeatureUnion([
    ("word", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="word", ngram_range=(1,2),
            max_features=350_000, min_df=2
        ))
    ])),
    ("char", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="char_wb", ngram_range=(3,6),
            max_features=300_000, min_df=2
        ))
    ])),
], n_jobs=1)

ridge = Ridge(alpha=1.2, random_state=42)

pipe_wc = Pipeline([
    ("features", features),
    ("ridge", ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
maes = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    pred = pipe_wc.predict(X_text_tr.iloc[va])
    maes.append(mean_absolute_error(y[va], pred))
print("Word+Char TF-IDF Ridge 5-fold MAE:", np.mean(maes), "±", np.std(maes))

# Fit full and write submission
pipe_wc.fit(X_text_tr, y)
pred_wc = pipe_wc.predict(X_text_te)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc}).to_csv(ART / "submission_wc.csv", index=False)
print("Saved:", (ART / "submission_wc.csv").resolve())

Word+Char TF-IDF Ridge 5-fold MAE: 13.855923438301284 ± 0.11941516491554557
Saved: D:\amazon ML challenge\artifacts\submission_wc.csv


In [8]:
# === Stage-2B minimal: word TF-IDF + cheap meta features → pred_tm + save CSV ===
from pathlib import Path
import re, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

# Resolve paths and data (works whether you're in notebooks/ or project root)
ROOT = globals().get("ROOT", Path.cwd())
ROOT = ROOT if isinstance(ROOT, Path) else Path(ROOT)
DATA = Path(globals().get("DATA", ROOT / "data"))
ART  = Path(globals().get("ART",  ROOT / "artifacts"))
ART.mkdir(parents=True, exist_ok=True)

# Load data if not already present
if "train" not in globals() or "test" not in globals():
    train = pd.read_csv(DATA / "train.csv")
    test  = pd.read_csv(DATA / "test.csv")

TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"
X_text_tr = train[TEXT_COL].fillna("").astype(str)
X_text_te = test[TEXT_COL].fillna("").astype(str)
y = train["price"].astype(float).values

# ---- meta feature extractor ----
def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)

    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len).astype(float)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan).astype(float)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan).astype(float)

    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1).astype(float)

    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)

    df = pd.DataFrame({
        "num_count": num_count,
        "max_num": max_num,
        "min_num": min_num,
        "pack_n":  pack_n,
    })
    df = pd.concat([df, unit_df], axis=1).fillna(0.0)
    return df

meta_tr = extract_meta(X_text_tr)
meta_te = extract_meta(X_text_te)

train_aug = pd.DataFrame({TEXT_COL: X_text_tr})
test_aug  = pd.DataFrame({TEXT_COL: X_text_te})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

numeric_cols = meta_tr.columns.tolist()

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop")

model = Ridge(alpha=1.0, random_state=42)
pipe_tm = Pipeline([("ct", ct), ("ridge", model)])

# Fit full + predict
pipe_tm.fit(train_aug, y)
pred_tm = pipe_tm.predict(test_aug)

# Save submission for blending
out_tm = ART / "submission_word_meta.csv"
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_tm}).to_csv(out_tm, index=False)
print("Saved:", out_tm.resolve(), "| rows:", len(pred_tm))


Saved: D:\amazon ML challenge\artifacts\submission_word_meta.csv | rows: 75000


In [9]:
# Simple average of the two Stage-2 submissions
import pandas as pd
from pathlib import Path

ART = Path(ART)  # reuse from earlier if defined
df_wc = pd.read_csv(ART / "submission_wc.csv")
df_tm = pd.read_csv(ART / "submission_word_meta.csv")

df = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc", "_tm"))
df["price"] = 0.5*df["price_wc"] + 0.5*df["price_tm"]
out = ART / "submission_ensemble_v1.csv"
df[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(df))


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v1.csv | rows: 75000


In [13]:
from pathlib import Path
import pandas as pd

# Reuse ART if defined; else resolve it from project root
ART = Path(globals().get("ART", Path.cwd() / "artifacts"))

print("ART =", ART.resolve())
print("Available submissions:", [p.name for p in ART.glob("submission_*.csv")])

df = pd.read_csv(ART / "submission_ensemble_v1.csv")
assert len(df)==75000 and df["sample_id"].is_unique and not df["price"].isna().any()
print(df["price"].min(), df["price"].max(), df["price"].median())


ART = D:\amazon ML challenge\artifacts
Available submissions: ['submission_ensemble_v1.csv', 'submission_final.csv', 'submission_wc.csv', 'submission_word_meta.csv']
-57.24441578355 427.01332224186393 19.36302922582783


In [16]:
from pathlib import Path
import pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge

ART = Path(globals().get("ART", Path.cwd() / "artifacts"))
TEXT_COL = "catalog_content_clean" if "catalog_content_clean" in train.columns else "catalog_content"

# Encode
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
tr_vec = model.encode(train[TEXT_COL].fillna("").tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
te_vec = model.encode(test[TEXT_COL].fillna("").tolist(),  batch_size=512, show_progress_bar=True, convert_to_numpy=True)

# Fit + predict
ridge_emb = Ridge(alpha=1.0, random_state=42).fit(tr_vec, train["price"].values)
pred_emb = ridge_emb.predict(te_vec)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_emb}).to_csv(ART/"submission_sbert.csv", index=False)

# 3-way blend with existing files
df_wc = pd.read_csv(ART/"submission_wc.csv")
df_tm = pd.read_csv(ART/"submission_word_meta.csv")
df_sb = pd.read_csv(ART/"submission_sbert.csv")

m = df_wc.merge(df_tm, on="sample_id", suffixes=("_wc","_tm")).merge(df_sb, on="sample_id")
m["price"] = 0.4*m["price_wc"] + 0.4*m["price_tm"] + 0.2*m["price"]
out = ART/"submission_ensemble_v2.csv"
m[["sample_id","price"]].to_csv(out, index=False)
print("Saved:", out.resolve(), "| rows:", len(m))


Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]
Batches: 100%|██████████| 147/147 [01:09<00:00,  2.12it/s]


Saved: D:\amazon ML challenge\artifacts\submission_ensemble_v2.csv | rows: 75000


In [10]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer

# Identity transformer so FeatureUnion can feed the same text to two vectorizers
identity = FunctionTransformer(lambda s: s, validate=False)

features = FeatureUnion([
    ("word", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="word",
            ngram_range=(1,2),
            max_features=350_000,
            min_df=2
        ))
    ])),
    ("char", Pipeline([
        ("id", identity),
        ("tfidf", TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=(3,6),
            max_features=300_000,
            min_df=2
        ))
    ])),
], n_jobs=1)  # keep 1 for RAM safety

ridge = Ridge(alpha=1.2, random_state=42)

pipe_wc = Pipeline([
    ("features", features),
    ("ridge", ridge),
])

cv = KFold(n_splits=5, shuffle=True, random_state=42)
maes = []
for tr, va in cv.split(X_text_tr):
    pipe_wc.fit(X_text_tr.iloc[tr], y[tr])
    pred = pipe_wc.predict(X_text_tr.iloc[va])
    maes.append(mean_absolute_error(y[va], pred))
print("Word+Char TF-IDF Ridge 5-fold MAE:", np.mean(maes), "±", np.std(maes))

# Fit full and predict
pipe_wc.fit(X_text_tr, y)
pred_wc = pipe_wc.predict(X_text_te)

pd.DataFrame({"sample_id": test["sample_id"], "price": pred_wc}) \
  .to_csv(ART / "submission_wc.csv", index=False)


KeyboardInterrupt: 

In [3]:
def extract_meta(s: pd.Series) -> pd.DataFrame:
    s = s.fillna("").astype(str)
    # numbers present
    nums = s.str.findall(r"(?<![a-zA-Z])(\d+(?:\.\d+)?)")
    num_count = nums.apply(len)
    max_num = nums.apply(lambda xs: max(map(float, xs)) if xs else np.nan)
    min_num = nums.apply(lambda xs: min(map(float, xs)) if xs else np.nan)

    # pack size: "pack of 2", "2-pack", "2 pack"
    pack = s.str.extract(r"(?:pack of|pack)\s*(\d+)|(\d+)\s*-\s*pack|(\d+)\s*pack", expand=True)
    pack_n = pack.apply(lambda row: next((int(x) for x in row if pd.notna(x)), np.nan), axis=1)

    # volumes/weights normalized
    # capture '500 ml', '0.5 l', '12 oz', '250 g', '1 kg', '1 lb'
    uw = s.str.findall(r"(\d+(?:\.\d+)?)\s*(ml|l|oz|g|kg|lb)")
    def norm_units(pairs):
        ml = g = None
        for val, unit in pairs:
            v = float(val)
            if unit == "ml": ml = (ml or 0) + v
            elif unit == "l":  ml = (ml or 0) + v*1000
            elif unit == "oz": g  = (g  or 0) + v*28.3495
            elif unit == "g":  g  = (g  or 0) + v
            elif unit == "kg": g  = (g  or 0) + v*1000
            elif unit == "lb": g  = (g  or 0) + v*453.592
        return pd.Series({"vol_ml": ml if ml is not None else np.nan,
                          "wt_g":  g  if g  is not None else np.nan})
    unit_df = uw.apply(norm_units)

    df = pd.DataFrame({
        "num_count": num_count.astype(float),
        "max_num": max_num.astype(float),
        "min_num": min_num.astype(float),
        "pack_n":  pack_n.astype(float),
    })
    df = pd.concat([df, unit_df], axis=1)
    # fill NaNs with 0 for modeling
    return df.fillna(0.0)

meta_tr = extract_meta(train[TEXT_COL])
meta_te = extract_meta(test[TEXT_COL])

meta_tr.head(), meta_te.head()


NameError: name 'train' is not defined

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Build a DataFrame combining text + meta
train_aug = pd.DataFrame({TEXT_COL: X_text_tr})
test_aug  = pd.DataFrame({TEXT_COL: X_text_te})
for c in meta_tr.columns:
    train_aug[c] = meta_tr[c]
    test_aug[c]  = meta_te[c]

numeric_cols = meta_tr.columns.tolist()

ct = ColumnTransformer([
    ("tfidf_word", TfidfVectorizer(analyzer="word", ngram_range=(1,2), max_features=300_000), TEXT_COL),
    ("num", Pipeline([("scale", StandardScaler(with_mean=False))]), numeric_cols),
], remainder="drop", n_jobs=None)

model = Ridge(alpha=1.0, random_state=42)

pipe_tm = Pipeline([
    ("ct", ct),
    ("ridge", model),
])

maes=[]
for tr, va in cv.split(train_aug):
    pipe_tm.fit(train_aug.iloc[tr], y[tr])
    pp = pipe_tm.predict(train_aug.iloc[va])
    maes.append(mean_absolute_error(y[va], pp))
print("TF-IDF(word)+Meta Ridge 5-fold MAE:", np.mean(maes), "±", np.std(maes))

pipe_tm.fit(train_aug, y)
pred_tm = pipe_tm.predict(test_aug)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_tm}) \
  .to_csv(ART / "submission_word_meta.csv", index=False)


NameError: name 'TEXT_COL' is not defined

In [5]:
# We already have pred_wc (word+char) and pred_tm (word+meta).
# Uniform average is a solid start:
ens_pred = 0.5 * pred_wc + 0.5 * pred_tm
pd.DataFrame({"sample_id": test["sample_id"], "price": ens_pred}) \
  .to_csv(ART / "submission_ensemble_v1.csv", index=False)


NameError: name 'pred_wc' is not defined

In [6]:
# pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import Ridge

emb_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # 384-dim

# Encode in manageable batches
tr_emb = emb_model.encode(X_text_tr.tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)
te_emb = emb_model.encode(X_text_te.tolist(), batch_size=512, show_progress_bar=True, convert_to_numpy=True)

ridge_emb = Ridge(alpha=1.0, random_state=42)
maes=[]
for tr, va in cv.split(tr_emb):
    ridge_emb.fit(tr_emb[tr], y[tr])
    pp = ridge_emb.predict(tr_emb[va])
    maes.append(mean_absolute_error(y[va], pp))
print("Sentence-Embeddings Ridge 5-fold MAE:", np.mean(maes), "±", np.std(maes))

ridge_emb.fit(tr_emb, y)
pred_emb = ridge_emb.predict(te_emb)
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_emb}) \
  .to_csv(ART / "submission_sbert.csv", index=False)

# Try a 3-way blend
pred_blend3 = 0.4*pred_wc + 0.4*pred_tm + 0.2*pred_emb
pd.DataFrame({"sample_id": test["sample_id"], "price": pred_blend3}) \
  .to_csv(ART / "submission_ensemble_v2.csv", index=False)


ModuleNotFoundError: No module named 'sentence_transformers'