In [5]:
# =========================================================
# SMART PRODUCT PRICING 2025 – LightGBM + Ridge Blend (Colab Universal)
# =========================================================

!pip install -q lightgbm scikit-learn

import pandas as pd, numpy as np, re, gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from scipy.sparse import hstack
import lightgbm as lgb
from google.colab import files

# =========================================================
# STEP 1: Upload Files
# =========================================================
# File names used by the rest of the script. The CSVs are opened by these
# exact names, so the uploaded files are expected to be called this.
TRAIN, TEST, OUT = "train.csv", "test.csv", "test_out.csv"

# Show the Colab upload widget and keep whatever it returns.
print("📂 Please upload: train.csv and test.csv")
uploaded = files.upload()

# Echo the names of the files that were actually uploaded.
print("\nUploaded files:", list(uploaded.keys()))

# =========================================================
# STEP 2: Helper functions
# =========================================================
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(s):
    """Return *s* as a lower-cased string with whitespace normalised.

    The value is first converted with ``str()``, so non-string inputs are
    accepted. Every run of whitespace is collapsed to a single space and
    leading/trailing whitespace is removed.
    """
    lowered = str(s).lower()
    collapsed = _WHITESPACE_RUN.sub(" ", lowered)
    return collapsed.strip()

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error as a fraction.

    Each pair contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Pairs whose denominator is zero (both values are zero) are excluded from
    the mean.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        The mean of the per-pair errors over the pairs with a non-zero
        denominator, or ``0.0`` when no such pair exists (empty input, or
        every pair is ``(0, 0)``).
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    mask = denom != 0
    # Without this guard np.mean() would run on an empty array and return
    # NaN (with a RuntimeWarning) instead of a usable score.
    if not mask.any():
        return 0.0
    return np.mean(np.abs(y_true[mask] - y_pred[mask]) / denom[mask])

# =========================================================
# STEP 3: Load data
# =========================================================
# Read both CSVs by their configured names and report their shapes.
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)
print(f"✅ Train shape: {train.shape}")
print(f"✅ Test shape:  {test.shape}")

# Apply the same text normalisation to both frames: missing descriptions
# become empty strings before clean_text is applied to every row.
for frame in (train, test):
    frame["catalog_content"] = frame["catalog_content"].fillna("").map(clean_text)

# =========================================================
# STEP 4: TF-IDF Features
# =========================================================
# Unigram + bigram TF-IDF with sublinear term-frequency scaling; terms seen
# in fewer than 3 documents or in more than 90% of documents are dropped.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)

# The vectorizer is fit on train and test text stacked together (train rows
# first), so both matrices share one vocabulary and one set of IDF weights.
full = pd.concat([train["catalog_content"], test["catalog_content"]])
X_all = tfidf.fit_transform(full)

# Split the stacked matrix back into its train and test parts.
n_train = len(train)
X_train, X_test = X_all[:n_train], X_all[n_train:]

# =========================================================
# STEP 5: Targets
# =========================================================
# Regression target: price floored at 1e-6, then log1p-transformed.
prices = train["price"].astype(float).values
y = np.log1p(np.clip(prices, 1e-6, None))

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per training row) for each model ...
ridge_oof = np.zeros(len(train))
lgb_oof = np.zeros(len(train))
# ... and running fold-averaged predictions for the test rows.
ridge_preds = np.zeros(len(test))
lgb_preds = np.zeros(len(test))

# =========================================================
# STEP 6: Cross-validation training
# =========================================================
# LightGBM hyper-parameters are identical for every fold, so the dict is
# built once instead of on each iteration.
params = {
    "objective": "regression",
    "metric": "mae",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 128,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1,
    "device_type": "gpu"  # ⚡ GPU acceleration
}

# For each fold: fit Ridge and LightGBM on the training rows, store their
# predictions for the held-out rows, and add this fold's share of the
# test-set prediction to the running averages.
for fold, (tr, va) in enumerate(kf.split(X_train)):
    # Take the fold count from the splitter so the banner stays correct if
    # n_splits is changed.
    print(f"\n==================== Fold {fold+1}/{kf.n_splits} ====================")

    # ---------- Ridge ----------
    rid = Ridge(alpha=2.0, random_state=42)
    rid.fit(X_train[tr], y[tr])
    ridge_oof[va] = rid.predict(X_train[va])
    ridge_preds += rid.predict(X_test) / kf.n_splits

    # ---------- LightGBM ----------
    dtrain = lgb.Dataset(X_train[tr], label=y[tr])
    dvalid = lgb.Dataset(X_train[va], label=y[va])

    # Train LightGBM model for a fixed 400 rounds; the validation set is
    # passed for metric evaluation only (no early-stopping callback is set).
    print("Training LightGBM...")
    lgbm = lgb.train(
        params,
        dtrain,
        num_boost_round=400,
        valid_sets=[dvalid]
    )

    lgb_oof[va] = lgbm.predict(X_train[va])
    lgb_preds += lgbm.predict(X_test) / kf.n_splits
    # Free per-fold objects before the next fold starts.
    gc.collect()

# =========================================================
# STEP 7: Blend + Evaluate
# =========================================================
# Fixed blend weights, applied identically to out-of-fold and test
# predictions (both still in log1p space at this point).
w_ridge, w_lgb = 0.6, 0.4
blend_oof = w_ridge * ridge_oof + w_lgb * lgb_oof
blend_pred = w_ridge * ridge_preds + w_lgb * lgb_preds

# Score the out-of-fold blend after mapping targets and predictions back
# from log1p space with expm1.
oof_final = np.expm1(blend_oof)
cv_smape = smape(np.expm1(y), oof_final)
print(f"\n✅ Blend CV SMAPE ≈ {cv_smape:.4f}")

# Test predictions: back-transform, floor at 0.05, round to 2 decimals.
final_pred = np.round(np.clip(np.expm1(blend_pred), 0.05, None), 2)

# =========================================================
# STEP 8: Save + Download
# =========================================================
# Build the two-column submission (sample_id, price) and write it without
# the index column.
submission_columns = {
    "sample_id": test["sample_id"],
    "price": final_pred,
}
out = pd.DataFrame(submission_columns)
out.to_csv(OUT, index=False)
print(f"\n✅ Saved predictions to: {OUT}")

# Trigger a browser download of the written file from Colab.
files.download(OUT)


📂 Please upload: train.csv and test.csv



Uploaded files: []
✅ Train shape: (75000, 4)
✅ Test shape:  (75000, 3)

Training LightGBM...

Training LightGBM...

Training LightGBM...

Training LightGBM...

Training LightGBM...

✅ Blend CV SMAPE ≈ 0.5156

✅ Saved predictions to: test_out.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>