In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from transformers import pipeline as hf_pipeline
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
# Note: You may need to adjust this path depending on where you launch Jupyter.
# It's best to launch Jupyter from your 'Amazon' project root.
train_df = pd.read_csv('input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
train_df['title'] = train_df['catalog_content'].apply(lambda t: (m.group(1).strip() if (m := re.search(r'^item name:\s*(.*)', t, re.I | re.M)) else t))
print("Data loaded successfully.")

# --- 2. Feature Engineering ---
print("Performing feature engineering...")
# Advanced Categorization
classifier = hf_pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-3")
candidate_labels = ['electronics', 'grocery', 'home goods', 'health & beauty', 'apparel', 'books', 'automotive', 'sports']
try:
    cached_categories = pd.read_csv('cached_categories_distilbart.csv')
    train_df['category'] = cached_categories['category']
    print("Loaded categories from cache.")
except FileNotFoundError:
    titles_to_classify = train_df['title'].tolist()
    results = classifier(titles_to_classify, candidate_labels, batch_size=64, multi_label=False)
    train_df['category'] = [result['labels'][0] for result in results]
    train_df[['category']].to_csv('cached_categories_distilbart.csv', index=False)
    print("Categorization complete and saved to cache.")

# Brand-Tier Feature
train_df['brand'] = train_df['catalog_content'].apply(lambda t: (m.group(1).strip().lower() if (m := re.search(r'^(?:brand|manufacturer):\s*(.*)', t, re.I | re.M)) and m.group(1) else 'unknown'))
avg_price_by_brand = train_df.groupby('brand')['price'].mean().to_dict()
train_df['brand_avg_price'] = train_df['brand'].map(avg_price_by_brand).fillna(train_df['price'].median())

# Quantity Feature
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)', r'(\d+)\s*count']
    for p in patterns:
        m=re.search(p,text)
        if m: return int(m.group(1))
    return 1
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
print("All features created.")

# --- 3. Create Hold-Out Set ---
feature_cols = ['catalog_content', 'category', 'brand_avg_price', 'quantity']
X = train_df[feature_cols]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)

# --- 4. Build the V12 Pipeline ---
best_params = { 'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'lambda_l1': 0.04, 'lambda_l2': 2.53e-06, 'feature_fraction': 0.73, 'bagging_fraction': 0.81, 'bagging_freq': 5, 'min_child_samples': 9, 'random_state': 42, 'verbose': -1, 'n_jobs': -1 }
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
    ('category', OneHotEncoder(handle_unknown='ignore'), ['category']),
    ('numeric', numeric_pipeline, ['brand_avg_price', 'quantity'])
])
pipeline_v12 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])

# --- 5. Train and Evaluate ---
print("\nTraining V12 model using CPU parallelization...")
pipeline_v12.fit(X_train, y_train_log)
print("\nEvaluating V12 model...")
val_preds_log = pipeline_v12.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100

v12_smape = smape(y_val, val_preds)

print("\n--- Model Performance ---")
print(f"V12 Model SMAPE (all features, CPU parallel): {v12_smape:.4f}")

Data loaded successfully.
Performing feature engineering...


Device set to use mps:0


Loaded categories from cache.
All features created.

Training V12 model using CPU parallelization...

Evaluating V12 model...

--- Model Performance ---
V12 Model SMAPE (all features, CPU parallel): 51.7495
