In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. Load Data ---
train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
print("Training data loaded successfully.")

# --- 2. Feature Engineering ---
def extract_quantity(text):
    text = text.lower()
    patterns = [
        r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)',
        r'(\d+)\s*count', r'(\d+)\s*ct', r'(\d+)\s*ea', r'total (\d+)', r'(\d+)\s*servings',
        r'(\d+)\s*pcs', r'(\d+)\s*stems'
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match: return int(match.group(1))
    return 1

def extract_brand(text):
    # This is a simplified extractor; we can make it more robust later.
    match = re.search(r'^(?:brand|manufacturer):\s*(.*)', text, re.IGNORECASE | re.MULTILINE)
    if match: return match.group(1).strip().lower()
    return 'unknown'

train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
train_df['brand'] = train_df['catalog_content'].apply(extract_brand)

# Remove rows where quantity is 0 to avoid division by zero
train_df = train_df[train_df['quantity'] > 0]
print(f"Data shape after cleaning: {train_df.shape}")

# --- 3. Create the New Price-Per-Unit Target ---
train_df['price_per_unit'] = train_df['price'] / train_df['quantity']
print("Created 'price_per_unit' target.")

# --- 4. Create Hold-Out Set ---
X = train_df[['catalog_content', 'quantity', 'brand']]
y = train_df[['price', 'price_per_unit', 'quantity']] # Pass all needed columns for evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

y_train_log_ppu = np.log1p(y_train['price_per_unit'])

# --- 5. Build the V3 Pipeline ---
best_params = {
    'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 761, 'learning_rate': 0.188,
    'num_leaves': 41, 'max_depth': 17, 'lambda_l1': 0.04, 'lambda_l2': 2.53e-06,
    'feature_fraction': 0.73, 'bagging_fraction': 0.81, 'bagging_freq': 5,
    'min_child_samples': 9, 'random_state': 42, 'n_jobs': -1, 'verbose': -1
}

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
        ('brand', OneHotEncoder(handle_unknown='ignore'), ['brand']),
        ('numeric', 'passthrough', ['quantity'])
    ])

pipeline_v3 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])

# --- 6. Train and Evaluate ---
print("\nTraining V3 PPU model...")
pipeline_v3.fit(X_train, y_train_log_ppu)

print("\nEvaluating V3 PPU model...")
val_preds_log_ppu = pipeline_v3.predict(X_val)
val_preds_ppu = np.expm1(val_preds_log_ppu)
val_preds_ppu[val_preds_ppu < 0] = 0

# Reconstruct the final price from the per-unit prediction
final_predictions = val_preds_ppu * X_val['quantity']

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100

v3_smape = smape(y_val['price'], final_predictions)

print("\n--- Model Performance Comparison ---")
print(f"V1 Model SMAPE (text only, predicting total price): 51.9241")
print(f"V3 Model SMAPE (predicting price-per-unit): {v3_smape:.4f}")

Training data loaded successfully.
Data shape after cleaning: (74983, 6)
Created 'price_per_unit' target.

Training V3 PPU model...

Evaluating V3 PPU model...

--- Model Performance Comparison ---
V1 Model SMAPE (text only, predicting total price): 51.9241
V3 Model SMAPE (predicting price-per-unit): 53.6539
