In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. Load Data & Define Feature Functions ---
train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
# Rows without a target price cannot be used for training or evaluation.
train_df = train_df.dropna(subset=['price'])
# Fill missing text BEFORE casting to str: astype(str) can turn NaN into the
# literal string 'nan', after which fillna('') has nothing left to replace and
# the vectorizer would see "nan" as a real token.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)

# Compiled once at import time. Order matters: patterns are tried in this
# sequence and the first one that matches anywhere in the text wins.
_QUANTITY_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'pack of (\d+)',
        r'(\d+)\s*[-]?pack',
        r'(\d+)\s*pk',
        r'(\d+)\s*per case',
        r'case of (\d+)',
        r'(\d+)\s*count',
    )
)


def extract_quantity(text):
    """Return the pack/case/count quantity mentioned in *text*.

    The text is lower-cased and matched against a fixed, ordered list of
    patterns ("pack of N", "N-pack", "N pk", "N per case", "case of N",
    "N count"). The number captured by the first pattern that matches is
    returned as an ``int``.

    Args:
        text: Free-form product text. Non-string values (e.g. ``None`` or a
            float NaN from a missing cell) are treated as containing no
            quantity.

    Returns:
        The captured quantity, or ``1`` when no pattern matches or *text*
        is not a string.
    """
    # Missing cells arrive as None/NaN; fall back to the default instead of
    # raising AttributeError on .lower().
    if not isinstance(text, str):
        return 1

    lowered = text.lower()
    for pattern in _QUANTITY_PATTERNS:
        match = pattern.search(lowered)
        if match:
            return int(match.group(1))
    return 1

# Derive one integer quantity per row from the product text.
train_df['quantity'] = train_df['catalog_content'].map(extract_quantity)
print("Quantity feature created.")

# --- 2. Create Hold-Out Set ---
# Features: raw text plus the extracted quantity; target: price.
X = train_df[['catalog_content', 'quantity']]
y = train_df['price']

# 80/20 split with a fixed seed so the hold-out set is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The model is fitted on log1p(price); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)

# --- 3. Build the Hybrid V5 Pipeline ---
# LightGBM hyper-parameters, passed straight through to LGBMRegressor.
best_params = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=761,
    learning_rate=0.188,
    num_leaves=41,
    max_depth=17,
    lambda_l1=0.04,
    lambda_l2=2.53e-06,
    feature_fraction=0.73,
    bagging_fraction=0.81,
    bagging_freq=5,
    min_child_samples=9,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# TF-IDF over the text column (unigrams + bigrams, capped vocabulary);
# the quantity column is forwarded unchanged.
text_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=30000,
    ngram_range=(1, 2),
)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_vectorizer, 'catalog_content'),
        ('numeric', 'passthrough', ['quantity']),
    ]
)

pipeline_v5 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(**best_params)),
    ]
)

# --- 4. Train and Evaluate the V5 Model ---
print("\nTraining V5 model (V1 + quantity)...")
pipeline_v5.fit(X_train, y_train_log)

print("\nEvaluating V5 model...")
# Predictions come out in log1p space; invert to get prices.
val_preds_log = pipeline_v5.predict(X_val)
val_preds = np.expm1(val_preds_log)
# Floor any negative predicted price at zero.
negative_mask = val_preds < 0
val_preds[negative_mask] = 0

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|pred - true| / ((|true| + |pred|) / 2)``;
    elements where that denominator is zero (both values are zero)
    contribute ``0``. The mean of these ratios is multiplied by 100.

    Args:
        y_true: Array-like of actual values (list, ndarray, or Series).
        y_pred: Array-like of predicted values, paired with *y_true* by
            position.

    Returns:
        The SMAPE as a float in the range 0-200.
    """
    # Convert to plain float arrays so elements are paired by position.
    # Operating on pandas objects directly would align them on index labels
    # (yielding NaN for mismatched indices), and plain lists do not support
    # element-wise subtraction at all.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    # `out` pre-fills zeros so positions skipped by `where` (0/0) count as 0.
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratios) * 100

# Score the hold-out predictions against the true prices.
v5_smape = smape(y_val, val_preds)

# The V1 figure is a hard-coded baseline (presumably from an earlier run);
# only the V5 score is computed here.
report_lines = [
    "\n--- Model Performance Comparison ---",
    "V1 Model SMAPE (tuned, text only): 51.9241",
    f"V5 Model SMAPE (text + quantity): {v5_smape:.4f}",
]
print("\n".join(report_lines))

Quantity feature created.

Training V5 model (V1 + quantity)...

Evaluating V5 model...

--- Model Performance Comparison ---
V1 Model SMAPE (tuned, text only): 51.9241
V5 Model SMAPE (text + quantity): 51.5754
