In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ==============================================================================
# ## 1. Setup and Data Loading
# ==============================================================================
# Location of the training CSV; adjust this to wherever the dataset lives locally.
TRAIN_CSV_PATH = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'

train_df = pd.read_csv(TRAIN_CSV_PATH)
# Rows without a target price cannot be used for supervised training.
train_df = train_df.dropna(subset=['price'])
# Fill missing descriptions BEFORE casting to str: on object-dtype columns
# astype(str) turns NaN into the literal string 'nan', after which fillna('')
# has nothing left to replace.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)
print("Data loaded successfully.")

# ==============================================================================
# ## 2. Feature Engineering
# ==============================================================================
print("\n--- Engineering Features ---")
# --- Feature Functions ---
def extract_quantity(text):
    """Return the pack/case/count quantity mentioned in ``text``.

    The text is lowercased and the patterns below are tried in order; the
    first pattern that matches anywhere in the text determines the result.
    Returns 1 when none of the patterns match.
    """
    lowered = text.lower()
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*[-]?pack',
        r'(\d+)\s*pk',
        r'(\d+)\s*per case',
        r'case of (\d+)',
        r'(\d+)\s*count',
    )
    # Lazily evaluate the searches so we stop at the first successful pattern.
    attempts = (re.search(pattern, lowered) for pattern in quantity_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return 1
    return int(first_hit.group(1))

def categorize_product(text):
    """Assign a coarse product category based on keywords found in ``text``.

    The text is lowercased and checked against each category's keywords in the
    order electronics, grocery, home_goods, health_beauty; the first category
    with any keyword present wins.  Matching is plain substring containment
    (``keyword in text``), so a keyword also matches inside a longer word.
    Returns ``'unknown'`` when no keyword is found.
    """
    lowered = text.lower()
    keyword_table = (
        ('electronics', ('phone', 'camera', 'tv', 'laptop', 'headphone', 'cable', 'charger')),
        ('grocery', ('coffee', 'tea', 'snack', 'chocolate', 'organic', 'sugar', 'gluten free', 'sauce', 'candy')),
        ('home_goods', ('shed', 'furniture', 'kitchen', 'decor', 'towel', 'blanket', 'pillow')),
        ('health_beauty', ('cream', 'shampoo', 'lotion', 'vitamin', 'supplement')),
    )
    for label, keywords in keyword_table:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return 'unknown'

# --- Unit-Based Numeric Features ---
# One compiled pattern per unit; each captures the number directly preceding the unit.
_UNIT_PATTERNS = {
    'gb': re.compile(r'(\d+\.?\d*)\s*gb'),
    'oz': re.compile(r'(\d+\.?\d*)\s*(?:oz|ounce)'),
    'inch': re.compile(r'(\d+\.?\d*)\s*(?:inch|")'),
}


def _extract_unit_features(text):
    """Return ``{'feat_<unit>': value}`` for each unit measurement found in ``text``.

    The text is lowercased and searched once per unit in ``_UNIT_PATTERNS``;
    only the first occurrence of each unit is used.  Units that do not appear
    are omitted from the dict, so they become NaN once rows are assembled
    into a DataFrame.
    """
    lowered = text.lower()
    features = {}
    for unit, pattern in _UNIT_PATTERNS.items():
        match = pattern.search(lowered)
        if match:
            features[f'feat_{unit}'] = float(match.group(1))
    return features


# --- Apply Feature Functions ---
# Rows were dropped during loading, so reset to a clean positional index and
# build the feature frame on that same index.  The column-wise concat then
# aligns rows explicitly instead of relying on both frames happening to share
# a default RangeIndex.
train_df = train_df.reset_index(drop=True)
numerical_features_df = pd.DataFrame(
    [_extract_unit_features(text) for text in train_df['catalog_content']],
    index=train_df.index,
)
train_df = pd.concat([train_df, numerical_features_df], axis=1)
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
train_df['category'] = train_df['catalog_content'].apply(categorize_product)

# ==============================================================================
# ## 2.1 NEW: Feature Creation Diagnostics
# ==============================================================================
print("\n--- Feature Diagnostics ---")

# Most frequent extracted quantities.
quantity_counts = train_df['quantity'].value_counts()
print("Distribution of 'quantity' feature (top 5):")
print(quantity_counts.head())

# Share of rows per keyword-derived category, rendered as percentage strings.
category_share = train_df['category'].value_counts(normalize=True)
print("\nDistribution of 'category' feature (%):")
print(category_share.apply("{:.2%}".format))

# ==============================================================================
# ## 3. Model Training and Evaluation
# ==============================================================================
# Unit-derived columns are discovered by their 'feat_' prefix, so the feature
# set adapts to whichever units were actually found in the data.
numerical_cols = [name for name in train_df.columns if name.startswith('feat_')]
feature_columns = ['catalog_content', 'quantity', 'category'] + numerical_cols
X = train_df[feature_columns]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# The regressor is fit on log1p(price); predictions are mapped back with expm1 below.
y_train_log = np.log1p(y_train)

best_params = {
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_estimators': 761,
    'learning_rate': 0.188,
    'num_leaves': 41,
    'max_depth': 17,
    'lambda_l1': 0.04,
    'lambda_l2': 2.53e-06,
    'feature_fraction': 0.73,
    'bagging_fraction': 0.81,
    'bagging_freq': 5,
    'min_child_samples': 9,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}

# Numeric columns: missing values become 0, then everything is standardized.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])
# Text goes through TF-IDF (uni- and bigrams), category is one-hot encoded.
text_vectorizer = TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2))
preprocessor = ColumnTransformer(transformers=[
    ('text', text_vectorizer, 'catalog_content'),
    ('category', OneHotEncoder(handle_unknown='ignore'), ['category']),
    ('numeric', numeric_pipeline, ['quantity'] + numerical_cols),
])
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(**best_params)),
])

print("\n--- Training Model ---")
pipeline.fit(X_train, y_train_log)
val_preds_log = pipeline.predict(X_val)
# Undo the log1p transform and floor the result at zero (prices cannot be negative).
val_preds = np.clip(np.expm1(val_preds_log), 0, None)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``;
    an element whose true and predicted values are both zero contributes 0
    rather than dividing by zero.

    Args:
        y_true: Array-like of actual values (list, ndarray, pandas Series, ...).
        y_pred: Array-like of predicted values with the same length.

    Returns:
        The mean of the per-element terms multiplied by 100.

    Both inputs are coerced to plain float arrays first, so elements are
    compared positionally and list inputs are accepted.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    # `out` pre-filled with zeros supplies the value for the skipped 0/0 positions.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(terms) * 100

overall_smape = smape(y_val, val_preds)
print(f"\nOverall SMAPE on validation set: {overall_smape:.4f}")

# ==============================================================================
# ## 4. NEW: Advanced Evaluation Diagnostics
# ==============================================================================
print("\n--- Advanced Evaluation Diagnostics ---")
# Validation features alongside the true and predicted price, used for the
# grouped error breakdowns that follow.
eval_df = X_val.assign(price=y_val, predicted_price=val_preds)

# --- Per-Category SMAPE ---
def smape_group(group):
    """Return the SMAPE between a group's 'price' and 'predicted_price' columns."""
    actual, predicted = group['price'], group['predicted_price']
    return smape(actual, predicted)

# Select only the two price columns before apply() so the grouping column is
# never handed to smape_group; pandas deprecates apply() operating on the
# grouping columns and warns when they are included implicitly.
score_columns = ['price', 'predicted_price']

category_performance = (
    eval_df.groupby('category')[score_columns]
    .apply(smape_group)
    .sort_values(ascending=False)
)
print("\nSMAPE Performance by Category:")
print(category_performance)

# --- Price Tier SMAPE ---
# Bins are right-closed: (0, 25], (25, 100], (100, 500], (500, inf].
# NOTE(review): a price of exactly 0 falls outside the first bin and gets a
# NaN tier, so it is left out of this breakdown — confirm that is acceptable.
price_bins = [0, 25, 100, 500, np.inf]
eval_df['price_tier'] = pd.cut(eval_df['price'], bins=price_bins)
# price_tier is categorical; pass `observed` explicitly so the result does not
# depend on (or warn about) the changing pandas default.
tier_performance = (
    eval_df.groupby('price_tier', observed=False)[score_columns]
    .apply(smape_group)
)
print("\nSMAPE Performance by Price Tier:")
print(tier_performance)

Data loaded successfully.

--- Engineering Features ---

--- Feature Diagnostics ---
Distribution of 'quantity' feature (top 5):
quantity
1     43663
6      5957
12     5628
2      4636
3      3968
Name: count, dtype: int64

Distribution of 'category' feature (%):
category
grocery          66.97%
unknown          27.08%
health_beauty     3.16%
home_goods        2.22%
electronics       0.57%
Name: proportion, dtype: object

--- Training Model ---

Overall SMAPE on validation set: 51.4851

--- Advanced Evaluation Diagnostics ---

SMAPE Performance by Category:
category
unknown          60.713490
health_beauty    51.972460
home_goods       49.033802
grocery          47.832410
electronics      38.941247
dtype: float64

SMAPE Performance by Price Tier:
price_tier
(0.0, 25.0]        47.709557
(25.0, 100.0]      57.725240
(100.0, 500.0]     90.726003
(500.0, inf]      180.521334
dtype: float64


  category_performance = eval_df.groupby('category').apply(smape_group).sort_values(ascending=False)
  tier_performance = eval_df.groupby('price_tier').apply(smape_group)
  tier_performance = eval_df.groupby('price_tier').apply(smape_group)
