In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re # Added for title extraction

# --- 1. Load Data and Final Model ---
train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
# Rows without a target price cannot be scored, so drop them up front.
train_df = train_df.dropna(subset=['price'])
# Fill missing text BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna('') has nothing left to fill.
# NOTE(review): confirm the training notebook prepared missing text the same
# way, otherwise the model sees different input for those rows here.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)

model_filename = 'lgbm_price_model_v1.joblib'
final_pipeline = joblib.load(model_filename)
print("Training data and final model loaded.")

# --- 2. Create the Validation Set ---
# NOTE(review): this is only a true hold-out set if the saved model was fitted
# on the matching train part of this exact split (same row filtering,
# test_size=0.2, random_state=42) - confirm against the training notebook.
X = train_df['catalog_content']
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Analyzing errors on a validation set of {len(X_val)} samples.")

# --- 3. Make Predictions and Create Analysis DataFrame ---
# The pipeline output is treated as log1p(price) (presumably the training
# target - TODO confirm); expm1 maps it back to the price scale.
val_preds_log = final_pipeline.predict(X_val)
val_preds = np.expm1(val_preds_log)
# expm1 can return values in (-1, 0); a price cannot be negative.
val_preds[val_preds < 0] = 0

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``;
    elements whose denominator is zero (both values are 0) contribute 0.

    Args:
        y_true: Array-like of actual values (list, ndarray, pandas Series, ...).
        y_pred: Array-like of predicted values, broadcastable against y_true.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Coerce to plain float arrays so that lists work and pandas inputs are
    # compared by position rather than being re-aligned on their index labels.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(forecast - actual)
    scale = (np.abs(actual) + np.abs(forecast)) / 2

    # `out` pre-fills zeros; `where` skips the division wherever scale == 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100

# One row per validation sample: the raw text, the true price and the
# model's predicted price.
analysis_df = pd.DataFrame({
    'catalog_content': X_val,
    'price': y_val,
    'predicted_price': val_preds
})

# Per-row symmetric absolute percentage error, in percent. Calling smape()
# here would store its single global mean in every row, which makes the
# column useless for finding the worst individual predictions.
row_abs_error = (analysis_df['predicted_price'] - analysis_df['price']).abs()
row_scale = (analysis_df['price'].abs() + analysis_df['predicted_price'].abs()) / 2
# Rows where both prices are 0 have a zero denominator; score them as 0.
analysis_df['smape_error'] = (row_abs_error / row_scale).where(row_scale != 0, 0.0) * 100


# --- 4. Advanced Evaluation Diagnostics ---
print("\n--- Advanced Evaluation Diagnostics ---")

# --- Per-Category SMAPE (as before) ---
def categorize_product(text):
    """Assign a coarse product category to a catalog text via keyword lookup.

    The text is lower-cased and the categories are tried in a fixed order
    (electronics, grocery, home_goods, health_beauty); the first category
    with any keyword occurring in the text wins. Matching is by substring,
    not by whole word, so e.g. 'tea' also matches "steak".

    Args:
        text: The catalog text to classify (a str).

    Returns:
        The matching category name, or 'unknown' when no keyword occurs.
    """
    keyword_groups = (
        ('electronics', ('phone', 'camera', 'tv', 'laptop', 'headphone')),
        ('grocery', ('coffee', 'tea', 'snack', 'organic', 'sugar', 'candy')),
        ('home_goods', ('shed', 'furniture', 'kitchen', 'decor', 'towel')),
        ('health_beauty', ('cream', 'shampoo', 'lotion', 'vitamin')),
    )
    haystack = text.lower()
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in haystack:
                return label
    return 'unknown'
# Tag every validation row with its keyword-derived category.
analysis_df['category'] = analysis_df['catalog_content'].map(categorize_product)

def smape_group(group):
    """Return the SMAPE of one group of rows.

    Args:
        group: A DataFrame (or similar mapping of columns) providing the
            'price' and 'predicted_price' columns.

    Returns:
        Whatever smape() returns for those two columns.
    """
    actual_prices = group['price']
    predicted_prices = group['predicted_price']
    return smape(actual_prices, predicted_prices)

# smape_group only reads these two columns. Selecting them explicitly keeps
# the grouping column out of the frames passed to apply(), which newer pandas
# versions deprecate when it is left implicit.
price_columns = ['price', 'predicted_price']

category_performance = (
    analysis_df.groupby('category')[price_columns]
    .apply(smape_group)
    .sort_values(ascending=False)
)
print("\nSMAPE Performance by Category:")
print(category_performance)

# --- NEW: Granular Price Tier SMAPE ---
# We define more, smaller bins to get a clearer picture.
# Bins are right-closed, so the first tier is (0, 20]: a price of exactly 0
# falls outside every bin, gets a NaN tier and is left out of the groupby.
price_bins = [0, 20, 50, 100, 250, 500, 1000, np.inf]
analysis_df['price_tier'] = pd.cut(analysis_df['price'], bins=price_bins)
# price_tier is categorical; observed=False states the current default
# explicitly (every tier is kept) instead of relying on a default that
# pandas has announced will change.
tier_performance = (
    analysis_df.groupby('price_tier', observed=False)[price_columns]
    .apply(smape_group)
)

print("\nSMAPE Performance by Granular Price Tier:")
print(tier_performance)

Training data and final model loaded.
Analyzing errors on a validation set of 15000 samples.





--- Advanced Evaluation Diagnostics ---

SMAPE Performance by Category:
category
unknown          60.829526
health_beauty    52.031483
home_goods       49.533672
grocery          46.398252
electronics      41.451678
dtype: float64

SMAPE Performance by Granular Price Tier:
price_tier
(0.0, 20.0]         48.266161
(20.0, 50.0]        51.577645
(50.0, 100.0]       67.876806
(100.0, 250.0]      89.750208
(250.0, 500.0]     141.530743
(500.0, 1000.0]    174.182146
(1000.0, inf]      187.236507
dtype: float64


  category_performance = analysis_df.groupby('category').apply(smape_group).sort_values(ascending=False)
  tier_performance = analysis_df.groupby('price_tier').apply(smape_group)
  tier_performance = analysis_df.groupby('price_tier').apply(smape_group)
