In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split

# --- 1. Load the TRAINING Data and the Final Trained Model ---
# We use the training data to create a validation set for analysis.
train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
# Rows without a target price cannot be scored, so they are removed up front.
train_df = train_df.dropna(subset=['price'])
# Fill missing text BEFORE casting to str. With object-dtype columns, astype(str)
# turns NaN into the literal string 'nan', after which fillna('') has nothing left
# to fill and the model would be fed the token "nan" for missing descriptions.
# NOTE(review): if the training notebook used the old astype(str).fillna('') order,
# rows with a missing catalog_content were seen as 'nan' during training -- confirm
# and keep both notebooks in sync.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)

# NOTE: joblib.load unpickles the file; only load model artifacts from a trusted source.
model_filename = 'lgbm_price_model_v1.joblib'
final_pipeline = joblib.load(model_filename)
print("Training data and final model loaded.")

# --- 2. Create the Same Hold-Out Validation Set Used for Final Scoring ---
# Re-running train_test_split with the same test fraction and seed on the same rows
# yields the same partition, so the errors analyzed below belong to the hold-out rows
# behind the final score.
X, y = train_df['catalog_content'], train_df['price']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The model was trained on X_train. We will analyze its errors on X_val.
print(f"Analyzing errors on a validation set of {len(X_val)} samples.")

# --- 3. Make Predictions on the Validation Set ---
# The pipeline output is treated as log1p(price); expm1 maps it back to the price scale.
val_preds_log = final_pipeline.predict(X_val)
val_preds = np.expm1(val_preds_log)
# Floor any negative back-transformed prediction at zero (in place).
is_negative = val_preds < 0
val_preds[is_negative] = 0

# --- 4. Calculate Per-Sample Error and Create Analysis DataFrame ---
def individual_smape(y_true, y_pred):
    """Return the per-sample symmetric absolute percentage error, in percent.

    Each value is ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2) * 100``.
    Where both inputs are zero the denominator is zero; the error is then
    reported as 0 instead of dividing.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values (lists, NumPy arrays, pandas Series, ...)
        with broadcast-compatible shapes. Values are paired by position;
        pandas index labels are ignored.

    Returns
    -------
    numpy.ndarray
        Float array of per-sample errors with the broadcast shape of the inputs.
    """
    # Coerce to plain float arrays first so that lists work and so that pandas
    # index alignment / ufunc dispatch cannot affect the element-wise pairing.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    numerator = np.abs(pred_arr - true_arr)
    denominator = (np.abs(true_arr) + np.abs(pred_arr)) / 2

    # The pre-filled zeros are left untouched wherever the denominator is 0,
    # because `where` skips the division for those positions.
    ratio = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio * 100

# Gather the validation text, true price and predicted price in one frame, then
# derive each row's relative (SMAPE) error and absolute price difference.
analysis_df = pd.DataFrame(
    {'catalog_content': X_val, 'price': y_val, 'predicted_price': val_preds}
).assign(
    smape_error=lambda frame: individual_smape(frame['price'], frame['predicted_price']),
    price_delta=lambda frame: (frame['predicted_price'] - frame['price']).abs(),
)

# --- 5. Sort by Error and Display the Worst Predictions ---
# Largest SMAPE first, so the head of the frame holds the worst rows.
worst_predictions = analysis_df.sort_values('smape_error', ascending=False)

print("\n--- Top 50 Worst Predictions (Sorted by SMAPE) ---")
# Raise the per-column character limit to 500 so more of the catalog text is shown.
pd.set_option('display.max_colwidth', 500)
display_cols = ['price', 'predicted_price', 'smape_error', 'price_delta', 'catalog_content']
print(worst_predictions.loc[:, display_cols].iloc[:50])

Training data and final model loaded.
Analyzing errors on a validation set of 15000 samples.





--- Top 50 Worst Predictions (Sorted by SMAPE) ---
          price  predicted_price  smape_error  price_delta  \
24856     1.180        77.339157   193.988728    76.159157   
9273      1.990       123.556952   193.659743   121.566952   
33685     0.680        37.975642   192.963511    37.295642   
59934     0.980        50.035593   192.316075    49.055593   
28165     1.680        82.285427   191.996706    80.605427   
18709   286.770         7.354743   189.997791   279.415257   
26873     1.915        74.243556   189.942036    72.328556   
30774   143.300         3.899200   189.404290   139.400800   
828     107.490         3.358012   187.882465   104.131988   
32428     2.990        93.045809   187.546312    90.055809   
6213    390.980        12.862220   187.260153   378.117780   
58617  2796.000        92.157443   187.236507  2703.842557   
68587   283.980         9.515517   187.031466   274.464483   
22215     0.435        12.641110   186.693290    12.206110   
7128      1.835   