In [2]:
# --- 1. SETUP & CONFIGURATION ---
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import lightgbm as lgb
import os
import re
from tqdm.auto import tqdm

# --- CONFIGURATION ---
# Directory that holds train.csv, test.csv and final_embeddings.pkl (joined via os.path.join below).
INPUT_DIR = 'input/'
# Seed shared by the train/validation split and both LightGBM regressors.
RANDOM_STATE = 42

# --- 2. HELPER FUNCTIONS ---
def smape(y_true, y_pred):
    """Calculates the overall SMAPE (in percent) for a set of predictions.

    SMAPE per sample is |y_pred - y_true| / ((|y_true| + |y_pred|) / 2); the
    function returns the mean of those ratios multiplied by 100.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        The mean SMAPE as a float in the range [0, 200].

    Notes:
        Inputs are compared positionally (they are converted with
        ``np.asarray``). A pair where both the true and predicted value are 0
        has a zero denominator; it is counted as zero error instead of
        producing NaN and poisoning the mean.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Only divide where the denominator is non-zero; 0/0 pairs keep the
    # pre-filled 0.0 (a perfect prediction of a zero value).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

def individual_smape(y_true, y_pred):
    """Return the per-sample SMAPE (in percent), one value per prediction.

    Each element is |y_pred - y_true| divided by the mean of |y_true| and
    |y_pred|, scaled to percent. The result has the same shape/type as the
    element-wise arithmetic on the inputs produces.
    """
    abs_error = np.abs(y_pred - y_true)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2
    # The epsilon keeps the division finite when both values are zero.
    safe_magnitude = mean_magnitude + 1e-8
    return 100 * (abs_error / safe_magnitude)

# --- 3. DATA LOADING & PREP ---
print("--- Loading Data and Preparing Features ---")
try:
    # Load all data, using 'sample_id' as the index for direct mapping
    train_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='sample_id')
    test_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'), index_col='sample_id')
    # NOTE(security): pickle.load can execute arbitrary code while deserializing.
    # Only load an embeddings file that you produced yourself or otherwise trust.
    with open(os.path.join(INPUT_DIR, 'final_embeddings.pkl'), 'rb') as f:
        image_embeddings_dict = pickle.load(f)
except FileNotFoundError as e:
    print(f"Error loading files: {e}. Please check your INPUT_DIR path.")
    # Re-raise instead of calling exit(): exit() ends the whole interpreter
    # session (and, with no argument, signals a successful exit), whereas
    # re-raising stops execution here with a traceback and keeps the failure
    # visible to whoever is running the notebook or script.
    raise

# Build the modelling frame: keep only rows that have text, a price and an image link,
# then add the log1p-transformed price column.
required_cols = ['catalog_content', 'price', 'image_link']
train_df = train_df_full.dropna(subset=required_cols).copy()
train_df['log_price'] = np.log1p(train_df['price'])

# One numeric feature per unit keyword: the number that immediately precedes the
# keyword (optionally separated by a single whitespace character), or 0 when absent.
# NOTE(review): the pattern has no trailing word boundary, so e.g. 'pack' also
# matches the start of 'package'; str.extract only uses the first match per row.
units = ['gb', 'oz', 'inch', 'mah', 'count', 'pack']
for unit in units:
    regex = r'(\d+\.?\d*)\s?' + re.escape(unit)
    extracted = train_df['catalog_content'].str.extract(regex, flags=re.IGNORECASE)
    train_df[f'feat_{unit}'] = extracted.astype(float).fillna(0)

# Hold out 20% of the rows for validation (seeded for reproducibility)
X_train_df, X_val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
# Models are fitted on log1p(price); validation targets stay on the raw price scale
y_train_log = X_train_df['log_price']
y_val = X_val_df['price']

# Feature extractors: scaler for the regex-derived numeric columns,
# TF-IDF (unigrams + bigrams, capped at 30k terms) for the catalog text
numerical_cols = ['feat_' + unit for unit in units]
scaler = StandardScaler()
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)

# --- 4. TRAIN V8 MODEL (TEXT-ONLY) ---
print("\n--- Training V8 Model (Text + Numericals only) ---")
# Both extractors are fitted on the training split only, then reused for validation
X_train_tfidf = tfidf.fit_transform(X_train_df['catalog_content'])
X_train_num = scaler.fit_transform(X_train_df[numerical_cols])
# Column-wise concatenation: TF-IDF columns first, scaled numeric columns last
X_train_v8 = hstack((X_train_tfidf, X_train_num), format='csr')

# fit() returns the estimator itself, so construction and training can be chained
model_v8 = lgb.LGBMRegressor(random_state=RANDOM_STATE).fit(X_train_v8, y_train_log)

# --- 5. TRAIN V15 MODEL (TEXT + NUMERICALS + IMAGES) ---
print("\n--- Training V15 Model (Text + Numericals + Images) ---")
# Size the lookup table from the largest sample_id across train AND test so that
# any sample_id can be used directly as a row index into the matrix.
# NOTE(review): this assumes sample_id values are non-negative integers — confirm.
all_df = pd.concat([train_df_full, test_df_full])
max_id = all_df.index.max()
embedding_dim = 512 # From our CLIP model
embedding_matrix = np.zeros((max_id + 1, embedding_dim), dtype=np.float32)

# Populate the matrix. Only the 'image_link' column is needed, so iterate over
# that single Series instead of materialising every full row with iterrows(),
# and give tqdm the total so the progress bar can show completion and ETA.
image_links = all_df['image_link']
n_mapped = 0
for sample_id, image_link in tqdm(image_links.items(), total=len(image_links), desc="Mapping Embeddings"):
    # Single lookup: .get returns None both for unknown links and for links
    # whose stored embedding is None.
    embedding = image_embeddings_dict.get(image_link)
    if embedding is not None:
        embedding_matrix[sample_id] = embedding
        n_mapped += 1

# Rows without an embedding silently stay all-zero, so report the coverage.
print(f"Mapped image embeddings for {n_mapped} / {len(image_links)} rows")

# Create the V15 training feature set by adding image embeddings
X_train_img = embedding_matrix[X_train_df.index]
X_train_v15 = hstack([X_train_v8, X_train_img], format='csr')

model_v15 = lgb.LGBMRegressor(random_state=RANDOM_STATE)
model_v15.fit(X_train_v15, y_train_log)

# --- 6. PREPARE VALIDATION DATA & PREDICT ---
print("\n--- Generating Predictions for Both Models on Validation Set ---")

# V8: transform validation text/numericals with the already-fitted extractors,
# predict in log space, then map back to the price scale with expm1
X_val_tfidf = tfidf.transform(X_val_df['catalog_content'])
X_val_num = scaler.transform(X_val_df[numerical_cols])
X_val_v8 = hstack((X_val_tfidf, X_val_num), format='csr')
preds_v8_log = model_v8.predict(X_val_v8)
preds_v8 = np.expm1(preds_v8_log)

# V15: same features plus the image embeddings looked up by sample_id
X_val_img = embedding_matrix[X_val_df.index]
X_val_v15 = hstack((X_val_v8, X_val_img), format='csr')
preds_v15_log = model_v15.predict(X_val_v15)
preds_v15 = np.expm1(preds_v15_log)

# --- 7. FINAL DIAGNOSTIC: COMPARE WORST 50 PREDICTIONS ---
print("\n--- Diagnostic: Comparing Performance on V8's 50 Worst Predictions ---")
# Side-by-side table: true price, both predictions and their per-sample SMAPE
comparison_df = X_val_df[['price', 'catalog_content']].copy()
comparison_df['v8_pred'] = preds_v8
comparison_df['v15_pred'] = preds_v15
for model_tag in ('v8', 'v15'):
    comparison_df[f'{model_tag}_error'] = individual_smape(
        comparison_df['price'],
        comparison_df[f'{model_tag}_pred'],
    )

# Sign convention for the difference column:
#   negative -> V15 (with images) had the LOWER error on that sample
#   positive -> V15 (with images) had the HIGHER error on that sample
comparison_df['error_diff (v15 - v8)'] = comparison_df['v15_error'] - comparison_df['v8_error']

# The 50 validation rows on which the V8 model's error is largest
ranked_by_v8_error = comparison_df.sort_values(by='v8_error', ascending=False)
worst_50_v8 = ranked_by_v8_error.head(50)

report_cols = ['price', 'v8_pred', 'v8_error', 'v15_pred', 'v15_error', 'error_diff (v15 - v8)']
print("Analysis of the 50 samples where the V8 (text-only) model performed worst:")
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 100)
display(worst_50_v8[report_cols])

# --- 8. QUANTITATIVE SUMMARY ---
# Work from the actual number of selected rows: head(50) returns fewer than 50
# rows when the validation set is smaller than that.
error_diff = worst_50_v8['error_diff (v15 - v8)']
n_samples = len(error_diff)
v15_improvements = (error_diff < 0).sum()
v15_worsened = (error_diff > 0).sum()
avg_error_diff = error_diff.mean()

print(f"\n--- Summary on the {n_samples} Most Difficult Samples ---")
print(f"Number of times V15 (with images) improved the prediction: {v15_improvements} / {n_samples}")
print(f"Number of times V15 (with images) worsened the prediction: {v15_worsened} / {n_samples}")
print(f"Average change in SMAPE error after adding images: {avg_error_diff:+.4f} points")

# Caveat: these rows were selected BECAUSE V8's error on them is extreme, so a
# second model will tend to look better on them for that reason alone. Print
# the overall validation SMAPE of both models so the subset result can be read
# in context.
print(f"Overall validation SMAPE - V8: {smape(y_val, preds_v8):.4f} | V15: {smape(y_val, preds_v15):.4f}")

if np.isnan(avg_error_diff):
    # Empty selection (or only NaN errors): there is nothing to conclude.
    print("\nConclusion: No valid samples were available for the comparison.")
elif avg_error_diff > 0:
    print("\nConclusion: On the most difficult predictions, adding images made the model worse on average.")
elif avg_error_diff < 0:
    print("\nConclusion: On the most difficult predictions, adding images provided a slight improvement on average.")
else:
    # Exactly zero average change must not be reported as an improvement.
    print("\nConclusion: On the most difficult predictions, adding images made no difference on average.")

--- Loading Data and Preparing Features ---

--- Training V8 Model (Text + Numericals only) ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.186656 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1664300
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 29535
[LightGBM] [Info] Start training from score 2.740904

--- Training V15 Model (Text + Numericals + Images) ---


Mapping Embeddings: 0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.437131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1794860
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 30047
[LightGBM] [Info] Start training from score 2.740904

--- Generating Predictions for Both Models on Validation Set ---





--- Diagnostic: Comparing Performance on V8's 50 Worst Predictions ---
Analysis of the 50 samples where the V8 (text-only) model performed worst:


Unnamed: 0_level_0,price,v8_pred,v8_error,v15_pred,v15_error,error_diff (v15 - v8)
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
128897,0.98,49.759283,192.27423,35.870735,189.362492,-2.911739
189266,0.68,32.784902,191.872081,25.456094,189.592936,-2.279145
128524,1.99,89.237139,191.274526,58.038272,186.739582,-4.534945
81346,286.77,6.593781,191.009414,4.963803,193.194065,2.184651
189255,1.18,43.965555,189.544929,29.406008,184.568107,-4.976822
229126,2796.0,76.805319,189.305879,105.033939,185.517724,-3.788155
218412,600.59,18.021237,188.347294,21.057492,186.450525,-1.896768
86758,1.68,52.248101,187.538964,36.292188,182.302837,-5.236127
9697,177.51,5.934992,187.058809,6.926769,184.977466,-2.081343
188072,496.28,17.531823,186.351562,25.023422,180.799342,-5.55222



--- Summary on the 50 Most Difficult Samples ---
Number of times V15 (with images) improved the prediction: 38 / 50
Number of times V15 (with images) worsened the prediction: 12 / 50
Average change in SMAPE error after adding images: -4.2227 points

Conclusion: On the most difficult predictions, adding images provided a slight improvement on average.
