Instead of one model for everything, we'll build a system:

Stage 1 (The Router): A simple, fast classifier will first look at a product and predict if it's "low-price" (e.g., < $100) or "high-price" (e.g., >= $100).

Stage 2 (The Experts): We will train two separate, highly specialized LightGBM regressors:

One expert trained only on low-priced items.

Another expert trained only on high-priced items.

During prediction, the router first decides which price category the product belongs to, and then passes it to the appropriate expert for a precise price prediction. This forces our system to learn the distinct patterns of high-value items.

In [3]:
# ==============================================================================
# FINAL V14 SCRIPT: MIXTURE OF EXPERTS FOR PRICE PREDICTION
# ==============================================================================

# --- 1. SETUP & CONFIGURATION ---
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
import lightgbm as lgb
import os
import re
from tqdm.auto import tqdm

# --- CONFIGURATION ---
# Justification: Centralized parameters for easy modification and reproducibility.
INPUT_DIR = 'input/'
OUTPUT_DIR = 'output/'
MODEL_VERSION = 'embeddings_model_v14'
PRICE_THRESHOLD = 100.0  # Threshold to distinguish low vs. high price items
RANDOM_STATE = 42

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- 2. HELPER FUNCTION ---
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error"""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100


# --- 3. OPTIMIZED EMBEDDING PRE-PROCESSING ---
print("--- [Step 1/9] Pre-processing embeddings for fast lookup ---")
# Justification: This one-time process avoids slow, repeated lookups later.
# We create a matrix where the row index directly corresponds to the sample_id.
try:
    train_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='sample_id')
    test_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'), index_col='sample_id')
    with open(os.path.join(INPUT_DIR, 'final_embeddings.pkl'), 'rb') as f:
        image_embeddings_dict = pickle.load(f)
except FileNotFoundError as e:
    print(f"Error loading files: {e}. Please check your INPUT_DIR path.")
    exit()

all_df = pd.concat([train_df_full, test_df_full])
max_id = all_df.index.max()
embedding_dim = 512  # From our CLIP model
embedding_matrix = np.zeros((max_id + 1, embedding_dim), dtype=np.float32)

for sample_id, row in tqdm(all_df.iterrows(), total=len(all_df), desc="Mapping Embeddings"):
    embedding = image_embeddings_dict.get(row['image_link'])
    if embedding is not None:
        embedding_matrix[sample_id] = embedding
print("Embedding matrix created for fast indexing.")


# --- 4. DATA LOADING & FEATURE ENGINEERING ---
print("\n--- [Step 2/9] Loading Data and Engineering Features ---")
train_df = train_df_full.dropna(subset=['catalog_content', 'price', 'image_link']).copy()
train_df['is_high_price'] = (train_df['price'] >= PRICE_THRESHOLD).astype(int)
train_df['log_price'] = np.log1p(train_df['price'])

units = ['gb', 'oz', 'inch', 'mah', 'count', 'pack']
for unit in units:
    regex = r'(\d+\.?\d*)\s?' + re.escape(unit)
    train_df[f'feat_{unit}'] = train_df['catalog_content'].str.extract(regex, flags=re.IGNORECASE).astype(float).fillna(0)


# --- 5. TRAIN/VALIDATION SPLIT ---
print("\n--- [Step 3/9] Splitting Data into Train/Validation Sets ---")
X_train_df, X_val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=train_df['is_high_price']
)


# --- 6. FEATURE MATRIX CONSTRUCTION ---
print("\n--- [Step 4/9] Constructing Feature Matrices ---")
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1, 2))
scaler = StandardScaler()
numerical_cols = [f'feat_{unit}' for unit in units]

# Fit on training data
X_train_tfidf = tfidf.fit_transform(X_train_df['catalog_content'])
X_train_num = scaler.fit_transform(X_train_df[numerical_cols])
X_train_img = embedding_matrix[X_train_df.index] # Fast lookup
X_train_full = hstack([X_train_tfidf, X_train_num, X_train_img], format='csr')

# Transform validation data
X_val_tfidf = tfidf.transform(X_val_df['catalog_content'])
X_val_num = scaler.transform(X_val_df[numerical_cols])
X_val_img = embedding_matrix[X_val_df.index] # Fast lookup
X_val_full = hstack([X_val_tfidf, X_val_num, X_val_img], format='csr')


# --- 7. STAGE 1: TRAIN THE ROUTER (CLASSIFIER) ---
print("\n--- [Step 5/9] Training Stage 1: Price Category Classifier ---")
router_model = LogisticRegression(random_state=RANDOM_STATE, solver='liblinear', class_weight='balanced')
router_model.fit(X_train_tfidf, X_train_df['is_high_price'])


# --- 8. STAGE 2: TRAIN THE EXPERTS (REGRESSORS) ---
print("\n--- [Step 6/9] Training Stage 2: Expert Regressors ---")
low_price_indices_train = X_train_df[X_train_df['is_high_price'] == 0].index
high_price_indices_train = X_train_df[X_train_df['is_high_price'] == 1].index

# Train Low-Price Expert
print("Training low-price expert...")
lgbm_low_price = lgb.LGBMRegressor(random_state=RANDOM_STATE)
lgbm_low_price.fit(X_train_full[X_train_df.index.isin(low_price_indices_train)], X_train_df.loc[low_price_indices_train, 'log_price'])

# Train High-Price Expert
# Justification: Using more estimators for the smaller high-price dataset can help it learn more complex patterns.
print("Training high-price expert...")
lgbm_high_price = lgb.LGBMRegressor(random_state=RANDOM_STATE, n_estimators=200, learning_rate=0.05)
lgbm_high_price.fit(X_train_full[X_train_df.index.isin(high_price_indices_train)], X_train_df.loc[high_price_indices_train, 'log_price'])


# --- 9. EVALUATION ON VALIDATION SET ---
print("\n--- [Step 7/9] Evaluating V14 Mixture of Experts Model ---")
val_categories = router_model.predict(X_val_tfidf)
val_predictions = np.zeros(len(X_val_df))

low_mask = (val_categories == 0)
high_mask = (val_categories == 1)

if low_mask.any():
    val_predictions[low_mask] = lgbm_low_price.predict(X_val_full[low_mask])
if high_mask.any():
    val_predictions[high_mask] = lgbm_high_price.predict(X_val_full[high_mask])

final_preds = np.expm1(val_predictions)
final_preds[final_preds < 0] = 0
v14_smape = smape(X_val_df['price'], final_preds)

print(f"V14 (Mixture of Experts) SMAPE: {v14_smape:.4f}")


# --- 10. SAVE MODEL ARTIFACTS ---
print("\n--- [Step 8/9] Saving V14 Model Artifacts ---")
artifacts = {
    'router': router_model,
    'expert_low_price': lgbm_low_price,
    'expert_high_price': lgbm_high_price,
    'tfidf_vectorizer': tfidf,
    'numerical_scaler': scaler,
    'numerical_columns': numerical_cols,
    'price_threshold': PRICE_THRESHOLD,
    'version': MODEL_VERSION
}
output_path = os.path.join(OUTPUT_DIR, f"{MODEL_VERSION}.pkl")
with open(output_path, 'wb') as f:
    pickle.dump(artifacts, f)
print(f"All V14 model components saved to: {output_path}")


# --- 11. GENERATE SUBMISSION FILE ---
print("\n--- [Step 9/9] Generating Final Submission File ---")
# This section shows how you would use the saved artifacts to predict on the test set.
# Prepare test features
test_df = test_df_full.dropna(subset=['catalog_content', 'image_link']).copy()
for unit in units:
    regex = r'(\d+\.?\d*)\s?' + re.escape(unit)
    test_df[f'feat_{unit}'] = test_df['catalog_content'].str.extract(regex, flags=re.IGNORECASE).astype(float).fillna(0)

X_test_tfidf = tfidf.transform(test_df['catalog_content'])
X_test_num = scaler.transform(test_df[numerical_cols])
X_test_img = embedding_matrix[test_df.index]
X_test_full = hstack([X_test_tfidf, X_test_num, X_test_img], format='csr')

# Predict categories and route to experts
test_categories = router_model.predict(X_test_tfidf)
test_predictions_log = np.zeros(len(test_df))
low_mask_test = (test_categories == 0)
high_mask_test = (test_categories == 1)

if low_mask_test.any():
    test_predictions_log[low_mask_test] = lgbm_low_price.predict(X_test_full[low_mask_test])
if high_mask_test.any():
    test_predictions_log[high_mask_test] = lgbm_high_price.predict(X_test_full[high_mask_test])

# Create submission DataFrame
submission_df = pd.DataFrame(index=test_df.index)
submission_df['price'] = np.expm1(test_predictions_log)
submission_df['price'] = submission_df['price'].clip(0) # Ensure no negative prices
submission_df.to_csv('submission.csv')

print("\nSubmission.csv created successfully.")

--- [Step 1/9] Pre-processing embeddings for fast lookup ---


Mapping Embeddings:   0%|          | 0/150000 [00:00<?, ?it/s]

Embedding matrix created for fast indexing.

--- [Step 2/9] Loading Data and Engineering Features ---

--- [Step 3/9] Splitting Data into Train/Validation Sets ---

--- [Step 4/9] Constructing Feature Matrices ---

--- [Step 5/9] Training Stage 1: Price Category Classifier ---

--- [Step 6/9] Training Stage 2: Expert Regressors ---
Training low-price expert...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.547821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1761914
[LightGBM] [Info] Number of data points in the train set: 58486, number of used features: 29989
[LightGBM] [Info] Start training from score 2.681081
Training high-price expert...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185561
[LightGBM] [Info] Number of data points in the train set: 151



All V14 model components saved to: output/embeddings_model_v14.pkl

--- [Step 9/9] Generating Final Submission File ---





Submission.csv created successfully.


In [4]:
# --- V15 DIAGNOSTIC TEST: SINGLE MODEL WITH ALL FEATURES ---

print("\n--- [Diagnostic] Testing a Single LGBM on V8 + Image Features ---")

# We already have the feature matrices from the V14 script:
# X_train_full (hstack of TF-IDF, numericals, and images)
# X_val_full (hstack of TF-IDF, numericals, and images)
# y_train_log (the log-transformed price from the train set)
# X_val_df['price'] (the true price for the validation set)

# Train a single, standard LightGBM Regressor
single_lgbm_model = lgb.LGBMRegressor(random_state=RANDOM_STATE)
print("Training a single LightGBM model on all features...")
single_lgbm_model.fit(X_train_full, X_train_df['log_price'])

# Evaluate the single model
print("Evaluating the single model...")
preds_log = single_lgbm_model.predict(X_val_full)
preds = np.expm1(preds_log)
preds[preds < 0] = 0

v15_smape = smape(X_val_df['price'], final_preds)

print(f"\n--- Ablation Study Results ---")
print(f"V8 SMAPE (TF-IDF + Numericals only): 50.96")
print(f"V14 SMAPE (MoE on All Features): 56.97")
print(f"V15 SMAPE (Single LGBM on All Features): {v15_smape:.4f}")

if v15_smape < 52:
     print("\nConclusion: The Mixture of Experts architecture was the primary cause of failure.")
else:
     print("\nConclusion: The image embeddings themselves are adding noise and degrading performance.")


--- [Diagnostic] Testing a Single LGBM on V8 + Image Features ---
Training a single LightGBM model on all features...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.460737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1794068
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 30038
[LightGBM] [Info] Start training from score 2.739783
Evaluating the single model...

--- Ablation Study Results ---
V8 SMAPE (TF-IDF + Numericals only): 50.96
V14 SMAPE (MoE on All Features): 56.97
V15 SMAPE (Single LGBM on All Features): 56.9696

Conclusion: The image embeddings themselves are adding noise and degrading performance.




In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Running Error Diagnostic ---")

# 1. Make predictions on the validation set (if not already done)
# Ensure you have 'preds' from the V15 experiment
try:
    preds
except NameError:
    print("Predictions not found. Rerunning prediction step...")
    preds_log = single_lgbm_model.predict(X_val_full)
    preds = np.expm1(preds_log)
    preds[preds < 0] = 0

# 2. Create an Error Analysis DataFrame
error_df = X_val_df[['catalog_content', 'price']].copy()
error_df['predicted_price'] = preds

# Calculate the individual SMAPE component for each prediction
numerator = np.abs(error_df['predicted_price'] - error_df['price'])
denominator = (np.abs(error_df['price']) + np.abs(error_df['predicted_price'])) / 2
error_df['smape_error'] = (numerator / (denominator + 1e-8)) * 100 # Add epsilon for stability

# 3. Group by Price Brackets
# Define the bins for our price ranges
price_bins = [0, 10, 25, 50, 100, error_df['price'].max() + 1]
bin_labels = ['0-10', '10-25', '25-50', '50-100', '100+']
error_df['price_bracket'] = pd.cut(error_df['price'], bins=price_bins, labels=bin_labels, right=False)

# 4. Analyze the Error per Bracket
# Calculate the mean SMAPE for each group
error_summary = error_df.groupby('price_bracket')['smape_error'].mean().reset_index()
error_summary.rename(columns={'smape_error': 'average_smape'}, inplace=True)

# Also count the number of samples in each bracket
error_summary['sample_count'] = error_df.groupby('price_bracket').size().values

print("\n--- Error Analysis Summary ---")
print(error_summary)

# 5. Visualize the Results
print("\n--- Visualizing Error Distribution ---")
plt.figure(figsize=(12, 6))
sns.barplot(x='price_bracket', y='average_smape', data=error_summary, palette='viridis')
plt.title('Average SMAPE per Price Bracket', fontsize=16)
plt.xlabel('True Price Bracket ($)', fontsize=12)
plt.ylabel('Average SMAPE', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Display some examples of the worst predictions in the high-price bracket
print("\n--- Examples of High-Error Predictions in the '100+' Bracket ---")
high_price_errors = error_df[error_df['price_bracket'] == '100+'].sort_values(by='smape_error', ascending=False)
pd.set_option('display.max_colwidth', 200)
display(high_price_errors[['price', 'predicted_price', 'smape_error', 'catalog_content']].head(10))

ModuleNotFoundError: No module named 'matplotlib'

: 