In [1]:
# --- 1. SETUP & CONFIGURATION ---
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from scipy.sparse import hstack
import lightgbm as lgb
import os
import re
from tqdm.auto import tqdm

# --- CONFIGURATION ---
INPUT_DIR = 'input/'
OUTPUT_DIR = 'output/'
EXPERIMENT_NAME = 't-1_dynamic_model_router'
CHAMPION_BENCHMARK_SMAPE = 50.43
RANDOM_STATE = 42

os.makedirs(OUTPUT_DIR, exist_ok=True)

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

# --- 2. DATA & EMBEDDING PREP ---
print("--- Loading Data and Pre-processing Embeddings ---")
train_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'), index_col='sample_id')
test_df_full = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'), index_col='sample_id')
with open(os.path.join(INPUT_DIR, 'final_embeddings.pkl'), 'rb') as f:
    image_embeddings_dict = pickle.load(f)

# Create the fast lookup matrix for embeddings
all_df = pd.concat([train_df_full, test_df_full])
max_id = all_df.index.max()
embedding_matrix = np.zeros((max_id + 1, 512), dtype=np.float32)
for sample_id, row in tqdm(all_df.iterrows(), desc="Mapping Embeddings"):
    embedding = image_embeddings_dict.get(row['image_link'])
    if embedding is not None:
        embedding_matrix[sample_id] = embedding

# --- 3. FEATURE ENGINEERING (V16 Pipeline) ---
print("\n--- Engineering V16 Features ---")
train_df = train_df_full.dropna(subset=['catalog_content', 'price', 'image_link']).copy()
train_df['log_price'] = np.log1p(train_df['price'])

# Comprehensive numerical features from V16 summary
units = ['gb', 'oz', 'inch', 'mah', 'count', 'pack', 'mp', 'lbs', 'watts']
for unit in units:
    regex = r'(\d+\.?\d*)\s?' + re.escape(unit)
    train_df[f'feat_{unit}'] = train_df['catalog_content'].str.extract(regex, flags=re.IGNORECASE).astype(float).fillna(0)
train_df['feat_premium_keyword_count'] = train_df['catalog_content'].str.lower().str.count('pro|plus|premium|deluxe')
train_df['feat_word_count'] = train_df['catalog_content'].str.split().str.len()

# --- 4. DATA SPLIT & FEATURE MATRIX CONSTRUCTION ---
print("\n--- Splitting Data and Building Feature Matrices ---")
X_train_df, X_val_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_STATE)
y_train_log, y_val = X_train_df['log_price'], X_val_df['price']

tfidf = TfidfVectorizer(max_features=40000, ngram_range=(1, 2))
numerical_cols = [col for col in train_df.columns if col.startswith('feat_')]
scaler = StandardScaler()

# Create V16 (text-only) features
X_train_tfidf = tfidf.fit_transform(X_train_df['catalog_content'])
X_train_num = scaler.fit_transform(X_train_df[numerical_cols])
X_train_v16 = hstack([X_train_tfidf, X_train_num], format='csr')

# Create V17 (text + image) features
X_train_img = embedding_matrix[X_train_df.index]
X_train_v17 = hstack([X_train_v16, X_train_img], format='csr')

# --- 5. TRAIN THE ROUTER & EXPERTS ---
print("\n--- Training Screener and Expert Models ---")
# Train Screener (fast model)
screener_model = Ridge(random_state=RANDOM_STATE)
screener_model.fit(X_train_tfidf, y_train_log)

# Train V16 Expert (text-only)
expert_v16 = lgb.LGBMRegressor(random_state=RANDOM_STATE)
expert_v16.fit(X_train_v16, y_train_log)

# Train V17 Expert (text + image)
expert_v17 = lgb.LGBMRegressor(random_state=RANDOM_STATE)
expert_v17.fit(X_train_v17, y_train_log)

# --- 6. TUNE THE ROUTING THRESHOLD ---
print("\n--- Tuning the Routing Threshold on Validation Set ---")
# Prepare validation features
X_val_tfidf = tfidf.transform(X_val_df['catalog_content'])
X_val_num = scaler.transform(X_val_df[numerical_cols])
X_val_img = embedding_matrix[X_val_df.index]
X_val_v16 = hstack([X_val_tfidf, X_val_num], format='csr')
X_val_v17 = hstack([X_val_v16, X_val_img], format='csr')

# Get predictions from screener and both experts
screener_preds_log = screener_model.predict(X_val_tfidf)
screener_preds = np.expm1(screener_preds_log)
expert_v16_preds = np.expm1(expert_v16.predict(X_val_v16))
expert_v17_preds = np.expm1(expert_v17.predict(X_val_v17))

best_smape = float('inf')
best_threshold = 0
thresholds_to_test = np.arange(50, 151, 5) # Test thresholds from $50 to $150

for threshold in tqdm(thresholds_to_test, desc="Tuning Threshold"):
    # Apply routing logic
    final_preds = np.where(screener_preds >= threshold, expert_v17_preds, expert_v16_preds)
    current_smape = smape(y_val, final_preds)
    if current_smape < best_smape:
        best_smape = current_smape
        best_threshold = threshold

print(f"\nOptimal Routing Threshold found: ${best_threshold}")
print(f"V16 (Text-Only) SMAPE: {smape(y_val, expert_v16_preds):.4f}")
print(f"V17 (Text+Image) SMAPE: {smape(y_val, expert_v17_preds):.4f}")
print(f"T-1 Dynamic Router SMAPE: {best_smape:.4f}")

# --- 7. FINAL EVALUATION & CONCLUSION ---
improvement = CHAMPION_BENCHMARK_SMAPE - best_smape
if improvement > 0:
    print(f"\n✅ Success. The Dynamic Model Router improved the SMAPE by {improvement:.4f} points.")
else:
    print(f"\n❌ Failure. The Dynamic Model Router did not improve upon the best single model.")

# --- 8. SAVE THE FINAL ARTIFACTS ---
print("\n--- Saving T-1 Model Artifacts ---")
artifacts = {
    'screener_model': screener_model,
    'expert_v16_model': expert_v16,
    'expert_v17_model': expert_v17,
    'tfidf_vectorizer': tfidf,
    'numerical_scaler': scaler,
    'numerical_columns': numerical_cols,
    'routing_threshold': best_threshold,
    'version': EXPERIMENT_NAME
}
output_path = os.path.join(OUTPUT_DIR, f"{EXPERIMENT_NAME}.pkl")
with open(output_path, 'wb') as f:
    pickle.dump(artifacts, f)
print(f"All T-1 model components saved to: {output_path}")

--- Loading Data and Pre-processing Embeddings ---


Mapping Embeddings: 0it [00:00, ?it/s]


--- Engineering V16 Features ---

--- Splitting Data and Building Feature Matrices ---

--- Training Screener and Expert Models ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 6.733310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1780064
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 38631
[LightGBM] [Info] Start training from score 2.740904
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 7.221616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1910624
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 39143
[LightGBM] [Info] Start training from score 2.740904

--- Tuning the Routing Threshold on Validation Set ---




Tuning Threshold:   0%|          | 0/21 [00:00<?, ?it/s]


Optimal Routing Threshold found: $55
V16 (Text-Only) SMAPE: 55.8953
V17 (Text+Image) SMAPE: 55.1005
T-1 Dynamic Router SMAPE: 55.9075

❌ Failure. The Dynamic Model Router did not improve upon the best single model.

--- Saving T-1 Model Artifacts ---
All T-1 model components saved to: output/t-1_dynamic_model_router.pkl
