In [3]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# ==============================================================================
# ## 1. Data Preparation
# ==============================================================================
print("--- Step 1: Preparing Data ---")
df = pd.read_csv('input/train.csv')
# Rows without a price cannot serve as training targets.
df = df.dropna(subset=['price'])
# Fill missing descriptions BEFORE casting to str. On pandas versions where
# astype(str) stringifies NaN, the reverse order turns missing values into the
# literal text 'nan', which fillna('') can no longer catch and which TF-IDF
# would then treat as a real token.
df['catalog_content'] = df['catalog_content'].fillna('').astype(str)

# Feature Engineering
def extract_quantity(text):
    """Return the pack/count quantity mentioned in *text*, defaulting to 1.

    The text is lower-cased and probed with three patterns in priority order:
    ``pack of N``, ``N pack`` / ``N-pack`` and ``N count``.  The first pattern
    that matches anywhere in the text wins, even when a lower-priority
    pattern would match earlier in the string.
    """
    lowered = text.lower()
    quantity_patterns = (
        r'pack of (\d+)',
        r'(\d+)\s*[-]?pack',
        r'(\d+)\s*count',
    )
    # Lazily evaluate the searches so later patterns are only tried when the
    # earlier ones fail.
    hits = (re.search(pattern, lowered) for pattern in quantity_patterns)
    first_hit = next((hit for hit in hits if hit), None)
    if first_hit is None:
        return 1
    return int(first_hit.group(1))
# Derive a numeric quantity feature from each product description
# (extract_quantity returns 1 when no pack/count phrase is found).
df['quantity'] = df['catalog_content'].apply(extract_quantity)
def extract_brand(text):
    """Return the lower-cased brand named in *text*, or ``'unknown'``.

    Scans for lines that start with ``Brand:`` or ``Manufacturer:``
    (case-insensitive) and returns the first non-empty value found, stripped
    of surrounding whitespace.  A label with an empty value is skipped so a
    later label can still supply the brand.
    """
    # Only horizontal whitespace may follow the colon.  A plain `\s*` would
    # also swallow the newline, so an empty "Brand:" line would capture the
    # entire next line as the brand.
    pattern = r'^(?:brand|manufacturer):[^\S\n]*(.*)'
    for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
        brand = match.group(1).strip().lower()
        if brand:
            return brand
    return 'unknown'
# Attach the parsed brand as a categorical feature.
df['brand'] = df['catalog_content'].apply(extract_brand)

# Products priced at or above this value get the positive "expensive" label
# used as the classification target.
PRICE_THRESHOLD = 100
df['is_expensive'] = df['price'].ge(PRICE_THRESHOLD).astype(int)

feature_cols = ['catalog_content', 'quantity', 'brand']
X, y_reg, y_clf = df[feature_cols], df['price'], df['is_expensive']

# One 80/20 split shared by the features and both targets, stratified on the
# expensive/cheap label.
(
    X_train, X_val,
    y_reg_train, y_reg_val,
    y_clf_train, y_clf_val,
) = train_test_split(
    X,
    y_reg,
    y_clf,
    test_size=0.2,
    random_state=42,
    stratify=y_clf,
)

# ==============================================================================
# ## 2. Train Triage Classifier
# ==============================================================================
print("\n--- Step 2: Training the Triage Classifier ---")

# Classifier features: TF-IDF over the description text, one-hot encoded
# brand, and the quantity passed through unchanged.
clf_preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=10000), 'catalog_content'),
        (
            'brand',
            OneHotEncoder(handle_unknown='ignore', min_frequency=5),
            ['brand'],
        ),
        ('numeric', 'passthrough', ['quantity']),
    ]
)
triage_classifier = Pipeline(
    steps=[
        ('preprocessor', clf_preprocessor),
        ('classifier', lgb.LGBMClassifier(random_state=42)),
    ]
)
triage_classifier.fit(X_train, y_clf_train)

# Report hold-out accuracy of the expensive/cheap decision.
triage_val_labels = triage_classifier.predict(X_val)
triage_accuracy = accuracy_score(y_clf_val, triage_val_labels)
print(f"Triage Classifier Accuracy: {triage_accuracy:.2%}")

# ==============================================================================
# ## 3. Train Specialist Regressors (Simplified)
# ==============================================================================
print("\n--- Step 3: Training Specialist Regressors ---")

# LightGBM settings passed to every regressor built by
# create_regressor_pipeline().
best_params = {
    'objective': 'regression_l1',
    'n_estimators': 761,
    'learning_rate': 0.188,
    'num_leaves': 41,
    'max_depth': 17,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}
def create_regressor_pipeline():
    """Build a fresh, unfitted preprocessing + LightGBM regression pipeline.

    The preprocessor applies TF-IDF (``max_features=30000``) to
    ``catalog_content``, one-hot encodes ``brand`` (``min_frequency=5``,
    unknown categories ignored) and passes ``quantity`` through unchanged.
    The regressor is configured from the module-level ``best_params``.

    Returns:
        A new ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    text_branch = ('text', TfidfVectorizer(max_features=30000), 'catalog_content')
    brand_branch = (
        'brand',
        OneHotEncoder(handle_unknown='ignore', min_frequency=5),
        ['brand'],
    )
    quantity_branch = ('numeric', 'passthrough', ['quantity'])

    reg_preprocessor = ColumnTransformer(
        transformers=[text_branch, brand_branch, quantity_branch]
    )
    regressor = lgb.LGBMRegressor(**best_params)
    return Pipeline(
        steps=[('preprocessor', reg_preprocessor), ('regressor', regressor)]
    )

# Each expert is fitted only on training rows from its own side of
# PRICE_THRESHOLD, with log1p(price) as the target.
train_with_price = X_train.join(y_reg_train)

print("Training Low-Price Expert...")
low_price_expert = create_regressor_pipeline()
low_price_train_df = train_with_price[train_with_price['price'] < PRICE_THRESHOLD]
low_price_expert.fit(
    low_price_train_df[feature_cols],
    np.log1p(low_price_train_df['price']),
)

print("Training High-Price Expert (from scratch)...")
high_price_expert = create_regressor_pipeline()
high_price_train_df = train_with_price[train_with_price['price'] >= PRICE_THRESHOLD]
high_price_expert.fit(
    high_price_train_df[feature_cols],
    np.log1p(high_price_train_df['price']),
)
print("Specialist models are ready.")

# ==============================================================================
# ## 4. Evaluate the V14 "Soft Blend" System
# ==============================================================================
print("\n--- Step 4: Evaluating the V14 System ---")

# Columns 0 and 1 of predict_proba are used as the low- and high-price
# blending weights respectively.
classifier_probs = triage_classifier.predict_proba(X_val)
prob_low, prob_high = classifier_probs[:, 0], classifier_probs[:, 1]

# Both experts score every validation row; the weights decide how much each
# one contributes.
low_preds_log = low_price_expert.predict(X_val)
high_preds_log = high_price_expert.predict(X_val)

# Blend in log1p space, then map back to prices and floor negatives at zero.
final_preds_log = prob_low * low_preds_log + prob_high * high_preds_log
blended_prices = np.expm1(final_preds_log)
final_predictions = np.where(blended_prices < 0, 0.0, blended_prices)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|pred - true| / ((|true| + |pred|) / 2)``;
    elements where both values are zero contribute 0 instead of dividing by
    zero.

    Args:
        y_true: Array-like of true values (list, ndarray, Series, ...).
        y_pred: Array-like of predictions, positionally matching *y_true*.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Coerce to plain float arrays so the comparison is positional.  Operating
    # on pandas objects directly would align two Series by index label (giving
    # NaN for non-matching labels), and plain lists would not support the
    # arithmetic at all.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    numerator = np.abs(pred_arr - true_arr)
    denominator = (np.abs(true_arr) + np.abs(pred_arr)) / 2
    # Pre-filled zeros are kept wherever the denominator is zero.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Score the blended predictions against the held-out true prices.
v14_smape = smape(y_reg_val, final_predictions)

# The V8 and V13 figures are hard-coded reference values; only the V14 score
# is computed in this run.
report_lines = [
    "\n--- Model Performance Comparison ---",
    "Single Model (V8-like) SMAPE: 50.9553",
    "Two-Stage Hard-Switch (V13) SMAPE: 51.1317",
    f"Two-Stage Soft-Blend (V14) SMAPE: {v14_smape:.4f}",
]
print("\n".join(report_lines))

--- Step 1: Preparing Data ---

--- Step 2: Training the Triage Classifier ---
Triage Classifier Accuracy: 97.97%

--- Step 3: Training Specialist Regressors ---
Training Low-Price Expert...
Training High-Price Expert (from scratch)...
Specialist models are ready.

--- Step 4: Evaluating the V14 System ---

--- Model Performance Comparison ---
Single Model (V8-like) SMAPE: 50.9553
Two-Stage Hard-Switch (V13) SMAPE: 51.1317
Two-Stage Soft-Blend (V14) SMAPE: 51.8803
