Building the Two-Stage Model
This is a multi-step process. We will build and train the three necessary models (one classifier, two regressors) and then combine them into a single prediction pipeline.


In [2]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer # <-- FIX: Added missing import
import warnings
warnings.filterwarnings("ignore")

# ==============================================================================
# ## 1. Data Preparation
# ==============================================================================
print("--- Step 1: Preparing Data ---")
df = pd.read_csv('input/train.csv')
df = df.dropna(subset=['price'])
df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

# --- Feature Engineering ---
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)', r'(\d+)\s*count']
    for p in patterns:
        m=re.search(p,text)
        if m: return int(m.group(1))
    return 1
df['quantity'] = df['catalog_content'].apply(extract_quantity)

def extract_brand(text):
    match = re.search(r'^(?:brand|manufacturer):\s*(.*)', text, re.IGNORECASE | re.MULTILINE)
    if match and match.group(1): return match.group(1).strip().lower()
    return 'unknown'
df['brand'] = df['catalog_content'].apply(extract_brand)

# --- Define the new classification target and split the data ---
PRICE_THRESHOLD = 100
df['is_expensive'] = (df['price'] >= PRICE_THRESHOLD).astype(int)

# --- Define feature sets ---
feature_cols = ['catalog_content', 'quantity', 'brand']
X = df[feature_cols]
y_reg = df['price']
y_clf = df['is_expensive']

# Create a final hold-out set for the entire system, stratified by price category
X_train, X_val, y_reg_train, y_reg_val, y_clf_train, y_clf_val = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# ==============================================================================
# ## 2. Train the "Triage" Classifier
# ==============================================================================
print("\n--- Step 2: Training the Triage Classifier ---")
clf_preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2)), 'catalog_content'),
    ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']),
    ('numeric', 'passthrough', ['quantity'])
])
triage_classifier = Pipeline(steps=[
    ('preprocessor', clf_preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])
triage_classifier.fit(X_train, y_clf_train)

# Evaluate the classifier's performance on the hold-out set
clf_preds = triage_classifier.predict(X_val)
print(f"Triage Classifier Accuracy: {accuracy_score(y_clf_val, clf_preds):.2%}")
print("Confusion Matrix:\n", confusion_matrix(y_clf_val, clf_preds))


# ==============================================================================
# ## 3. Train the "Specialist" Regressors
# ==============================================================================
print("\n--- Step 3: Training Specialist Regressors ---")
best_params = { 'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'lambda_l1': 0.04, 'lambda_l2': 2.53e-06, 'feature_fraction': 0.73, 'bagging_fraction': 0.81, 'bagging_freq': 5, 'min_child_samples': 9, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }
reg_preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
    ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']),
    ('numeric', 'passthrough', ['quantity'])
])
create_regressor_pipeline = lambda: Pipeline(steps=[('preprocessor', reg_preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])

print("Training Low-Price Expert...")
low_price_expert = create_regressor_pipeline()
low_price_train_df = X_train.join(y_reg_train).query(f'price < {PRICE_THRESHOLD}')
low_price_expert.fit(low_price_train_df[feature_cols], np.log1p(low_price_train_df['price']))

print("Training High-Price Expert...")
high_price_expert = create_regressor_pipeline()
high_price_train_df = X_train.join(y_reg_train).query(f'price >= {PRICE_THRESHOLD}')
high_price_expert.fit(high_price_train_df[feature_cols], np.log1p(high_price_train_df['price']))
print("Specialist models are trained.")

# ==============================================================================
# ## 4. Evaluate the Full V13 Two-Stage System
# ==============================================================================
print("\n--- Step 4: Evaluating the Full V13 System ---")
val_price_segments = triage_classifier.predict(X_val)
X_val_low = X_val[val_price_segments == 0]
X_val_high = X_val[val_price_segments == 1]

final_predictions = pd.Series(index=X_val.index, dtype=float)

if not X_val_low.empty:
    low_preds_log = low_price_expert.predict(X_val_low)
    final_predictions.loc[X_val_low.index] = np.expm1(low_preds_log)
if not X_val_high.empty:
    high_preds_log = high_price_expert.predict(X_val_high)
    final_predictions.loc[X_val_high.index] = np.expm1(high_preds_log)

final_predictions.fillna(0, inplace=True)
final_predictions[final_predictions < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100

v13_smape = smape(y_reg_val, final_predictions)

print("\n--- Model Performance Comparison ---")
print(f"Single Model (V8-like) SMAPE: 50.9553")
print(f"Two-Stage Model (V13) SMAPE: {v13_smape:.4f}")

--- Step 1: Preparing Data ---

--- Step 2: Training the Triage Classifier ---
[LightGBM] [Info] Number of positive: 1514, number of negative: 58486
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.862272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 906686
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 9979
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.025233 -> initscore=-3.654032
[LightGBM] [Info] Start training from score -3.654032
Triage Classifier Accuracy: 97.88%
Confusion Matrix:
 [[14587    34]
 [  284    95]]

--- Step 3: Training Specialist Regressors ---
Training Low-Price Expert...
Training High-Price Expert...
Specialist models are trained.

--- Step 4: Evaluating the Full V13 System ---

--- Model Performance Comparison ---
Single Model (V8-like) SMAPE: 50.9553
Two-Stage Mod