In [2]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

# ==============================================================================
# ## 1. Data Preparation
# ==============================================================================
print("--- Step 1: Preparing Data ---")
df = pd.read_csv('input/train.csv')
df = df.dropna(subset=['price'])
df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)', r'(\d+)\s*count']
    for p in patterns:
        m=re.search(p,text)
        if m: return int(m.group(1))
    return 1
df['quantity'] = df['catalog_content'].apply(extract_quantity)

def extract_brand(text):
    match = re.search(r'^(?:brand|manufacturer):\s*(.*)', text, re.IGNORECASE | re.MULTILINE)
    if match and match.group(1): return match.group(1).strip().lower()
    return 'unknown'
df['brand'] = df['catalog_content'].apply(extract_brand)

PRICE_THRESHOLD = 100
df['is_expensive'] = (df['price'] >= PRICE_THRESHOLD).astype(int)
feature_cols = ['catalog_content', 'quantity', 'brand']
X = df[feature_cols]
y_reg = df['price']
y_clf = df['is_expensive']

X_train, X_val, y_reg_train, y_reg_val, y_clf_train, y_clf_val = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# ==============================================================================
# ## 2. Train the "Triage" Classifier
# ==============================================================================
print("\n--- Step 2: Training the Triage Classifier ---")
clf_preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=10000, ngram_range=(1, 2)), 'catalog_content'),
    ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']),
    ('numeric', 'passthrough', ['quantity'])
])
triage_classifier = Pipeline(steps=[
    ('preprocessor', clf_preprocessor),
    ('classifier', lgb.LGBMClassifier(random_state=42))
])
triage_classifier.fit(X_train, y_clf_train)
clf_preds = triage_classifier.predict(X_val)
print(f"Triage Classifier Accuracy: {accuracy_score(y_clf_val, clf_preds):.2%}")

# ==============================================================================
# ## 3. Train the "Specialist" Regressors
# ==============================================================================
print("\n--- Step 3: Training Specialist Regressors ---")
best_params = { 'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'lambda_l1': 0.04, 'lambda_l2': 2.53e-06, 'feature_fraction': 0.73, 'bagging_fraction': 0.81, 'bagging_freq': 5, 'min_child_samples': 9, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }

# --- FIX: Define the preprocessor INSIDE the factory function ---
# This ensures each pipeline gets its own unique copy.
def create_regressor_pipeline():
    reg_preprocessor = ColumnTransformer(transformers=[
        ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
        ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']),
        ('numeric', 'passthrough', ['quantity'])
    ])
    return Pipeline(steps=[('preprocessor', reg_preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])

print("Training Low-Price Expert...")
low_price_expert = create_regressor_pipeline()
low_price_train_df = X_train.join(y_reg_train).query(f'price < {PRICE_THRESHOLD}')
low_price_expert.fit(low_price_train_df[feature_cols], np.log1p(low_price_train_df['price']))

print("Training High-Price Expert...")
high_price_expert = create_regressor_pipeline()
high_price_train_df = X_train.join(y_reg_train).query(f'price >= {PRICE_THRESHOLD}')
high_price_expert.fit(high_price_train_df[feature_cols], np.log1p(high_price_train_df['price']))
print("Specialist models are trained.")

# ==============================================================================
# ## 4. Evaluate the Full V13 Two-Stage System
# ==============================================================================
print("\n--- Step 4: Evaluating the Full V13 System ---")
val_price_segments = triage_classifier.predict(X_val)
X_val_low = X_val[val_price_segments == 0]
X_val_high = X_val[val_price_segments == 1]
final_predictions = pd.Series(index=X_val.index, dtype=float)

if not X_val_low.empty:
    low_preds_log = low_price_expert.predict(X_val_low)
    final_predictions.loc[X_val_low.index] = np.expm1(low_preds_log)
if not X_val_high.empty:
    high_preds_log = high_price_expert.predict(X_val_high)
    final_predictions.loc[X_val_high.index] = np.expm1(high_preds_log)

final_predictions.fillna(0, inplace=True); final_predictions[final_predictions < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100

v13_smape = smape(y_reg_val, final_predictions)

print("\n--- Model Performance Comparison ---")
print(f"Single Model (V8-like) SMAPE: 50.9553")
print(f"Two-Stage Model (V13) SMAPE: {v13_smape:.4f}")

--- Step 1: Preparing Data ---

--- Step 2: Training the Triage Classifier ---
Triage Classifier Accuracy: 97.88%

--- Step 3: Training Specialist Regressors ---
Training Low-Price Expert...
Training High-Price Expert...
Specialist models are trained.

--- Step 4: Evaluating the Full V13 System ---

--- Model Performance Comparison ---
Single Model (V8-like) SMAPE: 50.9553
Two-Stage Model (V13) SMAPE: 51.1317


In [None]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

# ==============================================================================
# ## 1. Data Preparation
# ==============================================================================
print("--- Step 1: Preparing Data ---")
df = pd.read_csv('input/train.csv')
df = df.dropna(subset=['price'])
df['catalog_content'] = df['catalog_content'].astype(str).fillna('')

# Feature Engineering
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)', r'(\d+)\s*count']
    for p in patterns:
        m=re.search(p,text)
        if m: return int(m.group(1))
    return 1
df['quantity'] = df['catalog_content'].apply(extract_quantity)

def extract_brand(text):
    match = re.search(r'^(?:brand|manufacturer):\s*(.*)', text, re.IGNORECASE | re.MULTILINE)
    if match and match.group(1): return match.group(1).strip().lower()
    return 'unknown'
df['brand'] = df['catalog_content'].apply(extract_brand)

PRICE_THRESHOLD = 100
df['is_expensive'] = (df['price'] >= PRICE_THRESHOLD).astype(int)
feature_cols = ['catalog_content', 'quantity', 'brand']
X = df[feature_cols]
y_reg = df['price']
y_clf = df['is_expensive']

X_train, X_val, y_reg_train, y_reg_val, y_clf_train, y_clf_val = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

# ==============================================================================
# ## 2. Train the "Triage" Classifier
# ==============================================================================
print("\n--- Step 2: Training the Triage Classifier ---")
# ... (training code remains the same)
clf_preprocessor = ColumnTransformer(transformers=[('text', TfidfVectorizer(max_features=10000),'catalog_content'), ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']), ('numeric', 'passthrough', ['quantity'])])
triage_classifier = Pipeline(steps=[('preprocessor', clf_preprocessor), ('classifier', lgb.LGBMClassifier(random_state=42))])
triage_classifier.fit(X_train, y_clf_train)
print("Triage Classifier Trained.")

# ==============================================================================
# ## 3. Train the "Specialist" Regressors
# ==============================================================================
print("\n--- Step 3: Training Specialist Regressors ---")
# ... (training code remains the same)
best_params = { 'objective': 'regression_l1', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }
def create_regressor_pipeline():
    reg_preprocessor = ColumnTransformer(transformers=[('text', TfidfVectorizer(max_features=30000),'catalog_content'), ('brand', OneHotEncoder(handle_unknown='ignore', min_frequency=5), ['brand']), ('numeric', 'passthrough', ['quantity'])])
    return Pipeline(steps=[('preprocessor', reg_preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])
low_price_expert = create_regressor_pipeline()
low_price_train_df = X_train.join(y_reg_train).query(f'price < {PRICE_THRESHOLD}')
low_price_expert.fit(low_price_train_df[feature_cols], np.log1p(low_price_train_df['price']))
high_price_expert = create_regressor_pipeline()
high_price_train_df = X_train.join(y_reg_train).query(f'price >= {PRICE_THRESHOLD}')
high_price_expert.fit(high_price_train_df[feature_cols], np.log1p(high_price_train_df['price']))
print("Specialist models are trained.")

# ==============================================================================
# ## 4. Evaluate the Full V13 Two-Stage System
# ==============================================================================
print("\n--- Step 4: Evaluating the Full V13 System ---")
# ... (evaluation code remains the same)
val_price_segments = triage_classifier.predict(X_val)
X_val_low = X_val[val_price_segments == 0]
X_val_high = X_val[val_price_segments == 1]
final_predictions = pd.Series(index=X_val.index, dtype=float)
if not X_val_low.empty: final_predictions.loc[X_val_low.index] = np.expm1(low_price_expert.predict(X_val_low))
if not X_val_high.empty: final_predictions.loc[X_val_high.index] = np.expm1(high_price_expert.predict(X_val_high))
final_predictions.fillna(0, inplace=True); final_predictions[final_predictions < 0] = 0
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true); denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100
v13_smape = smape(y_reg_val, final_predictions)
print(f"Two-Stage Model (V13) SMAPE: {v13_smape:.4f}")

# ==============================================================================
# ## 5. NEW: Detailed Error Analysis of V13 Predictions
# ==============================================================================
print("\n--- Step 5: Performing Detailed Error Analysis on V13 ---")
analysis_df = X_val.copy()
analysis_df['price'] = y_reg_val
analysis_df['predicted_price'] = final_predictions
analysis_df['true_price_category'] = y_clf_val
analysis_df['predicted_price_category'] = val_price_segments
analysis_df['expert_used'] = np.where(analysis_df['predicted_price_category'] == 0, 'Low-Price', 'High-Price')
analysis_df['smape_error'] = smape(analysis_df['price'], analysis_df['predicted_price'])

worst_predictions = analysis_df.sort_values(by='smape_error', ascending=False)

print("\n--- Top 50 Worst Predictions with Diagnostic Info ---")
pd.set_option('display.max_colwidth', 150)
display_cols = ['price', 'predicted_price', 'smape_error', 'true_price_category', 'predicted_price_category', 'expert_used', 'catalog_content']
print(worst_predictions[display_cols].head(50))

--- Step 1: Preparing Data ---

--- Step 2: Training the Triage Classifier ---
Triage Classifier Trained.

--- Step 3: Training Specialist Regressors ---
Specialist models are trained.

--- Step 4: Evaluating the Full V13 System ---
Two-Stage Model (V13) SMAPE: 51.7825

--- Step 5: Performing Detailed Error Analysis on V13 ---

--- Top 50 Worst Predictions with Diagnostic Info ---
         price  predicted_price  smape_error  true_price_category  \
28853   29.990        11.488688    51.782502                    0   
60996  176.490       153.864357    51.782502                    1   
3892     8.950        20.669782    51.782502                    0   
12879    2.580         6.451921    51.782502                    0   
27111   27.940        10.426176    51.782502                    0   
24007   17.600        14.227550    51.782502                    0   
8122    15.850         7.377768    51.782502                    0   
14650    4.970         5.361955    51.782502                    

In [4]:
print("\n--- Top 50 Worst Predictions with Diagnostic Info ---")
pd.set_option('display.max_colwidth', 1000)
display_cols = ['price', 'predicted_price', 'smape_error', 'true_price_category', 'predicted_price_category', 'expert_used', 'catalog_content']
print(worst_predictions[display_cols].head(50))


--- Top 50 Worst Predictions with Diagnostic Info ---
         price  predicted_price  smape_error  true_price_category  \
28853   29.990        11.488688    51.782502                    0   
60996  176.490       153.864357    51.782502                    1   
3892     8.950        20.669782    51.782502                    0   
12879    2.580         6.451921    51.782502                    0   
27111   27.940        10.426176    51.782502                    0   
24007   17.600        14.227550    51.782502                    0   
8122    15.850         7.377768    51.782502                    0   
14650    4.970         5.361955    51.782502                    0   
26448   13.970         7.925297    51.782502                    0   
6166    19.420        12.299703    51.782502                    0   
56325    5.990        11.655672    51.782502                    0   
68186   19.990        20.850944    51.782502                    0   
27739   20.440        22.965935    51.782502    