Our system will work as follows:

Train our champion V8 model as usual to get a base_prediction.

Extract the quantity and category features from the text.

Apply a set of corrective rules to the base_prediction based on the features.

The rules will be designed to fix the two error types we know exist:

Rule 1 (Fixing Under-prediction): The "Bulk Item" Correction. If the model predicts a very low price for an item that has a large quantity and does not seem to be a cheap grocery item, we hypothesize the model has predicted the unit price. We will correct this by multiplying by the quantity.

Rule 2 (Fixing Over-prediction): The "Grocery Case" Correction. If the model predicts a very high price for an item that has a large quantity and seems to be a cheap grocery item, we hypothesize the model has predicted the case price when the label is the unit price. We will correct this by dividing by the quantity.

In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load Data and Engineer All V8 Features ---
train_df = pd.read_csv('input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*count', r'(\d+)\s*per case', r'(\d+)\s*pk'];
    for p in patterns:
        if m:=re.search(p,text): return int(m.group(1))
    return 1
def extract_numerical_features(text):
    text = text.lower(); features = {}; unit_map = {'gb': ['gb'],'oz': ['oz', 'ounce'],'inch': ['inch', '"'],'mp': ['mp'],'lbs': ['lb', 'lbs']}
    for fn, u in unit_map.items():
        if m := re.search(fr'(\d+\.?\d*)\s*(?:{"|".join(u)})', text): features[f'feat_{fn}'] = float(m.group(1))
    return features
numerical_features_df = pd.json_normalize(train_df['catalog_content'].apply(extract_numerical_features))
train_df = pd.concat([train_df.reset_index(drop=True), numerical_features_df], axis=1)
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
def categorize_product(text):
    if any(k in text.lower() for k in ['coffee', 'tea', 'snack', 'candy', 'gum', 'soda', 'sauce']): return 'grocery'
    return 'non-grocery'
train_df['category'] = train_df['catalog_content'].apply(categorize_product)
print("Features created.")

# --- 2. Create Hold-Out Set ---
numerical_cols = [col for col in train_df.columns if col.startswith('feat_')]
X = train_df[['catalog_content', 'quantity', 'category'] + numerical_cols]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)

# --- 3. Train the V8 Model ---
print("\nTraining the base V8 model...")
best_params = { 'objective': 'regression_l1', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)), ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
    ('numeric', numeric_pipeline, ['quantity'] + numerical_cols)
], remainder='drop')
pipeline_v8 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])
pipeline_v8.fit(X_train, y_train_log)
base_preds_log = pipeline_v8.predict(X_val)
base_preds = np.expm1(base_preds_log)
base_preds[base_preds < 0] = 0

# --- 4. Apply the Post-Processing Rule Engine ---
print("Applying post-processing rule engine...")
analysis_df = X_val.copy()
analysis_df['base_prediction'] = base_preds

def apply_price_rules(row):
    # Rule 1: Fix Under-prediction for expensive-seeming bulk items
    if row['quantity'] > 1 and row['base_prediction'] < 20 and row['category'] != 'grocery':
        return row['base_prediction'] * row['quantity']
    # Rule 2: Fix Over-prediction for cheap grocery bulk items
    if row['quantity'] > 1 and row['base_prediction'] > 50 and row['category'] == 'grocery':
        return row['base_prediction'] / row['quantity']
    # If no rule applies, keep the original prediction
    return row['base_prediction']

analysis_df['corrected_prediction'] = analysis_df.apply(apply_price_rules, axis=1)

# --- 5. Evaluate and Compare ---
def smape(y_true, y_pred):
    num = np.abs(y_pred - y_true); den = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(num, den, out=np.zeros_like(num, dtype=float), where=den!=0)) * 100

base_smape = smape(y_val, analysis_df['base_prediction'])
corrected_smape = smape(y_val, analysis_df['corrected_prediction'])

print("\n--- Model Performance Comparison ---")
print(f"V8 Model SMAPE (before rules): {base_smape:.4f}")
print(f"V17 Model SMAPE (after rules): {corrected_smape:.4f}")

Features created.

Training the base V8 model...
Applying post-processing rule engine...

--- Model Performance Comparison ---
V8 Model SMAPE (before rules): 51.1591
V17 Model SMAPE (after rules): 61.6941
