In [1]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
train_df = pd.read_csv('input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
train_df['title'] = train_df['catalog_content'].apply(lambda t: (m.group(1).strip() if (m := re.search(r'^item name:\s*(.*)', t, re.I | re.M)) else t))

# --- 2. V16 Comprehensive Feature Engineering ---
print("Executing V16 Feature Engineering...")

# --- Base Feature: quantity ---
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*count', r'(\d+)\s*per case', r'(\d+)\s*pk', r'(\d+)\s*ct', r'(\d+)\s*ea']
    for p in patterns:
        if m:=re.search(p,text): return int(m.group(1))
    return 1
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)

# --- Feature Set 1: Expanded Numerical Extraction ---
def extract_all_numerical_features(text):
    text = text.lower(); features = {}
    unit_map = {
        'gb': ['gb', 'gigabyte'], 'oz': ['oz', 'ounce'], 'inch': ['inch', '"'], 'mp': ['mp', 'megapixel'],
        'lbs': ['lb', 'lbs', 'pound'], 'mah': ['mah'], 'watts': ['watts', 'w'], 'volts': ['v', 'volt'],
        'cm': ['cm'], 'mm': ['mm'], 'fl_oz': ['fl oz', 'fluid ounce'], 'mg': ['mg'], 'g': ['g', 'gram'],
        'count': ['count'], 'servings': ['servings'], 'sheets': ['sheets'], 'capsules': ['capsules']
    }
    for fname, units in unit_map.items():
        if m := re.search(fr'(\d+\.?\d*)\s*(?:{"|".join(units)})', text):
            features[f'feat_{fname}'] = float(m.group(1))
    return features
numerical_features_df = pd.json_normalize(train_df['catalog_content'].apply(extract_all_numerical_features))
train_df = pd.concat([train_df.reset_index(drop=True), numerical_features_df], axis=1)

# --- Feature Set 2: Expanded "Premium" & "Condition" Keywords ---
PREMIUM_KEYWORDS = ['solid wood', 'leather', 'gold', 'cashmere', 'oled', '4k', 'uhd', 'stainless steel', 'crystal', 'gourmet', 'artisan', 'handcrafted', 'organic']
CONDITION_KEYWORDS = ['refurbished', 'used', 'open-box', 'pre-owned']
train_df['premium_keyword_count'] = train_df['catalog_content'].apply(lambda t: sum(k in t.lower() for k in PREMIUM_KEYWORDS))
train_df['condition_flag'] = train_df['catalog_content'].apply(lambda t: 1 if any(k in t.lower() for k in CONDITION_KEYWORDS) else 0)

# --- Feature Set 3: Text Statistics ---
train_df['title_length'] = train_df['title'].str.len().fillna(0)
train_df['content_word_count'] = train_df['catalog_content'].str.split().str.len().fillna(0)
print("All new features created.")

# --- 3. Create Hold-Out Set ---
numerical_cols = [col for col in train_df.columns if col.startswith('feat_')]
all_engineered_cols = ['quantity', 'premium_keyword_count', 'condition_flag', 'title_length', 'content_word_count'] + numerical_cols
X = train_df[['catalog_content'] + all_engineered_cols]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)

# --- 4. Build and Train the V16 Pipeline ---
best_params = { 'objective': 'regression_l1', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)), ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=40000, ngram_range=(1, 2)), 'catalog_content'),
    ('numeric', numeric_pipeline, all_engineered_cols)
])
pipeline_v16 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])
print("\nTraining V16 model...")
pipeline_v16.fit(X_train, y_train_log)

# --- 5. Evaluate the V16 Model ---
print("\nEvaluating V16 model...")
val_preds_log = pipeline_v16.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0
def smape(y_true, y_pred):
    num = np.abs(y_pred - y_true); den = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(num, den, out=np.zeros_like(num, dtype=float), where=den!=0)) * 100
v16_smape = smape(y_val, val_preds)

print("\n--- Model Performance Comparison ---")
print(f"V8 Model SMAPE (previous champion): 50.9553")
print(f"V16 Model SMAPE (comprehensive FE): {v16_smape:.4f}")

# --- 6. Error Analysis of V16 ---
print("\n--- Top 50 Worst Predictions from V16 Model ---")
analysis_df = X_val.copy()
analysis_df['price'] = y_val
analysis_df['predicted_price'] = val_preds
analysis_df['smape_error'] = smape(analysis_df['price'], analysis_df['predicted_price'])
worst_predictions = analysis_df.sort_values(by='smape_error', ascending=False)
pd.set_option('display.max_colwidth', 200)
print(worst_predictions[['price', 'predicted_price', 'smape_error', 'catalog_content']].head(50))

Executing V16 Feature Engineering...
All new features created.

Training V16 model...

Evaluating V16 model...

--- Model Performance Comparison ---
V8 Model SMAPE (previous champion): 50.9553
V16 Model SMAPE (comprehensive FE): 50.4310

--- Top 50 Worst Predictions from V16 Model ---
         price  predicted_price  smape_error  \
26837   12.195        13.286542    50.430959   
56393   13.275         9.916119    50.430959   
61958    6.080         5.064664    50.430959   
61231   86.990        18.259314    50.430959   
6310     6.240         7.038325    50.430959   
10118    3.980        11.228441    50.430959   
14187   47.990        41.420492    50.430959   
22719   13.805        13.830897    50.430959   
69276   41.500        16.996625    50.430959   
38456   14.220        13.631195    50.430959   
26818   21.000        20.903630    50.430959   
51820  295.000        72.387183    50.430959   
20397   35.995        17.343791    50.430959   
55086    9.990        10.443947    50.4309

In [2]:
# This script loads the V16 model and correctly analyzes its errors.
# We assume the V16 model was trained and saved as 'lgbm_price_model_v16.joblib'
# If not, we will retrain it here for the analysis.

import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import joblib

# (Paste all the feature engineering functions from V16 here)
# ...

# Load data and create all V16 features...
# ... (Full V16 data prep)

# Create the same Hold-Out Set
# ... (train_test_split as in V16)

# Load or retrain the V16 pipeline
try:
    pipeline_v16 = joblib.load('lgbm_price_model_v16.joblib')
    print("Loaded V16 model from file.")
except FileNotFoundError:
    print("V16 model not found, retraining...")
    # (Paste the V16 pipeline definition and .fit() call here)
    # ...

# Make predictions on the validation set
val_preds_log = pipeline_v16.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0

# --- FIX: Use the per-sample error calculation ---
def individual_smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0) * 100

analysis_df = X_val.copy()
analysis_df['price'] = y_val
analysis_df['predicted_price'] = val_preds
analysis_df['smape_error'] = individual_smape(analysis_df['price'], analysis_df['predicted_price'])

worst_predictions = analysis_df.sort_values(by='smape_error', ascending=False)

print("\n--- Top 50 Worst Predictions from V16 Model (Corrected) ---")
pd.set_option('display.max_colwidth', 200)
print(worst_predictions[['price', 'predicted_price', 'smape_error', 'catalog_content']].head(50))

V16 model not found, retraining...

--- Top 50 Worst Predictions from V16 Model (Corrected) ---
          price  predicted_price  smape_error  \
9273      1.990       122.050108   193.582721   
59934     0.980        59.900306   193.561136   
33685     0.680        34.586089   192.287208   
24856     1.180        59.930621   192.276302   
58617  2796.000        56.773403   192.039550   
38615     0.530        25.340689   191.805398   
18709   286.770         6.940905   190.547297   
30774   143.300         3.491971   190.484571   
47257   496.280        13.550221   189.368837   
28165     1.680        59.574483   189.029374   
22215     0.435        13.915794   187.875235   
6213    390.980        12.245774   187.852191   
74619   177.510         6.234516   186.427860   
49014     1.390        38.855887   186.184924   
59073     0.500        13.701287   185.916769   
36871    43.920         1.633836   185.653585   
26873     1.915        50.651975   185.428113   
2218    328.600       

In [None]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# --- 1. Load Data ---
train_df = pd.read_csv('input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')
train_df['title'] = train_df['catalog_content'].apply(lambda t: (m.group(1).strip() if (m := re.search(r'^item name:\s*(.*)', t, re.I | re.M)) else t))

# --- 2. V16 Comprehensive Feature Engineering ---
print("Executing V16 Feature Engineering...")

# --- Base Feature: quantity ---
def extract_quantity(text):
    text = text.lower(); patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*count', r'(\d+)\s*per case', r'(\d+)\s*pk', r'(\d+)\s*ct', r'(\d+)\s*ea']
    for p in patterns:
        if m:=re.search(p,text): return int(m.group(1))
    return 1
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)

# --- Feature Set 1: Expanded Numerical Extraction ---
def extract_all_numerical_features(text):
    text = text.lower(); features = {}
    unit_map = {
        'gb': ['gb', 'gigabyte'], 'oz': ['oz', 'ounce'], 'inch': ['inch', '"'], 'mp': ['mp', 'megapixel'],
        'lbs': ['lb', 'lbs', 'pound'], 'mah': ['mah'], 'watts': ['watts', 'w'], 'volts': ['v', 'volt'],
        'cm': ['cm'], 'mm': ['mm'], 'fl_oz': ['fl oz', 'fluid ounce'], 'mg': ['mg'], 'g': ['g', 'gram'],
        'count': ['count'], 'servings': ['servings'], 'sheets': ['sheets'], 'capsules': ['capsules']
    }
    for fname, units in unit_map.items():
        if m := re.search(fr'(\d+\.?\d*)\s*(?:{"|".join(units)})', text):
            features[f'feat_{fname}'] = float(m.group(1))
    return features
numerical_features_df = pd.json_normalize(train_df['catalog_content'].apply(extract_all_numerical_features))
train_df = pd.concat([train_df.reset_index(drop=True), numerical_features_df], axis=1)

# --- Feature Set 2: Expanded "Premium" & "Condition" Keywords ---
PREMIUM_KEYWORDS = ['solid wood', 'leather', 'gold', 'cashmere', 'oled', '4k', 'uhd', 'stainless steel', 'crystal', 'gourmet', 'artisan', 'handcrafted', 'organic']
CONDITION_KEYWORDS = ['refurbished', 'used', 'open-box', 'pre-owned']
train_df['premium_keyword_count'] = train_df['catalog_content'].apply(lambda t: sum(k in t.lower() for k in PREMIUM_KEYWORDS))
train_df['condition_flag'] = train_df['catalog_content'].apply(lambda t: 1 if any(k in t.lower() for k in CONDITION_KEYWORDS) else 0)

# --- Feature Set 3: Text Statistics ---
train_df['title_length'] = train_df['title'].str.len().fillna(0)
train_df['content_word_count'] = train_df['catalog_content'].str.split().str.len().fillna(0)
print("All new features created.")

# --- 3. Create Hold-Out Set ---
numerical_cols = [col for col in train_df.columns if col.startswith('feat_')]
all_engineered_cols = ['quantity', 'premium_keyword_count', 'condition_flag', 'title_length', 'content_word_count'] + numerical_cols
X = train_df[['catalog_content'] + all_engineered_cols]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)

# --- 4. Build and Train the V16 Pipeline ---
best_params = { 'objective': 'regression_l1', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }
numeric_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0)), ('scaler', StandardScaler())])
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(stop_words='english', max_features=40000, ngram_range=(1, 2)), 'catalog_content'),
    ('numeric', numeric_pipeline, all_engineered_cols)
])
pipeline_v16 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])
print("\nTraining V16 model...")
pipeline_v16.fit(X_train, y_train_log)

# --- 5. Evaluate the V16 Model ---
print("\nEvaluating V16 model...")
val_preds_log = pipeline_v16.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0
def smape(y_true, y_pred):
    num = np.abs(y_pred - y_true); den = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(num, den, out=np.zeros_like(num, dtype=float), where=den!=0)) * 100
v16_smape = smape(y_val, val_preds)

print("\n--- Model Performance Comparison ---")
print(f"V8 Model SMAPE (previous champion): 50.9553")
print(f"V16 Model SMAPE (comprehensive FE): {v16_smape:.4f}")

# --- 6. Error Analysis of V16 ---
print("\n--- Top 50 Worst Predictions from V16 Model ---")
analysis_df = X_val.copy()
analysis_df['price'] = y_val
analysis_df['predicted_price'] = val_preds
analysis_df['smape_error'] = smape(analysis_df['price'], analysis_df['predicted_price'])
worst_predictions = analysis_df.sort_values(by='smape_error', ascending=False)
pd.set_option('display.max_colwidth', 200)
print(worst_predictions[['price', 'predicted_price', 'smape_error', 'catalog_content']].head(50))

# --- 7. Process Test Data and Generate Submission ---
print("\nLoading and processing test data...")
test_df = pd.read_csv('input/test.csv')
test_df['catalog_content'] = test_df['catalog_content'].astype(str).fillna('')
test_df['title'] = test_df['catalog_content'].apply(lambda t: (m.group(1).strip() if (m := re.search(r'^item name:\s*(.*)', t, re.I | re.M)) else t))

# Apply same feature engineering to test
test_df['quantity'] = test_df['catalog_content'].apply(extract_quantity)
numerical_features_test_df = pd.json_normalize(test_df['catalog_content'].apply(extract_all_numerical_features))
test_df = pd.concat([test_df.reset_index(drop=True), numerical_features_test_df], axis=1)
test_df['premium_keyword_count'] = test_df['catalog_content'].apply(lambda t: sum(k in t.lower() for k in PREMIUM_KEYWORDS))
test_df['condition_flag'] = test_df['catalog_content'].apply(lambda t: 1 if any(k in t.lower() for k in CONDITION_KEYWORDS) else 0)
test_df['title_length'] = test_df['title'].str.len().fillna(0)
test_df['content_word_count'] = test_df['catalog_content'].str.split().str.len().fillna(0)

# Ensure same numerical columns as in train
for col in numerical_cols:
    if col not in test_df.columns:
        test_df[col] = np.nan

X_test = test_df[['catalog_content'] + all_engineered_cols]

print("\nPredicting on test data...")
test_preds_log = pipeline_v16.predict(X_test)
test_preds = np.expm1(test_preds_log)
test_preds[test_preds < 0] = 0

submission = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': test_preds})
submission.to_csv('output/submission_v16.csv', index=False)
print("\nSubmission file created at: output/submission_v16.csv")

Executing V16 Feature Engineering...
All new features created.

Training V16 model...

Evaluating V16 model...

--- Model Performance Comparison ---
V8 Model SMAPE (previous champion): 50.9553
V16 Model SMAPE (comprehensive FE): 50.4310

--- Top 50 Worst Predictions from V16 Model ---
         price  predicted_price  smape_error  \
26837   12.195        13.286542    50.430959   
56393   13.275         9.916119    50.430959   
61958    6.080         5.064664    50.430959   
61231   86.990        18.259314    50.430959   
6310     6.240         7.038325    50.430959   
10118    3.980        11.228441    50.430959   
14187   47.990        41.420492    50.430959   
22719   13.805        13.830897    50.430959   
69276   41.500        16.996625    50.430959   
38456   14.220        13.631195    50.430959   
26818   21.000        20.903630    50.430959   
51820  295.000        72.387183    50.430959   
20397   35.995        17.343791    50.430959   
55086    9.990        10.443947    50.4309

: 