In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
import lightgbm as lgb
import re
import os
from datetime import datetime
import itertools  # For simple grid

# Define SMAPE (symmetric)
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Feature extraction functions (same as before)
def extract_quantity(text):
    match = re.search(r'(?:pack|box|set|bundle|case) of (\d+)', text, re.I)  # Added 'case' for more bulk patterns
    return int(match.group(1)) if match else 1

def extract_numeric(text, pattern):
    match = re.search(pattern, text, re.I)
    return float(match.group(1)) if match else 0

units = {
    'gb': r'(\d+\.?\d*)\s*gb',
    'oz': r'(\d+\.?\d*)\s*oz',
    'inch': r'(\d+\.?\d*)\s*(?:inch|in(?:ch)?)',
    'mp': r'(\d+\.?\d*)\s*mp',
    'lbs': r'(\d+\.?\d*)\s*lbs?',
    'mah': r'(\d+\.?\d*)\s*mah',
    'watts': r'(\d+\.?\d*)\s*w(?:atts?)?'
    # Expand as needed
}

def extract_features(row):
    text = row['catalog_content'].lower()
    feats = {'quantity': extract_quantity(text)}
    for unit, pattern in units.items():
        feats[f'feat_{unit}'] = extract_numeric(text, pattern)
    premiums = ['premium', 'luxury', 'high-end', 'pro', 'ultra', 'elite', 'deluxe', 'professional']
    feats['premium_keyword_count'] = sum(text.count(word) for word in premiums)
    if re.search(r'\bnew\b|\bmint\b|\bbrand new\b', text):
        feats['condition_flag'] = 1
    elif re.search(r'\bused\b|\brefurbished\b|\bpre-owned\b', text):
        feats['condition_flag'] = 0
    else:
        feats['condition_flag'] = 0.5
    title = re.split(r'[.:]\s', text)[0]
    feats['title_length'] = len(title)
    feats['content_word_count'] = len(text.split())
    return pd.Series(feats)

# Load and engineer data
train = pd.read_csv('input/train.csv')
engineered = train.apply(extract_features, axis=1)
train = pd.concat([train, engineered], axis=1)

# Split
X = train.drop('price', axis=1)
y = train['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline
text_transformer = TfidfVectorizer(ngram_range=(1, 2), max_features=40000)
num_cols = ['quantity'] + [f'feat_{u}' for u in units] + ['premium_keyword_count', 'condition_flag', 'title_length', 'content_word_count']
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'catalog_content'),
        ('num', num_transformer, num_cols)
    ])

X_train_pre = preprocessor.fit_transform(X_train)
X_valid_pre = preprocessor.transform(X_valid)

# Sample weights
high_threshold = 100
low_threshold = 10
weights = np.ones_like(y_train)
weights[y_train > high_threshold] = 3
weights[y_train < low_threshold] = 2
dtrain = lgb.Dataset(X_train_pre, label=y_train, weight=weights)
dvalid = lgb.Dataset(X_valid_pre, label=y_valid, reference=dtrain)

# Simple grid search for key params (variance_power, learning_rate; fixed others)
grid = {
    'tweedie_variance_power': [1.2, 1.5, 1.8],
    'learning_rate': [0.02, 0.05, 0.08]
}
best_smape = np.inf
best_params = {}
for params in itertools.product(*grid.values()):
    params_dict = dict(zip(grid.keys(), params))
    params_dict.update({
        'objective': 'tweedie',
        'metric': 'tweedie',
        'num_leaves': 31,  # Fixed from your prior
        'min_data_in_leaf': 20,
        'feature_pre_filter': False,  # Added to fix the error
        'verbose': -1
    })
    model = lgb.train(params_dict, dtrain, num_boost_round=1000, valid_sets=[dvalid], callbacks=[lgb.early_stopping(50)])
    pred = np.maximum(model.predict(X_valid_pre), 1e-6)
    current_smape = smape(y_valid, pred)
    if current_smape < best_smape:
        best_smape = current_smape
        best_params = params_dict

print(f'Best params from grid: {best_params}')
print(f'Best grid SMAPE: {best_smape}')

# Train final with best
model = lgb.train(best_params, dtrain, num_boost_round=1000, valid_sets=[dvalid], callbacks=[lgb.early_stopping(50)])

# Predict and evaluate (same as before)
pred = np.maximum(model.predict(X_valid_pre), 1e-6)
smape_score = smape(y_valid, pred)
print(f'T2 Validation SMAPE: {smape_score}')

# Error Analysis (same as before)
errors = pd.DataFrame({
    'actual': y_valid,
    'pred': pred,
    'diff': pred - y_valid,
    'ape': 2 * 100 * np.abs(pred - y_valid) / (np.abs(y_valid) + np.abs(pred))
})
bins = [0, 10, 50, 100, 500, np.inf]
labels = ['0-10 (Low/Bulk)', '10-50', '50-100', '100-500 (High)', '500+ (Extreme)']
errors['price_bin'] = pd.cut(errors['actual'], bins=bins, labels=labels)
bin_smape = errors.groupby('price_bin')['ape'].mean() / 2
print('SMAPE per bin:\n', bin_smape)
bin_bias = errors.groupby('price_bin')['diff'].mean()
print('Bias (pred - actual) per bin:\n', bin_bias)
bin_var = errors.groupby('price_bin')['diff'].var()
print('Error variance per bin:\n', bin_var)
top_errors = errors.sort_values('ape', ascending=False).head(10)
print('Top 10 worst predictions:\n', top_errors)
if bin_bias.iloc[-2] < 0:
    print('Insight: Still underpredicting highs—try higher tail weights or lower variance_power (e.g., 1.2 for more Poisson-like).')
if bin_smape.iloc[0] > bin_smape.mean():
    print('Insight: Bulk/low errors high—refine quantity regex or add bulk-specific features (e.g., "case of").')

# Save outputs (same as before)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
out_dir = f'output/T2_{timestamp}'
os.makedirs(out_dir, exist_ok=True)
pd.DataFrame({'actual': y_valid, 'pred': pred}).to_csv(os.path.join(out_dir, 'preds_valid.csv'), index=False)
model.save_model(os.path.join(out_dir, 'model.txt'))
with open(os.path.join(out_dir, 'error_analysis.txt'), 'w') as f:
    f.write(f'Validation SMAPE: {smape_score}\n')
    f.write('SMAPE per bin:\n' + str(bin_smape) + '\n')
    f.write('Bias per bin:\n' + str(bin_bias) + '\n')
    f.write('Error variance per bin:\n' + str(bin_var) + '\n')
    f.write('Top 10 worst:\n' + str(top_errors) + '\n')
    f.write(f'Best params: {best_params}\n')

# Inference on test
test = pd.read_csv('input/test.csv')
engineered_test = test.apply(extract_features, axis=1)
test = pd.concat([test, engineered_test], axis=1)
X_test_pre = preprocessor.transform(test)
pred_test = np.maximum(model.predict(X_test_pre), 1e-6)
if 'sample_id' in test.columns:
    submission = pd.DataFrame({'sample_id': test['sample_id'], 'price': pred_test})
    submission.to_csv(os.path.join(out_dir, 'submission.csv'), index=False)
else:
    pd.DataFrame({'pred': pred_test}).to_csv(os.path.join(out_dir, 'preds_test.csv'), index=False)

print(f'Outputs saved to {out_dir}')

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's tweedie: 75.8555
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's tweedie: 75.7234
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[768]	valid_0's tweedie: 75.7508
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's tweedie: 18.3778
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[994]	valid_0's tweedie: 18.349
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[674]	valid_0's tweedie: 18.3489
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's tweedie: 11.3608
Training until validation scor

  bin_smape = errors.groupby('price_bin')['ape'].mean() / 2
  bin_bias = errors.groupby('price_bin')['diff'].mean()
  bin_var = errors.groupby('price_bin')['diff'].var()


Outputs saved to output/T2_20251013_145619
