In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. Load Data & Base Feature Functions ---
train_df = pd.read_csv('/Users/adityasharma/Github Projects/Amazon/input/train.csv')
train_df = train_df.dropna(subset=['price'])
train_df['catalog_content'] = train_df['catalog_content'].astype(str).fillna('')

def extract_quantity(text):
    text = text.lower()
    patterns = [r'pack of (\d+)', r'(\d+)\s*[-]?pack', r'(\d+)\s*pk', r'(\d+)\s*per case', r'case of (\d+)', r'(\d+)\s*count']
    for pattern in patterns:
        match = re.search(pattern, text)
        if match: return int(match.group(1))
    return 1

# --- 2. NEW: Systematic Numerical Feature Extraction ---
def extract_numerical_features(text):
    text = text.lower()
    features = {}
    # Define units and their corresponding feature names
    unit_map = {
        'gb': ['gb', 'gigabytes'],
        'oz': ['oz', 'ounce', 'ounces'],
        'inch': ['inch', 'inches', '"'],
        'mp': ['mp', 'megapixel'],
        'mah': ['mah'],
        'hz': ['hz', 'hertz'],
        'watts': ['watts', 'w'],
        'lbs': ['lb', 'lbs', 'pound', 'pounds']
    }
    
    for feature_name, units in unit_map.items():
        # Create a regex pattern for this unit group: e.g., (\d+\.?\d*)\s*(gb|gigabytes)
        pattern = re.compile(fr'(\d+\.?\d*)\s*(?:{"|".join(units)})')
        match = pattern.search(text)
        if match:
            features[f'feat_{feature_name}'] = float(match.group(1))
    return features

# Apply the function and create new columns
print("Extracting numerical features...")
numerical_features_df = pd.json_normalize(train_df['catalog_content'].apply(extract_numerical_features))
train_df = pd.concat([train_df.reset_index(drop=True), numerical_features_df], axis=1)
train_df['quantity'] = train_df['catalog_content'].apply(extract_quantity)
print("Numerical features created.")

# --- 3. Create Hold-Out Set ---
numerical_cols = [col for col in train_df.columns if col.startswith('feat_')]
feature_cols = ['catalog_content', 'quantity'] + numerical_cols
X = train_df[feature_cols]
y = train_df['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)

# --- 4. Build the Hybrid V8 Pipeline ---
best_params = { 'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 761, 'learning_rate': 0.188, 'num_leaves': 41, 'max_depth': 17, 'lambda_l1': 0.04, 'lambda_l2': 2.53e-06, 'feature_fraction': 0.73, 'bagging_fraction': 0.81, 'bagging_freq': 5, 'min_child_samples': 9, 'random_state': 42, 'n_jobs': -1, 'verbose': -1 }

# Define a pipeline for numerical features: impute missing values then scale
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(stop_words='english', max_features=30000, ngram_range=(1, 2)), 'catalog_content'),
        ('numeric', numeric_pipeline, ['quantity'] + numerical_cols)
    ],
    remainder='drop'
)

pipeline_v8 = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', lgb.LGBMRegressor(**best_params))])

# --- 5. Train and Evaluate the V8 Model ---
print("\nTraining V8 model (V5 + numerical features)...")
pipeline_v8.fit(X_train, y_train_log)

print("\nEvaluating V8 model...")
val_preds_log = pipeline_v8.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(np.divide(numerator, denominator, out=np.zeros_like(numerator, dtype=float), where=denominator!=0)) * 100

v8_smape = smape(y_val, val_preds)

print("\n--- Model Performance Comparison ---")
print(f"V5 Model SMAPE (text + quantity): 51.5754")
print(f"V8 Model SMAPE (text + quantity + numericals): {v8_smape:.4f}")

Extracting numerical features...
Numerical features created.

Training V8 model (V5 + numerical features)...

Evaluating V8 model...

--- Model Performance Comparison ---
V5 Model SMAPE (text + quantity): 51.5754
V8 Model SMAPE (text + quantity + numericals): 50.9553
