In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import re
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack, coo_matrix

warnings.filterwarnings('ignore')

# Custom evaluation metric: SMAPE measured on the un-logged scale.
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error for log1p-transformed values.

    Both arguments are passed through ``np.expm1`` before the error is
    computed, so they are expected to be on the ``log1p`` scale.

    Parameters
    ----------
    y_true : array-like
        Reference values on the log1p scale.
    y_pred : array-like
        Predicted values on the log1p scale.

    Returns
    -------
    tuple
        ``('smape', value, False)`` -- the metric name, the SMAPE as a
        percentage, and ``False`` as the third element (the
        "higher is better" flag in LightGBM's custom-metric convention).
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)

    abs_error = np.abs(predicted - actual)
    mean_magnitude = (np.abs(actual) + np.abs(predicted)) / 2

    # The small epsilon keeps the division finite when both values are zero.
    ratios = abs_error / (mean_magnitude + 1e-8)
    return 'smape', np.mean(ratios) * 100, False

print("Libraries imported.")

Libraries imported.


In [2]:
# Locations of the pre-processed parquet files
PROCESSED_DATA_FOLDER = '../data/processed'
TRAIN_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'train_processed.parquet')
TEST_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'test_processed.parquet')

# Read both splits
train_df, test_df = map(pd.read_parquet, (TRAIN_FILE_PATH, TEST_FILE_PATH))

# Force catalog_content to str in both frames so the text steps below
# always receive strings
for frame in (train_df, test_df):
    frame['catalog_content'] = frame['catalog_content'].astype(str)

print(f"Loaded train data: {train_df.shape}")
print(f"Loaded test data: {test_df.shape}")

Loaded train data: (75000, 48)
Loaded test data: (75000, 46)


In [3]:
print("Extracting brand features...")

# Matches the literal 'Item Name:' followed by optional whitespace and captures
# the first token made of word characters, apostrophes (straight or curly),
# hyphens, dots and ampersands -- e.g. "Judee's". Only that first token is
# captured, so a multi-word brand is reduced to its first word.
brand_regex = re.compile(r'Item Name:\s*([\w\’\'\-\.&]+)')

def extract_brand(text_series):
    """Return the lowercase brand token for every entry of ``text_series``.

    The brand is the first group captured by ``brand_regex``. Entries in
    which the pattern does not match become 'Unknown' before lowercasing,
    so they end up as 'unknown'.
    """
    return (
        text_series.str.extract(brand_regex, expand=False)
        .fillna('Unknown')
        .str.lower()
    )

# --- Derive the lowercase brand column for both splits ---
for split_df in (train_df, test_df):
    split_df['brand'] = extract_brand(split_df['catalog_content'])

# --- Map brand strings to integer ids ---
# LabelEncoder gives every distinct brand string its own integer code.
# The codes are identifiers rather than quantities; the column is intended
# to be treated as a categorical feature by LightGBM.
# The encoder is fitted on the train and test brands together so that
# transform() never meets a label it was not fitted on.
all_brands = pd.concat([train_df['brand'], test_df['brand']])
brand_encoder = LabelEncoder().fit(all_brands)

for split_df in (train_df, test_df):
    split_df['brand_encoded'] = brand_encoder.transform(split_df['brand'])

print("Brand extraction and encoding complete.")
print("Example of extracted brands:")
print(train_df[['brand', 'brand_encoded']].head())

# Report how many distinct brands the encoder learned
n_brands = len(brand_encoder.classes_)
print(f"\nFound {n_brands} unique brands.")

Extracting brand features...
Brand extraction and encoding complete.
Example of extracted brands:
     brand  brand_encoded
0       la           6102
1  salerno           9377
2     bear           1170
3  judee’s           5640
4    kedem           5780

Found 11940 unique brands.


In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val = train_test_split(train_df, random_state=42, test_size=0.2)

# Targets come from the `log_price` column of each split
y_train, y_val = X_train['log_price'], X_val['log_price']

for split_label, split_frame in (("Training", X_train), ("Validation", X_val)):
    print(f"{split_label} data shape: {split_frame.shape}")

Training data shape: (60000, 50)
Validation data shape: (15000, 50)


In [5]:
print("Starting TF-IDF vectorization...")

# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 50,000 terms, float32 output.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
    dtype=np.float32,
)

# The vectorizer is fitted on the training split only; the validation split
# is transformed with that fitted vocabulary.
train_text = X_train['catalog_content']
val_text = X_val['catalog_content']
X_train_text = tfidf_vec.fit_transform(train_text)
X_val_text = tfidf_vec.transform(val_text)

print("Starting TruncatedSVD...")

# Project the sparse TF-IDF matrix onto 300 components, again fitting on the
# training split only.
svd = TruncatedSVD(random_state=42, n_components=300)
X_train_svd = svd.fit_transform(X_train_text)
X_val_svd = svd.transform(X_val_text)

print(f"SVD training matrix shape: {X_train_svd.shape}")

Starting TF-IDF vectorization...
Starting TruncatedSVD...
SVD training matrix shape: (60000, 300)


In [6]:
# Numerical columns: every 'unit_*' column plus the two named measures
numerical_features = [
    name for name in train_df.columns
    if name in ['pack_size', 'total_measure'] or name.startswith('unit_')
]
print(f"Found {len(numerical_features)} numerical features.")

# Numerical block for each split
X_train_num = X_train[numerical_features].values
X_val_num = X_val[numerical_features].values

# Brand id block for each split (double brackets keep it as a 2-D column)
X_train_brand = X_train[['brand_encoded']].values
X_val_brand = X_val[['brand_encoded']].values

# --- Assemble the final matrices ---
# Column order is: SVD components, then numerical features, then the brand id
# as the last column.
print("Combining SVD, numerical, and brand features...")
train_parts = [X_train_svd, X_train_num, X_train_brand]
val_parts = [X_val_svd, X_val_num, X_val_brand]
X_train_final = np.hstack(train_parts)
X_val_final = np.hstack(val_parts)

print(f"Final training feature matrix shape: {X_train_final.shape}")
print(f"Final validation feature matrix shape: {X_val_final.shape}")

Found 44 numerical features.
Combining SVD, numerical, and brand features...
Final training feature matrix shape: (60000, 345)
Final validation feature matrix shape: (15000, 345)


In [8]:
print("Training LightGBM model with BRAND feature...")

# The categorical (brand) column is the last column of the feature matrix,
# so its index is the column count minus one.
categorical_feature_index = X_train_final.shape[1] - 1

# metric='None' is paired with the custom `smape` eval_metric passed to fit()
# below, so the score reported under 'smape' is the one produced by that function.
lgbm_model_brand = lgb.LGBMRegressor(
    metric='None',
    n_estimators=2000,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)

# Stop once the validation score has not improved for 100 rounds, and log the
# score every 100 rounds.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=100),
]

# Fit against the held-out validation set, flagging the brand column as categorical
lgbm_model_brand.fit(
    X_train_final,
    y_train,
    eval_set=[(X_val_final, y_val)],
    eval_metric=smape,
    categorical_feature=[categorical_feature_index],
    callbacks=training_callbacks,
)

best_smape = lgbm_model_brand.best_score_['valid_0']['smape']
print("\nModel training complete.")
print(f"Best SMAPE Score: {best_smape}")

Training LightGBM model with BRAND feature...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.126405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 80176
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 314
[LightGBM] [Info] Start training from score 2.740904
Training until validation scores don't improve for 100 rounds
[100]	valid_0's smape: 55.674
[200]	valid_0's smape: 54.0962
[300]	valid_0's smape: 53.4456
[400]	valid_0's smape: 53.0875
[500]	valid_0's smape: 52.8052
[600]	valid_0's smape: 52.5679
[700]	valid_0's smape: 52.3657
[800]	valid_0's smape: 52.2224
[900]	valid_0's smape: 52.0821
[1000]	valid_0's smape: 52.0135
[1100]	valid_0's smape: 51.9367
[1200]	valid_0's smape: 51.8579
[1300]	valid_0's smape: 51.7947
[1400]	valid_0's smape: 51.7576
[1500]	valid_0's smape: 51.7015
[1600]	valid_0's smape: 51.6523
[1700]	valid_0's smape: 51.6064
[1800]	valid_0