In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import re
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack, coo_matrix

warnings.filterwarnings('ignore')


def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, usable as a LightGBM eval metric.

    Both inputs are mapped through ``np.expm1`` before scoring, so the error
    is measured on the un-logged scale rather than on the values passed in.

    Parameters
    ----------
    y_true : array-like
        Reference targets (inverse-transformed with ``expm1`` here).
    y_pred : array-like
        Model outputs (inverse-transformed with ``expm1`` here).

    Returns
    -------
    tuple
        ``('smape', value, False)``: metric name, score in percent, and the
        is-higher-better flag (False, i.e. lower is better).
    """
    actual, forecast = np.expm1(y_true), np.expm1(y_pred)
    # The 1e-8 term keeps the ratio finite when both values are exactly zero.
    per_row = np.abs(forecast - actual) / ((np.abs(actual) + np.abs(forecast)) / 2 + 1e-8)
    return 'smape', np.mean(per_row) * 100, False


print("Libraries imported.")

Libraries imported.


In [2]:
# Locations of the pre-processed parquet files
PROCESSED_DATA_FOLDER = '../data/processed'
TRAIN_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'train_processed.parquet')
TEST_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'test_processed.parquet')

# Read both splits into memory
train_df = pd.read_parquet(TRAIN_FILE_PATH)
test_df = pd.read_parquet(TEST_FILE_PATH)

# The text column is cast to str in both frames so the text steps below
# always receive strings.
for frame in (train_df, test_df):
    frame['catalog_content'] = frame['catalog_content'].astype(str)

print(f"Loaded train data: {train_df.shape}")
print(f"Loaded test data: {test_df.shape}")

Loaded train data: (75000, 48)
Loaded test data: (75000, 46)


In [3]:
print("Extracting brand features...")

# The first token after "Item Name:" is used as the brand.
# Allowed characters: word characters, ASCII apostrophe, hyphen, dot, ampersand,
# and the typographic apostrophe U+2019 (as in "L'Oreal" written with a curly quote).
# The original class held a mis-decoded form of U+2019 instead of the character
# itself, which cut such brands off at the apostrophe. U+20AC and U+2122 are kept
# so that text containing the same mis-decoded sequence still matches as before.
brand_regex = re.compile(r"Item Name:\s*([\w\u2019\u20ac\u2122'\-.&]+)")

def extract_brand(text_series):
    """Extract a lower-cased brand token from catalog text.

    Parameters
    ----------
    text_series : pandas.Series of str
        Catalog text; the brand is taken as the first token following
        "Item Name:" according to ``brand_regex``.

    Returns
    -------
    pandas.Series
        Lower-cased brand per row; rows without a match yield 'unknown'.
    """
    # str.extract is documented to take a pattern string, so pass .pattern
    # (the regex is compiled without flags, so the result is the same).
    brands = text_series.str.extract(brand_regex.pattern, expand=False)
    return brands.fillna('Unknown').str.lower()

# --- Derive the brand column for each split ---
train_df['brand'] = extract_brand(train_df['catalog_content'])
test_df['brand'] = extract_brand(test_df['catalog_content'])

# --- Integer-encode brands ---
# The encoder is fitted on the union of train and test brands, so transform()
# never meets a label it has not seen.
all_brands = pd.concat([train_df['brand'], test_df['brand']])
brand_encoder = LabelEncoder().fit(all_brands)

for frame in (train_df, test_df):
    frame['brand_encoded'] = brand_encoder.transform(frame['brand'])

print("Brand extraction and encoding complete.")
print(f"Found {len(brand_encoder.classes_)} unique brands.")

Extracting brand features...
Brand extraction and encoding complete.
Found 11940 unique brands.


In [4]:
# Hold out 20% of the rows for validation; the seed is fixed for repeatability
X_train, X_val = train_test_split(train_df, test_size=0.2, random_state=42)

# Targets are taken from the 'log_price' column of each split
y_train, y_val = X_train['log_price'], X_val['log_price']

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

Training data shape: (60000, 50)
Validation data shape: (15000, 50)


In [5]:
print("Starting TF-IDF vectorization...")

# Vectorizer settings: unigrams + bigrams, vocabulary capped at 20,000 terms,
# English stop words removed, float32 output.
tfidf_params = dict(
    ngram_range=(1, 2),
    max_features=20000,
    stop_words='english',
    dtype=np.float32,
)
tfidf_vec = TfidfVectorizer(**tfidf_params)

# The vocabulary is learned from the training text only; the validation text
# is projected onto that fitted vocabulary.
X_train_text = tfidf_vec.fit_transform(X_train['catalog_content'])
X_val_text = tfidf_vec.transform(X_val['catalog_content'])

print(f"TF-IDF training matrix shape: {X_train_text.shape}")
print(f"TF-IDF validation matrix shape: {X_val_text.shape}")

Starting TF-IDF vectorization...
TF-IDF training matrix shape: (60000, 20000)
TF-IDF validation matrix shape: (15000, 20000)


In [8]:
# Numeric columns: every 'unit_*' column plus the two size measures
numerical_features = [
    col for col in train_df.columns
    if col in ('pack_size', 'total_measure') or col.startswith('unit_')
]
print(f"Found {len(numerical_features)} numerical features.")

# Dense float32 blocks for the numeric columns of each split
X_train_num, X_val_num = (
    split[numerical_features].values.astype(np.float32)
    for split in (X_train, X_val)
)

# Single-column float32 blocks holding the integer brand codes
X_train_brand, X_val_brand = (
    split[['brand_encoded']].values.astype(np.float32)
    for split in (X_train, X_val)
)

# --- Assemble the final design matrices ---
# Column layout: [ TF-IDF (sparse) | numeric features | brand code ].
# The dense blocks are wrapped in coo_matrix so scipy's hstack can join them
# with the sparse TF-IDF block; the brand code ends up as the last column.
print("Combining TF-IDF, numerical, and brand features...")
X_train_final = hstack([X_train_text, coo_matrix(X_train_num), coo_matrix(X_train_brand)])
X_val_final = hstack([X_val_text, coo_matrix(X_val_num), coo_matrix(X_val_brand)])

# Expected width: TF-IDF columns + len(numerical_features) + 1
print(f"Final training feature matrix shape: {X_train_final.shape}")
print(f"Final validation feature matrix shape: {X_val_final.shape}")

Found 44 numerical features.
Combining TF-IDF, numerical, and brand features...
Final training feature matrix shape: (60000, 20045)
Final validation feature matrix shape: (15000, 20045)


In [9]:
print("Training LightGBM model with TF-IDF + BRAND features...")

# Take the column count from the assembled matrix instead of hard-coding the
# TF-IDF width: the vocabulary can be smaller than max_features, and the
# earlier hard-coded value was overwritten immediately anyway.
total_cols = X_train_final.shape[1]
categorical_feature_index = total_cols - 1  # Last column is brand_encoded

print(f"Categorical feature index (brand): {categorical_feature_index}")

lgbm_model_final = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
    metric='None'  # Disable built-in metrics; only the custom SMAPE is evaluated
)

lgbm_model_final.fit(
    X_train_final,
    y_train,
    eval_set=[(X_val_final, y_val)],
    eval_metric=smape,
    # Treat the brand code column as categorical rather than ordinal
    categorical_feature=[categorical_feature_index],
    callbacks=[
        # Stop once validation SMAPE has not improved for 100 rounds
        lgb.early_stopping(stopping_rounds=100),
        # Log the validation metric every 100 boosting rounds
        lgb.log_evaluation(period=100)
    ]
)

print("\nModel training complete.")
print(f"Best SMAPE Score: {lgbm_model_final.best_score_['valid_0']['smape']}")

Training LightGBM model with TF-IDF + BRAND features...
Categorical feature index (brand): 20044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 3.276701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1097364
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 19578
[LightGBM] [Info] Start training from score 2.740904
Training until validation scores don't improve for 100 rounds
[100]	valid_0's smape: 54.4345
[200]	valid_0's smape: 52.9315
[300]	valid_0's smape: 52.3579
[400]	valid_0's smape: 51.9415
[500]	valid_0's smape: 51.7018
[600]	valid_0's smape: 51.5045
[700]	valid_0's smape: 51.3683
[800]	valid_0's smape: 51.2235
[900]	valid_0's smape: 51.0936
[1000]	valid_0's smape: 50.9884
[1100]	valid_0's smape: 50.8877
[1200]	valid_0's smape: 50.7801
[1300]	valid_0's smape: 50.6973
[1400]	valid_0's smape