In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, coo_matrix
import warnings

warnings.filterwarnings('ignore')

def smape(y_true, y_pred):
    """Custom validation metric: SMAPE (in percent) on the un-logged scale.

    Both inputs are mapped back from log space with ``np.expm1`` before the
    symmetric absolute percentage error is averaged over all samples.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets in log space.
    y_pred : array-like
        Predicted targets in log space.

    Returns
    -------
    tuple
        ``('smape', value, False)``: the metric name, the metric value, and
        a flag saying that higher values are NOT better.
    """
    # undo the log transform on both sides
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)

    abs_error = np.abs(predicted - actual)
    half_magnitude = (np.abs(actual) + np.abs(predicted)) / 2

    # the small epsilon keeps the ratio finite when the denominator is zero
    ratios = abs_error / (half_magnitude + 1e-8)

    return 'smape', np.mean(ratios) * 100, False

print("Libraries imported and SMAPE function defined.")

Libraries imported and SMAPE function defined.


In [2]:
# Locations of the pre-processed parquet files
PROCESSED_DATA_FOLDER = '../data/processed'
TRAIN_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'train_processed.parquet')
TEST_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'test_processed.parquet')

# Read each split and immediately cast its text column to str
train_df = pd.read_parquet(TRAIN_FILE_PATH)
train_df['catalog_content'] = train_df['catalog_content'].astype(str)

test_df = pd.read_parquet(TEST_FILE_PATH)
test_df['catalog_content'] = test_df['catalog_content'].astype(str)

# Quick sanity check on what was loaded
print(f"Loaded train data: {train_df.shape}")
print(f"Loaded test data: {test_df.shape}")
train_df.head()

Loaded train data: (75000, 48)
Loaded test data: (75000, 46)


Unnamed: 0,sample_id,catalog_content,price,pack_size,total_measure,unit_,unit_1,unit_2,unit_bag,unit_bottle,...,unit_paper cupcake liners,unit_per package,unit_piece,unit_pouch,unit_pound,unit_pounds,unit_product_weight,unit_sq ft,unit_tea bags,log_price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",4.89,6.0,72.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1.773256
1,198967,"Item Name: Salerno Cookies, The Original Butte...",13.12,4.0,32.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,2.647592
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",1.97,6.0,11.4,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1.088562
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,30.34,1.0,11.25,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,3.444895
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",66.49,1.0,12.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,4.211979


In [3]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

# The regression target is the log-scale price column of each split
y_train, y_val = X_train['log_price'], X_val['log_price']

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

Training data shape: (60000, 48)
Validation data shape: (15000, 48)


In [4]:
print("Starting TF-IDF vectorization...")

# Unigram + bigram TF-IDF over the catalog text, capped at 20,000 terms,
# with English stop words removed and float32 output
tfidf_vec = TfidfVectorizer(
    dtype=np.float32,
    stop_words='english',
    max_features=20000,
    ngram_range=(1, 2),
)

# Learn the vocabulary on the training split only, then reuse it for validation
train_text_column = X_train['catalog_content']
val_text_column = X_val['catalog_content']
X_train_text = tfidf_vec.fit_transform(train_text_column)
X_val_text = tfidf_vec.transform(val_text_column)

print(f"TF-IDF training matrix shape: {X_train_text.shape}")
print(f"TF-IDF validation matrix shape: {X_val_text.shape}")

Starting TF-IDF vectorization...
TF-IDF training matrix shape: (60000, 20000)
TF-IDF validation matrix shape: (15000, 20000)


In [7]:
# Feature columns created earlier: every unit_* column plus the two size columns
numerical_features = [
    col for col in train_df.columns
    if col in ['pack_size', 'total_measure'] or col.startswith('unit_')
]

print(f"Found {len(numerical_features)} numerical features.")
print(numerical_features)

# Select those columns, coerce anything non-numeric to NaN, then fill NaN with 0
X_train_num = (
    X_train[numerical_features]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
X_val_num = (
    X_val[numerical_features]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

print("Dtypes after conversion:")
print(X_train_num.dtypes.value_counts())

# --- Combine features ---
# Horizontally stack [ sparse TF-IDF matrix | numerical matrix ].
# The numerical part is cast to float64 and wrapped in coo_matrix so that
# both pieces handed to hstack are sparse.
print("Combining text and numerical features...")
train_num_sparse = coo_matrix(X_train_num.values.astype(np.float64))
val_num_sparse = coo_matrix(X_val_num.values.astype(np.float64))
X_train_final = hstack([X_train_text, train_num_sparse])
X_val_final = hstack([X_val_text, val_num_sparse])

print(f"Final training feature matrix shape: {X_train_final.shape}")
print(f"Final validation feature matrix shape: {X_val_final.shape}")

Found 44 numerical features.
['pack_size', 'total_measure', 'unit_', 'unit_1', 'unit_2', 'unit_bag', 'unit_bottle', 'unit_box', 'unit_count', 'unit_ct', 'unit_each', 'unit_fl', 'unit_fl ounce', 'unit_fl oz', 'unit_fluid ounce', 'unit_fluid ounces', 'unit_foot', 'unit_gram', 'unit_grams', 'unit_jar', 'unit_k', 'unit_kg', 'unit_lb', 'unit_liters', 'unit_ltr', 'unit_mililitro', 'unit_milliliter', 'unit_millilitre', 'unit_ml', 'unit_none', 'unit_ounce', 'unit_ounces', 'unit_oz', 'unit_pack', 'unit_packs', 'unit_paper cupcake liners', 'unit_per package', 'unit_piece', 'unit_pouch', 'unit_pound', 'unit_pounds', 'unit_product_weight', 'unit_sq ft', 'unit_tea bags']
Dtypes after conversion:
bool       42
float64     2
Name: count, dtype: int64
Combining text and numerical features...
Final training feature matrix shape: (60000, 20044)
Final validation feature matrix shape: (15000, 20044)


In [9]:
print("Training LightGBM model...")

# LightGBM regressor. metric='None' (the string) is passed so that only the
# custom SMAPE function given to fit() is evaluated on the validation set.
lgbm_model = lgb.LGBMRegressor(
    metric='None',
    random_state=42,
    n_jobs=-1,
    learning_rate=0.05,
    n_estimators=2000
)

# Stop once the validation metric has not improved for 100 rounds,
# and log the metric every 100 rounds
training_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=100),
]
validation_sets = [(X_val_final, y_val)]

lgbm_model.fit(
    X_train_final,
    y_train,
    eval_set=validation_sets,
    eval_metric=smape,
    callbacks=training_callbacks
)

print("\nModel training complete.")
print(f"Best SMAPE Score: {lgbm_model.best_score_['valid_0']['smape']}")

Training LightGBM model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.892144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1094040
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 19577
[LightGBM] [Info] Start training from score 2.740904
Training until validation scores don't improve for 100 rounds
[100]	valid_0's smape: 57.6366
[200]	valid_0's smape: 55.6166
[300]	valid_0's smape: 54.6231
[400]	valid_0's smape: 53.9878
[500]	valid_0's smape: 53.5215
[600]	valid_0's smape: 53.1865
[700]	valid_0's smape: 52.9186
[800]	valid_0's smape: 52.7058
[900]	valid_0's smape: 52.4932
[1000]	valid_0's smape: 52.3002
[1100]	valid_0's smape: 52.1864
[1200]	valid_0's smape: 52.0782
[1300]	valid_0's smape: 51.9596
[1400]	valid_0's smape: 51.8765
[1500]	valid_0's smape: 51.7844
[1600]	valid_0's smape: 51.7228
[1700]	valid_0's smape: 51.6653
[1800]	valid_0's smape: 51.6

In [10]:
print("Creating submission file...")

# 1. Vectorize the test text with the already-fitted TF-IDF vectorizer
test_text = tfidf_vec.transform(test_df['catalog_content'])

# 2. Prepare the test numerical features with the same treatment used for the
#    train/validation splits: coerce every column to numeric and replace
#    missing values with 0, so the model sees the encoding it was trained on
#    and no NaN / non-numeric value reaches the feature matrix.
test_num = (
    test_df[numerical_features]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# 3. Combine test features in the same column order as training:
#    [ TF-IDF | numerical ]
X_test_final = hstack([test_text, coo_matrix(test_num.values.astype(np.float64))])

print(f"Final test feature matrix shape: {X_test_final.shape}")

# 4. Make predictions (in log space)
log_price_preds = lgbm_model.predict(X_test_final)

# 5. Convert back to price
price_preds = np.expm1(log_price_preds)

# 6. expm1 is negative for negative log predictions, so floor prices at zero
price_preds = np.clip(price_preds, 0, None)

# 7. Create submission DataFrame
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': price_preds
})

# 8. Save submission file, creating the output folder first so a fresh
#    checkout without a submissions directory does not fail on to_csv
SUBMISSION_FILE_PATH = '../submissions/submission_baseline.csv'
os.makedirs(os.path.dirname(SUBMISSION_FILE_PATH), exist_ok=True)
submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)

print(f"Submission file created at: {SUBMISSION_FILE_PATH}")
submission_df.head()

Creating submission file...
Final test feature matrix shape: (75000, 20044)
Submission file created at: ../submissions/submission_baseline.csv


Unnamed: 0,sample_id,price
0,100179,15.815097
1,245611,17.545864
2,146263,13.338734
3,95658,10.944428
4,36806,34.827407
