##  Basic Library imports

In [1]:
import os
import pandas as pd 
import numpy as np

##  Read Dataset

In [5]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train_ready.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

In [3]:
from utils import download_images
download_images(sample_test['image_link'], '../images')


100%|██████████| 100/100 [00:56<00:00,  1.77it/s]


In [4]:
assert len(os.listdir('../images')) > 0

In [10]:
train.isnull().sum()

sample_id        0
brand            0
quantity         0
pack_count       0
unit_quantity    0
category         0
price            0
dtype: int64

In [9]:
median_quantity = train['quantity'].median()
train['quantity'].fillna(median_quantity, inplace=True)

# Recompute unit_quantity
train['unit_quantity'] = train['quantity'] / train['pack_count']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['quantity'].fillna(median_quantity, inplace=True)


In [9]:
test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...


In [10]:
sample_test.head()

Unnamed: 0,sample_id,catalog_content,image_link
0,217392,Item Name: Gift Basket Village Gourmet Meat an...,https://m.media-amazon.com/images/I/91GB1wC6Ob...
1,209156,"Item Name: NPG Dried Lotus Seeds 16 Oz, Uncook...",https://m.media-amazon.com/images/I/81VnzF1vkv...
2,262333,Item Name: Annies Homegrown Macaroni and Chees...,https://m.media-amazon.com/images/I/51aCDMHMnI...
3,295979,Item Name: Bear Creek Country Kitchens Creamy ...,https://m.media-amazon.com/images/I/71dzRyLGPi...
4,50604,Item Name: Japanese Kelp Kombu Umami Soup Stoc...,https://m.media-amazon.com/images/I/71Yu21cGwr...


In [13]:
sample_test_out.head()

Unnamed: 0,sample_id,price
0,217392,62.080008
1,209156,17.189763
2,262333,96.50141
3,295979,5.652474
4,50604,23.79478


In [11]:
from sklearn.model_selection import train_test_split

features = ['brand', 'quantity', 'pack_count', 'unit_quantity', 'category']
target = 'price'

X = train[features]
y = train[target]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
!pip install lightgbm optuna

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 12.4 MB/s eta 0:00:00
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
Downloading alembic-1.16.5-py3-none-any.whl (247 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, lightgbm, alembic, optuna

   -------------------------------------

In [20]:
import lightgbm as lgb
import numpy as np

def smape(y_true, y_pred):
    return 100 * np.mean(np.abs(y_pred - y_true) / ((np.abs(y_true) + np.abs(y_pred))/2))

def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'mae',
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
    }
    
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)
    
    model = lgb.train(
        param,
        lgb_train,
        valid_sets=[lgb_val],
        num_boost_round=1000,
        early_stopping_rounds=50,
        callbacks=[lgb.log_evaluation(0)]  # Silence LightGBM logs
    )
    
    y_pred = model.predict(X_val)
    return smape(y_val, y_pred)


In [33]:
import lightgbm as lgb
import pandas as pd

# Use the full training data
lgb_train_full = lgb.Dataset(X, label=y)  # combine X_train + X_val if you split earlier

# Best parameters from Optuna
best_params = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 127,
    'max_depth': 14,
    'learning_rate': 0.04379352982896006,
    'min_data_in_leaf': 23,
    'feature_fraction': 0.8923632231507306,
    'bagging_fraction': 0.9615483834427555,
    'bagging_freq': 5,
    'lambda_l1': 4.056198856745189e-08,
    'lambda_l2': 0.018524857140386312
}

# Callbacks for early stopping (optional if using validation set)
callbacks = [
    lgb.log_evaluation(period=0)  # silence logs
]

# Train model on full data
final_model = lgb.train(
    best_params,
    lgb_train_full,
    num_boost_round=1000,
    callbacks=callbacks
)

import joblib
joblib.dump(final_model, os.path.join(DATASET_FOLDER, 'lgbm_model.pkl'))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 858
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 5
[LightGBM] [Info] Start training from score 23.647654


['../dataset/lgbm_model.pkl']

In [30]:
X_test_sample = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_ready.csv'))
X_test_sample.fillna(median_quantity, inplace=True)
X_test_sample.isnull().sum()

sample_id        0
brand            0
quantity         0
pack_count       0
unit_quantity    0
category         0
dtype: int64

In [32]:

# ---- Load sample test data ----

X_test_sample = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_ready.csv'))
y_test_sample = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

# ---- Align by sample_id ----
X_test_sample = X_test_sample.sort_values('sample_id').reset_index(drop=True)
y_test_sample = y_test_sample.sort_values('sample_id').reset_index(drop=True)

# ---- Extract features (drop sample_id) ----
X_features = X_test_sample.drop(columns=['sample_id'])

# ---- Predict using your trained model ----
# Replace 'model' with your LightGBM trained model variable
y_pred = final_model.predict(X_features)

# ---- SMAPE function ----
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

# ---- Calculate SMAPE ----
y_true = y_test_sample['price'].values
score = smape(y_true, y_pred)

print("SMAPE on sample test:", score)


SMAPE on sample test: 113.543947593682
