In [11]:
#!pip install sentence-transformers lightgbm torchvision optuna tqdm --quiet



import pandas as pd

import numpy as np

import gc

import re, os, requests, torch, json

from tqdm import tqdm

from PIL import Image

from io import BytesIO

from sentence_transformers import SentenceTransformer

from torchvision import models, transforms

from sklearn.model_selection import train_test_split

import lightgbm as lgb

import optuna

In [12]:
import pandas as pd
KAGGLE_INPUT_PATH = '/kaggle/input/sample-dataset'
KAGGLE_WORKING_PATH = '/kaggle/working/'

try:
    train_df = pd.read_csv(os.path.join(KAGGLE_INPUT_PATH, 'Sample_train.csv'))
    test_df  = pd.read_csv(os.path.join(KAGGLE_INPUT_PATH, 'sample_test.csv'))
    print("Train shape:", train_df.shape)
    print("Test shape:", test_df.shape)
except FileNotFoundError:
    raise FileNotFoundError(f"Please ensure Sample_train.csv and Sample_test.csv are in the dataset at: {KAGGLE_INPUT_PATH}")

Train shape: (1000, 4)
Test shape: (100, 3)


In [13]:
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)
    return text.lower().strip()

def extract_ipq(text):
    if pd.isnull(text): return 1
    text = text.lower()
    patterns = [
        r'pack of (\d+)', r'(\d+)\s*count', r'set of (\d+)',
        r'(\d+)\s*pack', r'(\d+)\s*(pcs|pieces|units?)'
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return int(match.group(1))
    return 1

def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + 1e-8)) * 100

def lgb_smape(y_true, y_pred):
    return 'smape', smape(y_true, y_pred), False  # lower = better

for df in [train_df, test_df]:
    df['clean_text'] = df['catalog_content'].apply(clean_text)
    df['ipq'] = df['catalog_content'].apply(extract_ipq)

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

model_text = SentenceTransformer('all-MiniLM-L6-v2')

resnet = models.resnet50(pretrained=True)
resnet = torch.nn.Sequential(*list(resnet.children())[:-1]).eval().to(device)
img_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

def extract_image_features(url):
    try:
        response = requests.get(url, timeout=10)
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img_t = img_transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            features = resnet(img_t).cpu().numpy().flatten()
        return features
    except Exception:
        return np.zeros(2048)

def get_or_create_embeddings(df, text_model, image_extractor, prefix):
    # Files will be saved to the persistent /kaggle/working/ directory
    text_file = os.path.join(KAGGLE_WORKING_PATH, f"{prefix}_text_embeds.npy")
    img_file = os.path.join(KAGGLE_WORKING_PATH, f"{prefix}_img_embeds.npy")

    if os.path.exists(text_file):
        print(f"Loading {text_file}")
        text_embeds = np.load(text_file)
    else:
        print(f"Encoding text for {prefix} ...")
        text_embeds = text_model.encode(df['clean_text'].tolist(), show_progress_bar=True)
        np.save(text_file, text_embeds)

    if os.path.exists(img_file):
        print(f"Loading {img_file}")
        img_embeds = np.load(img_file)
    else:
        print(f"Extracting images for {prefix} ...")
        img_feats = [image_extractor(url) for url in tqdm(df['image_link'], desc=f"{prefix} images")]
        img_embeds = np.vstack(img_feats)
        np.save(img_file, img_embeds)
        # Clean up to save memory
        del img_feats
        gc.collect()

    return text_embeds, img_embeds

train_text_embeds, train_img_embeds = get_or_create_embeddings(train_df, model_text, extract_image_features, "train")
test_text_embeds,  test_img_embeds  = get_or_create_embeddings(test_df,  model_text, extract_image_features, "test")

Using device: cpu
Loading /kaggle/working/train_text_embeds.npy
Extracting images for train ...




train images:   0%|          | 0/1000 [00:00<?, ?it/s][A[A

train images:   0%|          | 1/1000 [00:00<04:15,  3.91it/s][A[A

train images:   0%|          | 2/1000 [00:00<04:26,  3.74it/s][A[A

train images:   0%|          | 3/1000 [00:00<04:10,  3.98it/s][A[A

train images:   0%|          | 4/1000 [00:00<03:59,  4.15it/s][A[A

train images:   0%|          | 5/1000 [00:01<04:17,  3.86it/s][A[A

train images:   1%|          | 6/1000 [00:01<04:48,  3.44it/s][A[A

train images:   1%|          | 7/1000 [00:01<04:44,  3.49it/s][A[A

train images:   1%|          | 8/1000 [00:02<04:55,  3.36it/s][A[A

train images:   1%|          | 9/1000 [00:02<04:31,  3.65it/s][A[A

train images:   1%|          | 10/1000 [00:02<04:17,  3.85it/s][A[A

train images:   1%|          | 11/1000 [00:02<04:07,  3.99it/s][A[A

train images:   1%|          | 12/1000 [00:03<04:14,  3.88it/s][A[A

train images:   1%|▏         | 13/1000 [00:03<04:05,  4.02it/s][A[A

train images:   1%|▏  

Encoding text for test ...





Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Extracting images for test ...


test images: 100%|██████████| 100/100 [00:31<00:00,  3.17it/s]


In [16]:
train_ipq = train_df['ipq'].values.reshape(-1, 1)
test_ipq  = test_df['ipq'].values.reshape(-1, 1)

X_train_all = np.hstack([train_text_embeds, train_img_embeds, train_ipq])
X_test_all  = np.hstack([test_text_embeds,  test_img_embeds,  test_ipq])
y = train_df['price'].values

print("Final Feature Shape:", X_train_all.shape)


Final Feature Shape: (1000, 2433)


In [17]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train_all, y, test_size=0.15, random_state=42
)

In [18]:
optuna_dir = os.path.join(KAGGLE_WORKING_PATH, 'optuna_studies')
os.makedirs(optuna_dir, exist_ok=True)
optuna_db_path = os.path.join(optuna_dir, 'product_price_tuning.db')

storage_name = f"sqlite:///{optuna_db_path}"
study_name = "lgbm_price_prediction_mae"

try:
    study = optuna.load_study(study_name=study_name, storage=storage_name)
    print(" Loaded existing Optuna study.")
except KeyError:
    study = optuna.create_study(direction='minimize', study_name=study_name, storage=storage_name)
    print(" New Optuna study created.")

[I 2025-10-14 13:10:17,058] A new study created in RDB with name: lgbm_price_prediction_mae


 New Optuna study created.


In [29]:
def objective(trial):
    params = {
        'objective': 'mae',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 200),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'n_estimators': 3000,
        'random_state': 42,
        'n_jobs': -1
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric=lgb_smape,
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    y_pred = model.predict(X_val)
    y_pred[y_pred < 0] = 0
    print(smape(y_val, y_pred))
    return smape(y_val, y_pred)
    


In [30]:
study.optimize(objective, n_trials=25, show_progress_bar=True)

best_params_file = os.path.join(optuna_dir, 'best_params.json')
with open(best_params_file, 'w') as f:
    json.dump(study.best_trial.params, f, indent=4)
print(f" Best parameters saved to {best_params_file}")


  0%|          | 0/25 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.131592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:08<04:34,  3.11it/s][A

67.06504700017538
[I 2025-10-14 13:27:17,538] Trial 25 finished with value: 67.06504700017538 and parameters: {'learning_rate': 0.03332212166187645, 'num_leaves': 92, 'max_depth': 12, 'min_child_samples': 77, 'subsample': 0.9617294729199353, 'colsample_bytree': 0.6042685123995635}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:12<04:34,  3.11it/s][A

66.03416226321674
[I 2025-10-14 13:27:21,735] Trial 26 finished with value: 66.03416226321674 and parameters: {'learning_rate': 0.04521183128581688, 'num_leaves': 64, 'max_depth': 7, 'min_child_samples': 98, 'subsample': 0.8626122439507004, 'colsample_bytree': 0.6929202512603911}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:16<04:34,  3.11it/s][A

67.07397949616234
[I 2025-10-14 13:27:25,385] Trial 27 finished with value: 67.07397949616234 and parameters: {'learning_rate': 0.038659177207587554, 'num_leaves': 53, 'max_depth': 6, 'min_child_samples': 181, 'subsample': 0.7812362485686031, 'colsample_bytree': 0.7935687839285561}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:20<04:34,  3.11it/s][A

66.68077925381036
[I 2025-10-14 13:27:29,885] Trial 28 finished with value: 66.68077925381036 and parameters: {'learning_rate': 0.046548904973701066, 'num_leaves': 32, 'max_depth': 9, 'min_child_samples': 148, 'subsample': 0.6051382891434383, 'colsample_bytree': 0.7158648746782031}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074893 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:30<04:34,  3.11it/s][A

66.66570652079537
[I 2025-10-14 13:27:40,060] Trial 29 finished with value: 66.66570652079537 and parameters: {'learning_rate': 0.015718778537754177, 'num_leaves': 48, 'max_depth': 11, 'min_child_samples': 128, 'subsample': 0.7140919349283229, 'colsample_bytree': 0.9614167070571283}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101867 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:43<04:34,  3.11it/s][A

68.60540038902508
[I 2025-10-14 13:27:52,905] Trial 30 finished with value: 68.60540038902508 and parameters: {'learning_rate': 0.03266196133124716, 'num_leaves': 38, 'max_depth': 12, 'min_child_samples': 34, 'subsample': 0.6366615399380008, 'colsample_bytree': 0.8518110629712321}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:48<04:34,  3.11it/s][A

67.1285075720848
[I 2025-10-14 13:27:57,392] Trial 31 finished with value: 67.1285075720848 and parameters: {'learning_rate': 0.021321454351966813, 'num_leaves': 48, 'max_depth': 8, 'min_child_samples': 140, 'subsample': 0.8463316938827279, 'colsample_bytree': 0.6771206061781795}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099596 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [26:56<04:34,  3.11it/s][A

66.41456462724832
[I 2025-10-14 13:28:06,106] Trial 32 finished with value: 66.41456462724832 and parameters: {'learning_rate': 0.01563111445488214, 'num_leaves': 40, 'max_depth': 5, 'min_child_samples': 111, 'subsample': 0.782232153449224, 'colsample_bytree': 0.6297111474151564}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:04<04:34,  3.11it/s][A

67.37302553668178
[I 2025-10-14 13:28:13,430] Trial 33 finished with value: 67.37302553668178 and parameters: {'learning_rate': 0.01740630538126428, 'num_leaves': 55, 'max_depth': 7, 'min_child_samples': 71, 'subsample': 0.7472563615556056, 'colsample_bytree': 0.6062093162710137}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099303 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:12<04:34,  3.11it/s][A

66.35546820982825
[I 2025-10-14 13:28:22,094] Trial 34 finished with value: 66.35546820982825 and parameters: {'learning_rate': 0.018898396739496157, 'num_leaves': 65, 'max_depth': 6, 'min_child_samples': 130, 'subsample': 0.8349786525227344, 'colsample_bytree': 0.8109796813460549}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:21<04:34,  3.11it/s][A

65.91823512686776
[I 2025-10-14 13:28:31,198] Trial 35 finished with value: 65.91823512686776 and parameters: {'learning_rate': 0.0237372883748573, 'num_leaves': 75, 'max_depth': 8, 'min_child_samples': 89, 'subsample': 0.8877571651429312, 'colsample_bytree': 0.7433000605665018}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:28<04:34,  3.11it/s][A

67.7924194260173
[I 2025-10-14 13:28:37,849] Trial 36 finished with value: 67.7924194260173 and parameters: {'learning_rate': 0.02441918939258163, 'num_leaves': 87, 'max_depth': 9, 'min_child_samples': 94, 'subsample': 0.8881748359507222, 'colsample_bytree': 0.7743619108624491}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104891 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:40<04:34,  3.11it/s][A

68.03090278840038
[I 2025-10-14 13:28:49,576] Trial 37 finished with value: 68.03090278840038 and parameters: {'learning_rate': 0.026461881802832377, 'num_leaves': 75, 'max_depth': 8, 'min_child_samples': 41, 'subsample': 0.9607248018928308, 'colsample_bytree': 0.7514654582120193}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:47<04:34,  3.11it/s][A

67.17961626533615
[I 2025-10-14 13:28:56,445] Trial 38 finished with value: 67.17961626533615 and parameters: {'learning_rate': 0.013764875854185104, 'num_leaves': 98, 'max_depth': 5, 'min_child_samples': 84, 'subsample': 0.9382733081238267, 'colsample_bytree': 0.7294123266084646}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [27:54<04:34,  3.11it/s][A

67.1065718914968
[I 2025-10-14 13:29:03,934] Trial 39 finished with value: 67.1065718914968 and parameters: {'learning_rate': 0.023163651254035796, 'num_leaves': 75, 'max_depth': 6, 'min_child_samples': 66, 'subsample': 0.8112640673291818, 'colsample_bytree': 0.7935908509267983}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:02<04:34,  3.11it/s][A

67.57350002572586
[I 2025-10-14 13:29:12,090] Trial 40 finished with value: 67.57350002572586 and parameters: {'learning_rate': 0.01995247377452849, 'num_leaves': 80, 'max_depth': 7, 'min_child_samples': 55, 'subsample': 0.8856696567056153, 'colsample_bytree': 0.7007060489737618}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:08<04:34,  3.11it/s][A

65.92884490505591
[I 2025-10-14 13:29:17,544] Trial 41 finished with value: 65.92884490505591 and parameters: {'learning_rate': 0.030779288397175083, 'num_leaves': 60, 'max_depth': 8, 'min_child_samples': 105, 'subsample': 0.8006208531380634, 'colsample_bytree': 0.6395263795798461}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:14<04:34,  3.11it/s][A

66.23272170938768
[I 2025-10-14 13:29:23,932] Trial 42 finished with value: 66.23272170938768 and parameters: {'learning_rate': 0.030175074194897508, 'num_leaves': 70, 'max_depth': 8, 'min_child_samples': 106, 'subsample': 0.797483871284117, 'colsample_bytree': 0.6363094775438356}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.102961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:19<04:34,  3.11it/s][A

68.15917257861263
[I 2025-10-14 13:29:28,839] Trial 43 finished with value: 68.15917257861263 and parameters: {'learning_rate': 0.035904352903608126, 'num_leaves': 63, 'max_depth': 8, 'min_child_samples': 96, 'subsample': 0.847125333037803, 'colsample_bytree': 0.6621203390293051}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:25<04:34,  3.11it/s][A

66.26270183000776
[I 2025-10-14 13:29:34,703] Trial 44 finished with value: 66.26270183000776 and parameters: {'learning_rate': 0.027584206561154415, 'num_leaves': 86, 'max_depth': 10, 'min_child_samples': 117, 'subsample': 0.8211776423206051, 'colsample_bytree': 0.6450832652459838}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:29<04:34,  3.11it/s][A

67.22484635253736
[I 2025-10-14 13:29:38,930] Trial 45 finished with value: 67.22484635253736 and parameters: {'learning_rate': 0.039442580887043124, 'num_leaves': 60, 'max_depth': 9, 'min_child_samples': 91, 'subsample': 0.7696691660157057, 'colsample_bytree': 0.6823546549302644}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104536 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:35<04:34,  3.11it/s][A

66.51776433896622
[I 2025-10-14 13:29:45,167] Trial 46 finished with value: 66.51776433896622 and parameters: {'learning_rate': 0.03243610148104007, 'num_leaves': 93, 'max_depth': 7, 'min_child_samples': 132, 'subsample': 0.9796736752995747, 'colsample_bytree': 0.8340840541667495}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:41<04:34,  3.11it/s][A

67.10404684489944
[I 2025-10-14 13:29:50,826] Trial 47 finished with value: 67.10404684489944 and parameters: {'learning_rate': 0.03551598658472757, 'num_leaves': 77, 'max_depth': 11, 'min_child_samples': 75, 'subsample': 0.9036707666102208, 'colsample_bytree': 0.7135624355634274}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:47<04:34,  3.11it/s][A

66.32829300812976
[I 2025-10-14 13:29:56,699] Trial 48 finished with value: 66.32829300812976 and parameters: {'learning_rate': 0.04093964154916281, 'num_leaves': 70, 'max_depth': 12, 'min_child_samples': 167, 'subsample': 0.8603583218242022, 'colsample_bytree': 0.6190381693228592}. Best is trial 10 with value: 65.65902808568586.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620196
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000



[A                                                             
train images:  15%|█▍        | 146/1000 [28:53<04:34,  3.11it/s][A

65.76955665396966
[I 2025-10-14 13:30:02,602] Trial 49 finished with value: 65.76955665396966 and parameters: {'learning_rate': 0.04539720653789773, 'num_leaves': 83, 'max_depth': 8, 'min_child_samples': 88, 'subsample': 0.9274434499607571, 'colsample_bytree': 0.8815935448687511}. Best is trial 10 with value: 65.65902808568586.
 Best parameters saved to /kaggle/working/optuna_studies/best_params.json


In [22]:
with open(best_params_file, 'r') as f:
    best_params = json.load(f)

best_params.update({
    'objective': 'mae',
    'boosting_type': 'gbdt',
    'random_state': 42,
    'n_jobs': -1
})

final_model = lgb.LGBMRegressor(**best_params)
# Train on the entire dataset for the final model
final_model.fit(X_train_all, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.120222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 620201
[LightGBM] [Info] Number of data points in the train set: 1000, number of used features: 2433
[LightGBM] [Info] Start training from score 14.950000


In [24]:
test_pred = final_model.predict(X_test_all)
test_pred[test_pred < 0] = 0

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': test_pred
})
# Submission file is saved to /kaggle/working/ which is the required output directory
submission.to_csv(os.path.join(KAGGLE_WORKING_PATH, 'submission.csv'), index=False)
print("\n submission.csv generated successfully in /kaggle/working/!")


 submission.csv generated successfully in /kaggle/working/!


In [27]:
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study).show()
plot_param_importances(study).show()

NameError: name 'smpe' is not defined