In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import lightgbm as lgb

In [3]:
import os
import sys

project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [4]:
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
	sys.path.insert(0, src_path)

from data_loader import load_data , load_clean_data
from features import add_time_features, create_target

In [5]:
df = load_clean_data("../data/processed/data.csv")

In [6]:
df.head()

Unnamed: 0,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend,target,InvoiceDate
0,United Kingdom,0.465631,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
1,United Kingdom,0.517342,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
2,United Kingdom,0.551318,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
3,United Kingdom,0.39055,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
4,United Kingdom,0.204992,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00


In [7]:
X = df.drop(columns="target")
y = df["target"]

In [8]:
train_mask = df["InvoiceDate"] <= "2010-06-30"
val_mask   = (df["InvoiceDate"] > "2010-06-30") & (df["InvoiceDate"] <= "2010-09-30")
test_mask  = df["InvoiceDate"] > "2010-09-30"


In [9]:
X = X.drop('InvoiceDate', axis=1)
X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask], y[val_mask]
X_test,  y_test  = X[test_mask], y[test_mask]

In [10]:
import optuna
import sklearn.metrics

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
CATEGORICAL_FEATURES = [
    'Country',
    'StockCode'
]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 11 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Country      1067371 non-null  object 
 1   StockCode    1067371 non-null  float64
 2   Year         1067371 non-null  int64  
 3   Month        1067371 non-null  int64  
 4   Day          1067371 non-null  int64  
 5   Hour         1067371 non-null  int64  
 6   min          1067371 non-null  int64  
 7   sec          1067371 non-null  int64  
 8   is_weekend   1067371 non-null  int64  
 9   target       1067371 non-null  int64  
 10  InvoiceDate  1067371 non-null  object 
dtypes: float64(1), int64(8), object(2)
memory usage: 89.6+ MB


In [13]:
# List of columns identified in the error
# Convert to category type
for col in CATEGORICAL_FEATURES:
    X_train[col] = X_train[col].astype('category')
    X_val[col] = X_val[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[col] = X_val[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
T

In [14]:
# Replace StockCode with its frequency count
enc = X_train['StockCode'].value_counts()
X_train['StockCode_Freq'] = X_train['StockCode'].map(enc)
X_val['StockCode_Freq'] = X_val['StockCode'].map(enc).fillna(0)

# Drop the original high-cardinality column
X_train.drop('StockCode', axis=1, inplace=True)
X_val.drop('StockCode', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['StockCode_Freq'] = X_train['StockCode'].map(enc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val['StockCode_Freq'] = X_val['StockCode'].map(enc).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop('StockCode', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

In [15]:
X_val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108572 entries, 255103 to 363674
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   Country         108572 non-null  category
 1   Year            108572 non-null  int64   
 2   Month           108572 non-null  int64   
 3   Day             108572 non-null  int64   
 4   Hour            108572 non-null  int64   
 5   min             108572 non-null  int64   
 6   sec             108572 non-null  int64   
 7   is_weekend      108572 non-null  int64   
 8   StockCode_Freq  108572 non-null  float64 
dtypes: category(1), float64(1), int64(7)
memory usage: 7.6 MB


In [16]:
CATEGORICAL_FEATURES = [
    'Country'
]

In [17]:
df_test = X_test.copy()
df_test["target"] = y_test

df_train = X_train.copy()
df_train["target"] = y_train

df_val = X_val.copy()
df_val["target"] = y_val

In [18]:
df_val.head()

Unnamed: 0,Country,Year,Month,Day,Hour,min,sec,is_weekend,StockCode_Freq,target
255103,United Kingdom,2010,6,30,8,14,0,0,3.0,0
255104,United Kingdom,2010,6,30,8,14,0,0,29.0,0
255105,United Kingdom,2010,6,30,8,14,0,0,15.0,0
255106,United Kingdom,2010,6,30,8,14,0,0,22.0,0
255107,United Kingdom,2010,6,30,8,14,0,0,25.0,0


In [19]:
known_countries = set(df_train["Country"].unique())

def map_country(x):
    return x if x in known_countries else "UNKNOWN"

df["Country"] = df["Country"].apply(map_country)


In [20]:
df.to_csv('../data/processed/data.csv')

In [21]:
df.head()

Unnamed: 0,Country,StockCode,Year,Month,Day,Hour,min,sec,is_weekend,target,InvoiceDate
0,United Kingdom,0.465631,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
1,United Kingdom,0.517342,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
2,United Kingdom,0.551318,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
3,United Kingdom,0.39055,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00
4,United Kingdom,0.204992,2009,12,1,7,45,0,0,1,2009-12-01 07:45:00


In [22]:
df_test.to_csv("../data/processed/test.csv")
df_val.to_csv("../data/processed/val.csv")
df_train.to_csv("../data/processed/train.csv")

In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 255103 entries, 0 to 255102
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   Country         255103 non-null  category
 1   Year            255103 non-null  int64   
 2   Month           255103 non-null  int64   
 3   Day             255103 non-null  int64   
 4   Hour            255103 non-null  int64   
 5   min             255103 non-null  int64   
 6   sec             255103 non-null  int64   
 7   is_weekend      255103 non-null  int64   
 8   StockCode_Freq  255103 non-null  int64   
dtypes: category(1), int64(8)
memory usage: 17.8 MB


In [24]:
# 1. Create Datasets ONCE (Outside the objective function)

dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=CATEGORICAL_FEATURES)
dvalid = lgb.Dataset(X_val, label=y_val, categorical_feature=CATEGORICAL_FEATURES, reference=dtrain)


def objective(trial):
    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        'is_unbalance': True,
        "feature_pre_filter": False,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True), # Added this!
        "device": "gpu"
    }

    # 2. Add Early Stopping to save time and prevent overfitting
    gbm = lgb.train(
        param, 
        dtrain, 
        valid_sets=[dvalid],
        num_boost_round=1000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=0) # Keeps the console clean
        ]
    )
    
    preds = gbm.predict(X_val)
    pred_labels = (preds >= 0.5).astype(int)
    accuracy = sklearn.metrics.accuracy_score(y_val, pred_labels)
    return accuracy

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(objective , n_trials=10)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2026-02-10 20:48:13,075] A new study created in memory with name: no-name-a9d776f6-0ced-4c27-ba3c-52157cdb765a


Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:48:16,882] Trial 0 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 0.00016553097634725272, 'lambda_l2': 0.1864410313075773, 'num_leaves': 123, 'feature_fraction': 0.43812463637746557, 'bagging_fraction': 0.899354786438088, 'bagging_freq': 6, 'min_child_samples': 53, 'learning_rate': 0.014880278839476957}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[12]	valid_0's binary_logloss: 0.534886
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.531807


[I 2026-02-10 20:49:08,715] Trial 1 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 1.0424900131739931e-08, 'lambda_l2': 2.0738555600936612e-08, 'num_leaves': 252, 'feature_fraction': 0.8974813503797718, 'bagging_fraction': 0.8007621866441594, 'bagging_freq': 6, 'min_child_samples': 9, 'learning_rate': 0.0038306131415029045}. Best is trial 0 with value: 0.771874884869027.


Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:25,584] Trial 2 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 2.564086387882776e-07, 'lambda_l2': 1.4958593712669562e-07, 'num_leaves': 126, 'feature_fraction': 0.6726132829995259, 'bagging_fraction': 0.7260654595305716, 'bagging_freq': 3, 'min_child_samples': 51, 'learning_rate': 0.019160769545198523}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[7]	valid_0's binary_logloss: 0.533124
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:42,444] Trial 3 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 2.417043688060674e-08, 'lambda_l2': 1.032971297171643e-05, 'num_leaves': 57, 'feature_fraction': 0.9446124707322061, 'bagging_fraction': 0.9461955299602381, 'bagging_freq': 4, 'min_child_samples': 32, 'learning_rate': 0.0015436176887392741}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[111]	valid_0's binary_logloss: 0.531059
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:43,081] Trial 4 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 0.0030816331099536806, 'lambda_l2': 0.024818131367840852, 'num_leaves': 3, 'feature_fraction': 0.9458953973582122, 'bagging_fraction': 0.5958179442065534, 'bagging_freq': 4, 'min_child_samples': 63, 'learning_rate': 0.002286248569329274}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[56]	valid_0's binary_logloss: 0.534278
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:45,796] Trial 5 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 2.9093209383548097e-08, 'lambda_l2': 3.144818003132856, 'num_leaves': 125, 'feature_fraction': 0.6382921365174583, 'bagging_fraction': 0.8792214979624051, 'bagging_freq': 2, 'min_child_samples': 71, 'learning_rate': 0.060768578423142025}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[2]	valid_0's binary_logloss: 0.532726
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:47,442] Trial 6 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 1.2777590480906528e-06, 'lambda_l2': 0.6086832261793683, 'num_leaves': 66, 'feature_fraction': 0.44696717237496114, 'bagging_fraction': 0.7987457308790864, 'bagging_freq': 2, 'min_child_samples': 87, 'learning_rate': 0.09605406444692535}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.535133
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:52,279] Trial 7 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 3.1485426457606593e-06, 'lambda_l2': 0.0010455660707634278, 'num_leaves': 252, 'feature_fraction': 0.8724818094959567, 'bagging_fraction': 0.5277889134617072, 'bagging_freq': 2, 'min_child_samples': 20, 'learning_rate': 0.054283405907522786}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[3]	valid_0's binary_logloss: 0.532039
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:55,077] Trial 8 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 0.21433229005421667, 'lambda_l2': 0.4657594602472436, 'num_leaves': 118, 'feature_fraction': 0.9745544388046199, 'bagging_fraction': 0.5658221865204718, 'bagging_freq': 6, 'min_child_samples': 83, 'learning_rate': 0.01796953018451152}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[10]	valid_0's binary_logloss: 0.530715
Training until validation scores don't improve for 50 rounds


[I 2026-02-10 20:49:56,999] Trial 9 finished with value: 0.771874884869027 and parameters: {'lambda_l1': 1.0047700377557496e-05, 'lambda_l2': 0.00018569863384222102, 'num_leaves': 32, 'feature_fraction': 0.5368002300723055, 'bagging_fraction': 0.7331943917391051, 'bagging_freq': 2, 'min_child_samples': 80, 'learning_rate': 0.003327106732601631}. Best is trial 0 with value: 0.771874884869027.


Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.534062
Number of finished trials: 10
Best trial:
  Value: 0.771874884869027
  Params: 
    lambda_l1: 0.00016553097634725272
    lambda_l2: 0.1864410313075773
    num_leaves: 123
    feature_fraction: 0.43812463637746557
    bagging_fraction: 0.899354786438088
    bagging_freq: 6
    min_child_samples: 53
    learning_rate: 0.014880278839476957


In [26]:
from pathlib import Path

Path("../models").mkdir(parents=True, exist_ok=True)



In [28]:
best_params = study.best_params
best_params.update({
    "objective": "binary",
    "metric": "binary_logloss",
    "verbose": -1,
    "boosting_type": "gbdt",
    "device": "gpu",
})

dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=CATEGORICAL_FEATURES)
gbm = lgb.train(best_params, dtrain)

gbm.save_model("../models/lgbm_optuna.txt")


<lightgbm.basic.Booster at 0x1a314c0cc50>