## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import optuna

import warnings
warnings.filterwarnings("ignore")

## Load Datasets

In [2]:
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
except:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

## Train Data Preprocessing

In [3]:
#Convert datatime column into year, month, day, day of week
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['year'] = train_df['date'].dt.year
train_df['month'] = train_df['date'].dt.month
train_df['day'] = train_df['date'].dt.day
#train_df['dow'] = train_df['date'].dt.dayofweek

#Delete columns
train_df = train_df.drop(columns=['date', 'id'])

#Delete NaN values
train_df = train_df.dropna(subset=['num_sold'])
train_df['num_sold'] = np.log1p(train_df['num_sold'])

display(train_df.head())
print()
train_df.info()

Unnamed: 0,country,store,product,num_sold,year,month,day
1,Canada,Discount Stickers,Kaggle,6.881411,2010,1,1
2,Canada,Discount Stickers,Kaggle Tiers,6.810142,2010,1,1
3,Canada,Discount Stickers,Kerneler,6.049733,2010,1,1
4,Canada,Discount Stickers,Kerneler Dark Mode,6.198479,2010,1,1
5,Canada,Stickers for Less,Holographic Goose,5.70711,2010,1,1



<class 'pandas.core.frame.DataFrame'>
Index: 221259 entries, 1 to 230129
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   221259 non-null  object 
 1   store     221259 non-null  object 
 2   product   221259 non-null  object 
 3   num_sold  221259 non-null  float64
 4   year      221259 non-null  int32  
 5   month     221259 non-null  int32  
 6   day       221259 non-null  int32  
dtypes: float64(1), int32(3), object(3)
memory usage: 11.0+ MB


## Split Data: Train and Validation Datasets

In [4]:
RANDOM_STATE = 42
TEST_SIZE = 0.25

X = train_df.drop(columns='num_sold')
y = train_df['num_sold']

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = TEST_SIZE, 
    random_state = RANDOM_STATE)

print(X_train.shape, X_val.shape)

(165944, 6) (55315, 6)


In [5]:
cat_col_names = list(X_train.select_dtypes(include=['object']).columns)
num_col_names = list(X_train.select_dtypes(include=['number']).columns)
print(f'Categorical_features: {cat_col_names}')
print(f'Numerical features: {num_col_names}')

Categorical_features: ['country', 'store', 'product']
Numerical features: ['year', 'month', 'day']


## Encode Features

In [6]:
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train_ohe = encoder.fit_transform(X_train[cat_col_names])
X_val_ohe = encoder.transform(X_val[cat_col_names])
encoder_col_names = encoder.get_feature_names_out()
X_train_ohe = pd.DataFrame(X_train_ohe, columns=encoder_col_names)
X_val_ohe = pd.DataFrame(X_val_ohe, columns=encoder_col_names)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[num_col_names])
X_val_scaled = scaler.transform(X_val[num_col_names])
X_train_scaled = pd.DataFrame(X_train_scaled, columns=num_col_names)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=num_col_names)

X_train = pd.concat([X_train_ohe, X_train_scaled], axis=1)
X_val = pd.concat([X_val_ohe, X_val_scaled], axis=1)

In [7]:
#Load the train and validation data into the LightGBM dataset object
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

## Optuna Study Object

In [8]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', -1, 20),  # -1 означает отсутствие ограничения
        #'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        #'min_sum_hessian_in_leaf': trial.suggest_loguniform('min_sum_hessian_in_leaf', 1e-3, 10.0),
        #'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        #'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        #'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        #'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        #'max_bin': trial.suggest_int('max_bin', 100, 500)
    }

    model = lgb.LGBMRegressor(**params, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return mean_absolute_percentage_error(y_val, predictions)

In [9]:
study = optuna.create_study(study_name="LGBM_Kaggle", direction='minimize')
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2025-01-12 16:13:26,165] A new study created in memory with name: LGBM_Kaggle


  0%|          | 0/30 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 165944, number of used features: 14
[LightGBM] [Info] Start training from score 5.936026
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 165944, number of used features: 14
[LightGBM] [Info] Start training from score 5.936026
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

In [10]:
final_params = study.best_params
final_params

{'boosting_type': 'gbdt',
 'learning_rate': 0.09860380337639357,
 'num_leaves': 141,
 'max_depth': 14}

In [11]:
final_params = study.best_params
final_model = lgb.LGBMRegressor(**final_params, random_state=RANDOM_STATE)
final_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 165944, number of used features: 14
[LightGBM] [Info] Start training from score 5.936026


In [12]:
test_df['date'] = pd.to_datetime(test_df['date'])
test_df['year'] = test_df['date'].dt.year
test_df['month'] = test_df['date'].dt.month
test_df['day'] = test_df['date'].dt.day

X_test_ohe = encoder.transform(test_df[cat_col_names])
X_test_ohe = pd.DataFrame(X_test_ohe, columns=encoder_col_names)

X_test_scaled = scaler.transform(test_df[num_col_names])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=num_col_names)

X_test = pd.concat([X_test_ohe, X_test_scaled], axis=1)

In [13]:
final_preds = final_model.predict(X_test)
pd.DataFrame({'id': test_df['id'], 'num_sold': np.expm1(final_preds)}).to_csv("submission.csv", index=False)