# Table of Contents


1. [Import Libraries](#chapter1)
2. [Data Loading & Description](#chapter2)
3. [Data Preprocessing](#chapter3)
4. [Split Data & Build Pipeline](#chapter4)
```
4. [Exploratory Data Analysis (EDA)](#eda)
5. [Data Preprocessing](#data-preprocessing)
6. [Feature Engineering](#feature-engineering)
7. [Model Selection](#model-selection)
8. [Model Training](#model-training)
9. [Model Evaluation](#model-evaluation)
10. [Results and Discussion](#results-and-discussion)
11. [Conclusions](#conclusions)
12. [References](#references)
```


<a class="anchor" id="chapter1"></a>

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import optuna
import shap

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error

from optuna.integration import OptunaSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from optuna import distributions
import optuna
import shap

In [17]:
# Seed shared by the train/validation split and the final LightGBM model.
RANDOM_STATE = 42
# Fraction of the training rows held out as the validation set.
TEST_SIZE = 0.25

<a class="anchor" id="chapter2"></a>
## Data Loading & Description

In [69]:
# Prefer the Kaggle input directory; fall back to CSVs in the working
# directory when the notebook runs outside Kaggle. Only a missing file
# triggers the fallback -- any other failure (malformed CSV, interrupt, ...)
# is surfaced instead of being silently masked.
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
except FileNotFoundError:
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

In [70]:
def get_info(df):
    """Print a structural overview of ``df``.

    The report lists, in order: the object's type name and shape, the row
    count, the column count, the number of missing values per column, the
    number of duplicated rows, and the dtype of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    shape = df.shape
    print(f"\n{type(df).__name__} shape: {shape}")
    print(f"\n{shape[0]:,.0f} rows")
    print(f"\n{shape[1]:,.0f} columns")

    missing_per_column = df.isnull().sum()
    print(f'\nMissing Data: \n{missing_per_column}')

    duplicate_rows = df.duplicated().sum()
    print(f'\nDuplicates: {duplicate_rows}')

    column_dtypes = df.dtypes
    print(f'\nData Types: \n{column_dtypes}')


# Preview the first five rows, then print the structural summary.
display(train_df.head())
get_info(train_df)

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0



DataFrame shape: (230130, 6)

230,130 rows

6 columns

Missing Data: 
id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

Duplicates: 0

Data Types: 
id            int64
date         object
country      object
store        object
product      object
num_sold    float64
dtype: object


<a class="anchor" id="chapter3"></a>
## Data Preprocessing

In [71]:
def convert_date(df):
    """Derive calendar features from the ``date`` column.

    The input frame is left untouched: the work is done on a copy, so the
    function can be called repeatedly on the same raw frame.

    Added columns (in this order): ``month_name``, ``day_of_week``,
    ``month_sin``, ``month_cos``, ``day_sin``, ``day_cos``, ``year_sin``,
    ``year_cos``, ``quarter``. The ``id`` and ``date`` columns are removed
    from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and a ``date`` column parseable by
        ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the calendar features and without ``id``/``date``.

    Raises
    ------
    KeyError
        If ``date`` or ``id`` is missing from ``df``.
    """
    out = df.copy()
    dates = pd.to_datetime(out['date'])

    # Intermediate calendar fields; kept local because they are not features.
    year = dates.dt.year
    month = dates.dt.month
    day_of_year = dates.dt.dayofyear

    out['month_name'] = dates.dt.month_name()
    out['day_of_week'] = dates.dt.day_name()

    # Cyclical encodings: month over 12, day-of-year over 365 (day 366 of a
    # leap year therefore wraps slightly past one full period).
    out["month_sin"] = np.sin(month * (2 * np.pi / 12))
    out["month_cos"] = np.cos(month * (2 * np.pi / 12))
    out['day_sin'] = np.sin(day_of_year * (2 * np.pi / 365))
    out['day_cos'] = np.cos(day_of_year * (2 * np.pi / 365))
    # NOTE(review): the year is encoded with a period of 7 -- presumably the
    # number of years covered by the training data; confirm this is intended
    # for dates outside that range.
    out['year_sin'] = np.sin(2 * np.pi * year / 7)
    out['year_cos'] = np.cos(2 * np.pi * year / 7)
    out['quarter'] = dates.dt.quarter

    return out.drop(columns=['id', 'date'])

train_df = convert_date(train_df)

# Impute missing sales with the mean of the same (country, product) group,
# then move the target to a log1p scale.
sold_by_group = train_df.groupby(['country', 'product'])['num_sold']
filled_sales = sold_by_group.transform(lambda sales: sales.fillna(sales.mean()))
train_df['num_sold'] = np.log1p(filled_sales)

# Sanity check: preview the engineered frame and confirm no NaNs remain.
display(train_df.head())
print(f'\nMissing Data: \n{train_df.isnull().sum()}')

Unnamed: 0,country,store,product,num_sold,month_name,day_of_week,month_sin,month_cos,day_sin,day_cos,year_sin,year_cos,quarter
0,Canada,Discount Stickers,Holographic Goose,5.489967,January,Friday,0.5,0.866025,0.017213,0.999852,0.781831,0.62349,1
1,Canada,Discount Stickers,Kaggle,6.881411,January,Friday,0.5,0.866025,0.017213,0.999852,0.781831,0.62349,1
2,Canada,Discount Stickers,Kaggle Tiers,6.810142,January,Friday,0.5,0.866025,0.017213,0.999852,0.781831,0.62349,1
3,Canada,Discount Stickers,Kerneler,6.049733,January,Friday,0.5,0.866025,0.017213,0.999852,0.781831,0.62349,1
4,Canada,Discount Stickers,Kerneler Dark Mode,6.198479,January,Friday,0.5,0.866025,0.017213,0.999852,0.781831,0.62349,1



Missing Data: 
country        0
store          0
product        0
num_sold       0
month_name     0
day_of_week    0
month_sin      0
month_cos      0
day_sin        0
day_cos        0
year_sin       0
year_cos       0
quarter        0
dtype: int64


In [72]:
# Column roles used by the preprocessing pipeline.

# Scaled as numeric inputs: cyclical date encodings plus the quarter number.
numerical_features = [
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'year_sin',
    'year_cos',
    'quarter',
]

# One-hot encoded inputs.
categorical_features = [
    'country',
    'store',
    'product',
    'month_name',
    'day_of_week',
]

# Column the model learns to predict.
target_column = 'num_sold'

<a class="anchor" id="chapter4"></a>
## Split Data & Build Pipeline

In [73]:
X = train_df.drop(columns=target_column)
y = train_df[target_column]

# Split BEFORE fitting any transformer: the encoder categories and scaler
# statistics must be learned from the training rows only, otherwise
# information from the validation rows leaks into preprocessing.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (first level dropped, unknown levels encoded as all zeros).
ohe_pipe = Pipeline([
        ('simpleImputer_ohe', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
    ])

data_preprocessor = ColumnTransformer([
    ('ohe', ohe_pipe, categorical_features),
    ('num', StandardScaler(), numerical_features)
])

print(data_preprocessor)

# Fit on the training split only; every other frame is transformed with the
# already-fitted preprocessor.
X_train = data_preprocessor.fit_transform(X_train_raw)
X_val = data_preprocessor.transform(X_val_raw)
# Kept for any later cell that expects the fully transformed training frame.
X_preprocessed = data_preprocessor.transform(X)

# test_copy keeps the original columns (including 'id') for the submission.
test_copy = test_df.copy()
# Feed convert_date a copy so test_df itself is never altered and this cell
# stays re-runnable regardless of what convert_date does to its argument.
test_preprocessed = data_preprocessor.transform(convert_date(test_df.copy()))

print(X_train.shape, X_val.shape)

ColumnTransformer(transformers=[('ohe',
                                 Pipeline(steps=[('simpleImputer_ohe',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore',
                                                                sparse_output=False))]),
                                 ['country', 'store', 'product', 'month_name',
                                  'day_of_week']),
                                ('num', StandardScaler(),
                                 ['month_sin', 'month_cos', 'day_sin',
                                  'day_cos', 'year_sin', 'year_cos',
                                  'quarter'])])
(172597, 35) (57533, 35)


In [74]:
# Wrap the train / validation splits in LightGBM Dataset objects; the
# validation set is tied to the training set through `reference`.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_val, label=y_val, reference=lgb_train)

def objective(trial):
    """Optuna objective: train one LightGBM model and score it on validation.

    Samples ``n_estimators``, ``learning_rate``, ``num_leaves``,
    ``max_depth`` and ``reg_alpha`` from the trial, trains on ``lgb_train``
    with ``lgb_eval`` as the validation set, and returns the MAPE between
    ``y_val`` and the predictions on ``X_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Validation MAPE (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'mape',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'boosting_type': "gbdt",
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', -1, 20),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) is
        # its documented replacement and keeps the parameter name unchanged.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        "verbosity": -1,
        # Same seed constant as the split and the final model.
        'seed': RANDOM_STATE,
        # NOTE(review): requires a GPU-enabled LightGBM build; the final
        # model below is fitted without this setting -- confirm intended.
        'device': 'gpu'
    }

    # Train LightGBM model
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
    )

    # Predict on validation set
    y_val_pred = model.predict(X_val)

    # NOTE(review): the target was log1p-transformed during preprocessing,
    # so this MAPE is measured on the log scale, not on raw `num_sold`.
    # Confirm that is the quantity you want to minimise.
    mape = mean_absolute_percentage_error(y_val, y_val_pred)
    return mape


# Silence Optuna's INFO logs first so the study-creation message is hidden too.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Seed the sampler so the hyperparameter search proposes the same sequence of
# trials on every run (TPE is Optuna's default sampler; only the seed is new).
# NOTE(review): GPU training in LightGBM may still introduce small
# run-to-run differences in the scores -- verify if exact repeatability matters.
study = optuna.create_study(
    study_name="LGBM_Kaggle",
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=40, show_progress_bar=True)

# Best parameters and MAPE
print("Best parameters:", study.best_params)
print("Best MAPE:", study.best_value)

  0%|          | 0/40 [00:00<?, ?it/s]

Best parameters: {'n_estimators': 996, 'learning_rate': 0.033370824463028796, 'num_leaves': 124, 'max_depth': 12, 'reg_alpha': 0.0008353834116407559}
Best MAPE: 0.00847475554952956


In [None]:
# Refit a scikit-learn style LightGBM regressor with the best hyperparameters
# found by the study; the validation split is passed as the evaluation set.
final_params = study.best_params
final_model = lgb.LGBMRegressor(random_state=RANDOM_STATE, **final_params)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
)

In [78]:
final_preds = final_model.predict(test_preprocessed)

# The model was trained on log1p(num_sold), so invert with expm1 before
# writing the submission file keyed by the original test ids.
submission = pd.DataFrame({
    'id': test_copy['id'],
    'num_sold': np.expm1(final_preds),
})
submission.to_csv("submission.csv", index=False)