# Goal:

Try training LGBM with basic features (14d prediction)

Group features:
- same id, product id, name first, name second, categores, warehouse, warhouse location, combos
- all of these --> lag sales, rolling sales, etc.

Date features:
- dow, month, year, etc.
- is holiday

In [169]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from statsmodels.tsa.deterministic import (CalendarFourier,
                                           CalendarSeasonality,
                                           CalendarTimeTrend,
                                           DeterministicProcess)

In [180]:
DIRECTORY = './data/'
calendar = pd.read_csv(DIRECTORY + 'calendar.csv', parse_dates=['date'])
inventory = pd.read_csv(DIRECTORY + 'inventory.csv')
test = pd.read_csv(DIRECTORY + 'sales_test.csv', parse_dates=['date'])
train = pd.read_csv(DIRECTORY + 'sales_train.csv', parse_dates=['date'])
solution = pd.read_csv(DIRECTORY + 'solution.csv')
test_weights = pd.read_csv(DIRECTORY + 'test_weights.csv')

## Add Predictors

In [181]:
# Initial combined df
combined = train.merge(test_weights, on='unique_id', how='left') \
    .merge(inventory.drop(columns='warehouse'), on='unique_id', how='left') \
    .merge(calendar[['date', 'holiday_name', 'holiday', 'warehouse']], on=['date', 'warehouse'], how='left')
combined_no_holidays = combined[combined['holiday'] == 0].drop(columns=['holiday', 'holiday_name']).copy()
combined_no_holidays = combined_no_holidays.sort_values(by='date')
del calendar, inventory, test, train, solution, test_weights, combined

In [182]:
features = combined_no_holidays

In [183]:
# Add categorical features for groupings
def date_features(df):
    df = df.copy()
    df['day_of_year'] = df['date'].dt.day_of_year
    df['day_of_month'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year

    yearly = CalendarFourier(freq='A', order=4).in_sample(df['date']).reset_index(drop=True)
    monthly = CalendarFourier(freq='M', order=2).in_sample(df['date']).reset_index(drop=True)
    df = pd.concat([df, yearly, monthly], axis=1)

    return df

date_added = date_features(features)

  index = pd.date_range("2020-01-01", freq=freq, periods=1)
  index = pd.date_range("2020-01-01", freq=freq, periods=1)


In [None]:
date_added

  index = pd.date_range("2020-01-01", freq=freq, periods=1)


Unnamed: 0,"sin(1,freq=YE-DEC)","cos(1,freq=YE-DEC)","sin(2,freq=YE-DEC)","cos(2,freq=YE-DEC)","sin(3,freq=YE-DEC)","cos(3,freq=YE-DEC)","sin(4,freq=YE-DEC)","cos(4,freq=YE-DEC)"
0,-0.492548,-0.870285,0.857315,0.514793,-0.999668,-0.025748,0.882679,-0.469977
1,-0.492548,-0.870285,0.857315,0.514793,-0.999668,-0.025748,0.882679,-0.469977
2,-0.492548,-0.870285,0.857315,0.514793,-0.999668,-0.025748,0.882679,-0.469977
3,-0.492548,-0.870285,0.857315,0.514793,-0.999668,-0.025748,0.882679,-0.469977
4,-0.492548,-0.870285,0.857315,0.514793,-0.999668,-0.025748,0.882679,-0.469977
...,...,...,...,...,...,...,...,...
3843948,0.492548,-0.870285,-0.857315,0.514793,0.999668,-0.025748,-0.882679,-0.469977
3843949,0.492548,-0.870285,-0.857315,0.514793,0.999668,-0.025748,-0.882679,-0.469977
3843950,0.492548,-0.870285,-0.857315,0.514793,0.999668,-0.025748,-0.882679,-0.469977
3843951,0.492548,-0.870285,-0.857315,0.514793,0.999668,-0.025748,-0.882679,-0.469977


In [154]:
# Lag features
def add_rolling_feature(df, groupby, shift_number):
    return df.groupby(groupby)['sales'].rolling(shift_number, closed='left').mean().reset_index(level=0, drop=True)

# grouping = [['unique_id'], ['']]
features['previous_sale'] = add_rolling_feature(features, ['unique_id'], 1)
# features['new_col'] = add_rolling_feature(features, ['unique_id'], 1)

In [155]:
## Final Processing
features.index = features['date'].dt.to_period('M')
features = features.sort_index()

def fix_df(df):
    categorical = list(df.select_dtypes(include='object').columns)
    for col in categorical:
        df[col] = df[col].astype('category')
    return df

features = fix_df(features)

## Testing LGBM

In [157]:
model = lgb.LGBMRegressor()

months = np.sort(features.index.unique())
test_month_indices = np.arange(-2, -10, -1)
test_month_indices = [-2]
for test_month_index in test_month_indices:
    test_month = months[test_month_index]
    train_end = test_month - 1

    train = features.loc[:train_end]
    test = features.loc[test_month]

    categorical = list(features.select_dtypes(include='category').columns)

    train_data = lgb.Dataset(train.drop(columns=['sales', 'weight']), label=train['sales'], weight=train['weight'], categorical_feature=categorical)
    test_data = lgb.Dataset(test.drop(columns=['sales', 'weight']), label=test['sales'], weight=test['weight'], categorical_feature=categorical)
    params = {"objective": "regression", "metric": "mse"}
    model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[test_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=True),  # Stops if no improvement in 50 rounds
        lgb.log_evaluation(period=100)  # Logs every 10 iterations
    ]
)

DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Int16DType'>, <class 'numpy.dtypes.Int8DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float32DType'>)

In [34]:
combined_no_holidays.loc[test_month]

Unnamed: 0_level_0,unique_id,date,warehouse,total_orders,sales,sell_price_main,availability,type_0_discount,type_1_discount,type_2_discount,...,type_4_discount,type_5_discount,type_6_discount,weight,product_unique_id,name,L1_category_name_en,L2_category_name_en,L3_category_name_en,L4_category_name_en
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-10,2607,2023-10-30,Prague_3,4781.0,67.14,53.25,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.00000,1.600825,1301,Bread_97,Bakery,Bakery_L2_9,Bakery_L3_75,Bakery_L4_15
2023-10,2801,2023-10-19,Frankfurt_1,1640.0,66.03,0.69,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.00000,0.934970,1385,Radish_11,Fruit and vegetable,Fruit and vegetable_L2_3,Fruit and vegetable_L3_33,Fruit and vegetable_L4_1
2023-10,912,2023-10-14,Prague_1,9228.0,864.73,33.91,1.00,0.36144,0.0,0.0,...,0.0,0.0,0.00000,3.157181,472,Leek_1,Fruit and vegetable,Fruit and vegetable_L2_3,Fruit and vegetable_L3_63,Fruit and vegetable_L4_41
2023-10,2295,2023-10-30,Prague_3,4781.0,19.80,46.11,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.00000,1.642732,1152,Pastry_96,Bakery,Bakery_L2_14,Bakery_L3_26,Bakery_L4_1
2023-10,3192,2023-10-15,Brno_1,7513.0,53.92,88.06,0.41,0.00000,0.0,0.0,...,0.0,0.0,0.00000,7.715255,1576,Plum_17,Fruit and vegetable,Fruit and vegetable_L2_1,Fruit and vegetable_L3_102,Fruit and vegetable_L4_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10,1453,2023-10-07,Prague_1,8994.0,144.39,167.75,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.00000,11.050133,741,Chicken_31,Meat and fish,Meat and fish_L2_13,Meat and fish_L3_32,Meat and fish_L4_7
2023-10,1089,2023-10-06,Prague_2,6284.0,23.25,145.70,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.25008,5.709209,558,Chicken_25,Meat and fish,Meat and fish_L2_13,Meat and fish_L3_27,Meat and fish_L4_5
2023-10,76,2023-10-30,Brno_1,7936.0,16.33,281.33,0.31,0.00000,0.0,0.0,...,0.0,0.0,0.00000,16.337250,34,Pork_0,Meat and fish,Meat and fish_L2_15,Meat and fish_L3_28,Meat and fish_L4_6
2023-10,5327,2023-10-11,Prague_2,5528.0,11.35,357.72,1.00,0.00000,0.0,0.0,...,0.0,0.0,0.00000,13.401616,2618,Beef_28,Meat and fish,Meat and fish_L2_26,Meat and fish_L3_72,Meat and fish_L4_21
