In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import lightgbm as lgb
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.impute import KNNImputer
from scipy.stats import uniform, randint
from datetime import timedelta

from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')



# **Read Data**

In [2]:
from google.colab import drive
drive.mount('/drive/', force_remount=True)

Mounted at /drive/


In [3]:
cd /drive/My\ Drive/Colab\ Notebooks/CSV

/drive/My Drive/Colab Notebooks/CSV


In [4]:
bm = pd.read_csv('BritishMart.csv')
sm = pd.read_csv('ScottishMart.csv')
ukm = pd.read_csv('UKMart.csv')
cvm = pd.read_csv('CVanyMart.csv')
cvf = pd.read_csv('CVFamz.csv')
pd.set_option('display.max_colwidth', None)

# df_toko = pd.concat([bm, sm, ukm, cvm, cvf], ignore_index=True)

In [5]:
# Rename ScottishMart['harga'] into 'harga_pound'
sm = sm.rename(columns={'harga': 'harga_pound'})

In [6]:
# Map sesuai perubahan
unit_mapping = {'kg': 'kg', 'l': 'litre', 'unit': 'ounces', 'm': 'meter'}

bm['unit'] = bm['unit'].replace(unit_mapping)
sm['unit'] = sm['unit'].replace(unit_mapping)
ukm['unit'] = ukm['unit'].replace(unit_mapping)
cvm['unit'] = cvm['unit'].replace(unit_mapping)
cvf['unit'] = cvf['unit'].replace(unit_mapping)

# **Data Cleaning**

## **Drop Missing Values**

In [7]:
# Drop all empty values
bm.dropna(inplace=True)
sm.dropna(inplace=True)
ukm.dropna(inplace=True)
cvm.dropna(inplace=True)
cvf.dropna(inplace=True)

## **Standarize Category**

In [8]:
# Product category standardization
bm[bm['nama'] == 'Alpro Almond Chilled Drink'].head(10)

Unnamed: 0,nama_toko,harga_pound,harga_per_unit,unit,nama,kategori,brand_sendiri,tanggal
1,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,drinks,False,2021-01-01
16332,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-01-20
36698,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,drinks,False,2021-02-13
60982,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,fresh_food,False,2021-03-13
89688,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,fresh_food,False,2021-04-16
109517,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,fresh_food,False,2021-05-10
125376,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,fresh_food,False,2021-05-28
152528,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,drinks,False,2021-06-29
167763,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,drinks,False,2021-07-17
189559,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,drinks,False,2021-08-12


In [9]:
def agg_mode(df, column, target):
    mode_kategori = df.groupby(column)[target].agg(lambda x: x.mode()[0])
    df = pd.merge(df, mode_kategori, how='left', on=column, suffixes=('', '_mode'))
    df['kategori'] = df['kategori_mode']
    df.drop(columns=['kategori_mode'], inplace=True)
    return df

bm = agg_mode(bm, 'nama', 'kategori')
sm = agg_mode(sm, 'nama', 'kategori')
ukm = agg_mode(ukm, 'nama', 'kategori')
cvm = agg_mode(cvm, 'nama', 'kategori')
cvf = agg_mode(cvf, 'nama', 'kategori')

In [10]:
bm[bm['nama'] == 'Alpro Almond Chilled Drink'].head(10)

Unnamed: 0,nama_toko,harga_pound,harga_per_unit,unit,nama,kategori,brand_sendiri,tanggal
1,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-01-01
16332,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-01-20
36698,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-02-13
60982,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-03-13
89688,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-04-16
109517,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-05-10
125376,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-05-28
152528,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-06-29
167763,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-07-17
189559,BritishMart,2.1,2.1,litre,Alpro Almond Chilled Drink,free-from,False,2021-08-12


## **Clean Data By Brand Ownership**

In [11]:
def clean_dataframe(df):
    cleaned_df = df[['harga_per_unit', 'nama', 'tanggal', 'brand_sendiri', 'kategori']]
    cleaned_df['tanggal'] = pd.to_datetime(cleaned_df['tanggal'])
    cleaned_df.set_index('tanggal', inplace=True)
    cleaned_df.sort_index(inplace=True)
    return cleaned_df

# Clean DataFrame
# store = pd.concat([bm, sm, ukm, cvm, cvf])
# # store = bm.copy()
# store_cleaned = clean_dataframe(store)
bm_cleaned = clean_dataframe(bm)
sm_cleaned = clean_dataframe(sm)
ukm_cleaned = clean_dataframe(ukm)
cvm_cleaned = clean_dataframe(cvm)
cvf_cleaned = clean_dataframe(cvf)

In [12]:
bm_cleaned

Unnamed: 0_level_0,harga_per_unit,nama,brand_sendiri,kategori
tanggal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01,1.35,Alpro Oat Long Life Drink,False,drinks
2021-01-01,2.10,Alpro Soya Chocolate Chilled Drink,False,free-from
2021-01-01,2.70,Cawston Press Apple & Ginger Juice,False,drinks
2021-01-01,4.23,Dunn's River Nurishment Vanilla Milk,False,drinks
2021-01-01,3.20,Lipton Ice Tea Raspberry,False,drinks
...,...,...,...,...
2024-02-29,4.00,Harry Potter Paper Stationery Pouch,False,home
2024-02-29,5.00,Hinkler Animal Rock Painting Kit,False,home
2024-02-29,24.00,Hp 300 Colour Ink Cartridge Bb,False,home
2024-02-29,2.00,Egyptian Cotton Silver Grey Towel,False,home


In [13]:
def create_pivot(df):
    pivot = df.pivot_table(index=df.index, columns='kategori', values='harga_per_unit', aggfunc='mean')
    pivot.columns = [f'kategori_{col}' for col in pivot.columns]
    return pivot.fillna(0)

bm_pivot = create_pivot(bm_cleaned)
sm_pivot = create_pivot(sm_cleaned)
ukm_pivot = create_pivot(ukm_cleaned)
cvm_pivot = create_pivot(cvm_cleaned)
cvf_pivot = create_pivot(cvf_cleaned)

In [14]:
bm_pivot.rename(columns={'kategori_free-from': 'kategori_free_from'}, inplace=True)
ukm_pivot.rename(columns={'kategori_free-from': 'kategori_free_from'}, inplace=True)
cvm_pivot.rename(columns={'kategori_free-from': 'kategori_free_from'}, inplace=True)
cvf_pivot.rename(columns={'kategori_free-from': 'kategori_free_from'}, inplace=True)

In [15]:
def select_and_rename_target_column(df, target_column):
    if target_column not in df.columns:
        raise ValueError(f"Column {target_column} does not exist in the DataFrame")
    df_target = df[[target_column]].copy()
    df_target = df_target.rename(columns={target_column: 'target'})
    return df_target

bm_pivot_health = select_and_rename_target_column(bm_pivot, 'kategori_health_products')
sm_pivot_health = select_and_rename_target_column(sm_pivot, 'kategori_health_products')
ukm_pivot_health = select_and_rename_target_column(ukm_pivot, 'kategori_health_products')
cvm_pivot_health = select_and_rename_target_column(cvm_pivot, 'kategori_health_products')
cvf_pivot_health = select_and_rename_target_column(cvf_pivot, 'kategori_health_products')

# **EDA**

## **Plot**

In [16]:
dataframes = {
    'bm_pivot_health': bm_pivot_health,
    'sm_pivot_health': sm_pivot_health,
    'ukm_pivot_health': ukm_pivot_health,
    'cvm_pivot_health': cvm_pivot_health,
    'cvf_pivot_health': cvf_pivot_health,
}
fig = go.Figure()

for name, df in dataframes.items():
    fig.add_trace(go.Scatter(x=df.index, y=df['target'], mode='lines', name=name))

fig.update_layout(title='Target Values from Different Dataframes', xaxis_title='Index', yaxis_title='Target', legend_title='Dataframes')
fig.show()

## **Stationarity Check**

In [17]:
for name, df in dataframes.items():
    result = seasonal_decompose(df['target'], period=13, model='additive')
    trend = result.trend
    seasonal = result.seasonal

    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=(f"{name} Trend", f"{name} Seasonal"))

    fig.add_trace(go.Scatter(x=df.index, y=trend, mode='lines', name='Trend'), row=1, col=1)
    fig.add_trace(go.Scatter(x=df.index, y=seasonal, mode='lines', name='Seasonal'), row=2, col=1)

    fig.update_layout(height=800, title_text=f"{name} Time Series Decomposition", template='plotly_white')

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=1)

    fig.show()

## **Autocorrelation**

In [18]:
for name, df in dataframes.items():
    acf_values = acf(df['target'], nlags=60)
    lags = np.arange(len(acf_values))

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=lags,
        y=acf_values,
        name='ACF',
        marker_color='blue'
    ))
    fig.update_layout(title=f'Autocorrelation Function (ACF) for {name}',
                      xaxis_title='Lag',
                      yaxis_title='ACF Value',
                      template='plotly_white')
    fig.show()

## **AD-Fuller Test**

In [19]:
def adf_test(series, column_name, df_name):
    result = adfuller(series)
    adf_result = {
        'ADF Statistic': result[0],
        'p-value': result[1],
        '1% Critical Value': result[4]['1%'],
        '5% Critical Value': result[4]['5%'],
        '10% Critical Value': result[4]['10%'],
        'Dataframe': df_name,
        'Column': column_name
    }
    return adf_result

adf_results = []
for name, df in dataframes.items():
    df = df.sort_index()

    result = adf_test(df['target'], 'target', name)
    adf_results.append(result)

adf_results_df = pd.DataFrame(adf_results)
adf_results_df

Unnamed: 0,ADF Statistic,p-value,1% Critical Value,5% Critical Value,10% Critical Value,Dataframe,Column
0,-4.46628,0.000226,-3.436145,-2.864099,-2.568132,bm_pivot_health,target
1,-5.473482,2e-06,-3.436145,-2.864099,-2.568132,sm_pivot_health,target
2,-3.957577,0.00165,-3.436145,-2.864099,-2.568132,ukm_pivot_health,target
3,-32.399147,0.0,-3.436029,-2.864048,-2.568105,cvm_pivot_health,target
4,-3.849978,0.002435,-3.436145,-2.864099,-2.568132,cvf_pivot_health,target


# **Data Preprocessing**

## **Handle Outliers**

In [20]:
def handle_outliers(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1

    upper_lim = q3 + (1.5 * iqr)
    lower_lim = q1 - (1.5 * iqr)

    df_handled = df.loc[(df[column_name] < upper_lim) & (df[column_name] > lower_lim)]
    return df_handled, len(df), len(df_handled)

for name, df in dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len

    print(f'{name} - before: {original_len}, after: {handled_len}, outliers: {outliers_count}')

    dataframes[name] = df_handled

bm_pivot_health - before: 1155, after: 1124, outliers: 31
sm_pivot_health - before: 1155, after: 1004, outliers: 151
ukm_pivot_health - before: 1155, after: 1130, outliers: 25
cvm_pivot_health - before: 1155, after: 1091, outliers: 64
cvf_pivot_health - before: 1155, after: 1028, outliers: 127


## **Impute Empty Date With KNN**

In [21]:
def fill_missing_dates(df, fill_value=np.nan):
    df.index = pd.to_datetime(df.index)
    full_date_range = pd.date_range(start=df.index.min(), end=df.index.max())
    df = df.reindex(full_date_range, fill_value=fill_value)
    return df
imputer = KNNImputer(n_neighbors=3)

imputed_dataframes = {}
for name, df in dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)

    new_name = name.replace('pivot_health', 'pivot_imputed')
    imputed_dataframes[new_name] = df_imputed

In [22]:
imputed_dataframes['bm_pivot_imputed']

Unnamed: 0,target
2021-01-01,3.600000
2021-01-02,7.000000
2021-01-03,0.000000
2021-01-04,17.045455
2021-01-05,0.000000
...,...
2024-02-25,39.062632
2024-02-26,41.569630
2024-02-27,34.018250
2024-02-28,8.044410


## **Feature Creation**

In [23]:
def create_features(data):
    data['dayofweek']= data.index.dayofweek
    data['quarter']= data.index.quarter
    data['month']= data.index.month
    data['year']= data.index.year
    data['dayofyear']= data.index.dayofyear
    return data

def create_lag_feature(df, column_name, lags):
    for lag in lags:
        df[f'lag_{lag}'] = df[column_name].shift(lag)
    df = df.fillna(0)
    return df

def create_covid(df, time):
  df_covid = df.copy()
  df_covid['covid'] = np.where(df_covid.index <= time, 1, 0)
  return df_covid

In [24]:
features_created_dataframes = {}
lags = [15, 30, 45, 60]

for name, df in imputed_dataframes.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_imputed', 'pivot_features')
    features_created_dataframes[new_name] = df_model

In [27]:
features_created_dataframes['bm_pivot_features'].head(10)

Unnamed: 0,target,dayofweek,quarter,month,year,dayofyear,lag_15,lag_30,lag_45,lag_60,covid
2021-01-01,3.6,4,1,1,2021,1,0.0,0.0,0.0,0.0,1
2021-01-02,7.0,5,1,1,2021,2,0.0,0.0,0.0,0.0,1
2021-01-03,0.0,6,1,1,2021,3,0.0,0.0,0.0,0.0,1
2021-01-04,17.045455,0,1,1,2021,4,0.0,0.0,0.0,0.0,1
2021-01-05,0.0,1,1,1,2021,5,0.0,0.0,0.0,0.0,1
2021-01-06,2.4,2,1,1,2021,6,0.0,0.0,0.0,0.0,1
2021-01-07,61.028124,3,1,1,2021,7,0.0,0.0,0.0,0.0,1
2021-01-08,148.922376,4,1,1,2021,8,0.0,0.0,0.0,0.0,1
2021-01-09,0.0,5,1,1,2021,9,0.0,0.0,0.0,0.0,1
2021-01-10,0.02,6,1,1,2021,10,0.0,0.0,0.0,0.0,1


## **Split Train and Test**

In [28]:
test_length = 15

full_sets = {}
train_sets = {}
test_sets = {}

for name, df in features_created_dataframes.items():
    df = df.sort_index()
    train_end = len(df) - test_length

    full = df
    train = df.iloc[:train_end]
    test = df.iloc[train_end:]

    full_sets[name] = full
    train_sets[name] = train
    test_sets[name] = test

    print(f"{name} - Full set length: {len(full)}")
    print(f"{name} - Train set length: {len(train)}")
    print(f"{name} - Test set length: {len(test)}\n")

bm_pivot_features - Full set length: 1155
bm_pivot_features - Train set length: 1140
bm_pivot_features - Test set length: 15

sm_pivot_features - Full set length: 1142
sm_pivot_features - Train set length: 1127
sm_pivot_features - Test set length: 15

ukm_pivot_features - Full set length: 1155
ukm_pivot_features - Train set length: 1140
ukm_pivot_features - Test set length: 15

cvm_pivot_features - Full set length: 1155
cvm_pivot_features - Train set length: 1140
cvm_pivot_features - Test set length: 15

cvf_pivot_features - Full set length: 1155
cvf_pivot_features - Train set length: 1140
cvf_pivot_features - Test set length: 15



In [29]:
def plot_train_test(train, test, title):
    trace_train = go.Scatter(x=train.index, y=train['target'], mode='lines', name='Train')
    trace_test = go.Scatter(x=test.index, y=test['target'], mode='lines', name='Test')

    layout = go.Layout(title=title, xaxis=dict(title='Date'), yaxis=dict(title='Target'))
    fig = go.Figure(data=[trace_train, trace_test], layout=layout)
    fig.show()

for name in features_created_dataframes.keys():
    train = train_sets[name]
    test = test_sets[name]
    plot_title = f'{name} - Train, and Test Data'
    plot_train_test(train, test, plot_title)

In [30]:
X_full_sets = {}
y_full_sets = {}

X_train_sets = {}
y_train_sets = {}

X_test_sets = {}
y_test_sets = {}

for name in features_created_dataframes.keys():
    X_full = full_sets[name].drop('target', axis=1)
    y_full = full_sets[name]['target']

    X_train = train_sets[name].drop('target', axis=1)
    y_train = train_sets[name]['target']

    X_test = test_sets[name].drop('target', axis=1)
    y_test = test_sets[name]['target']

    X_full_sets[name] = X_full
    y_full_sets[name] = y_full

    X_train_sets[name] = X_train
    y_train_sets[name] = y_train

    X_test_sets[name] = X_test
    y_test_sets[name] = y_test

# **Modelling**

## **XGBRegressor**

In [31]:
# param_dist = {
#     'n_estimators': randint(1000, 10001),
#     'learning_rate': uniform(0.01, 0.1),
#     'max_depth': randint(3, 10),
#     'min_child_weight': randint(1, 6),
#     'subsample': uniform(0, 1),
#     'reg_alpha': uniform(0, 1),
#     'reg_lambda': uniform(0.1, 9.9)
# }

# xgb_model = xgb.XGBRegressor(objective='reg:linear')

# random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
#                                    n_iter=2, scoring='neg_mean_squared_error',
#                                    cv=5, verbose=100, n_jobs=-1, random_state=44)

# random_search.fit(X_full, y_full)

# print("Best parameters found: ", random_search.best_params_)
# print("Best cross-validation RMSE: ", (-random_search.best_score_) ** 0.5)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found:  {'learning_rate': 0.04937795510882514, 'max_depth': 9, 'min_child_weight': 2, 'n_estimators': 1579, 'reg_alpha': 0.20811897486840347, 'reg_lambda': 6.352058021877324, 'subsample': 0.7986246602807306}
Best cross-validation RMSE:  0.695397164596169

<!-- {
    'n_estimators': 10000,
    'learning_rate': 0.07,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'max_depth': 7,
    'min_child_weight': 1,
    'subsample': 0.9
} -->

In [32]:
reg_models = {}
reg_params = {
    'learning_rate': 0.05,
    'max_depth': 9,
    'min_child_weight': 2,
    'n_estimators': 10000,
    'reg_alpha': 0.2,
    'reg_lambda': 6.3,
    'subsample': 0.8
}

for name in features_created_dataframes.keys():
    X_train = X_train_sets[name]
    y_train = y_train_sets[name]
    X_test = X_test_sets[name]
    y_test = y_test_sets[name]

    reg = xgb.XGBRegressor(**reg_params)

    reg.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)],
            verbose=5000)

    reg_models[name] = reg

    print(f"Model trained for {name}")

[0]	validation_0-rmse:44.13087	validation_1-rmse:58.64488
[5000]	validation_0-rmse:0.01614	validation_1-rmse:54.62132
[9999]	validation_0-rmse:0.01268	validation_1-rmse:54.62214
Model trained for bm_pivot_features
[0]	validation_0-rmse:27.07063	validation_1-rmse:31.49577
[5000]	validation_0-rmse:0.01864	validation_1-rmse:31.66105
[9999]	validation_0-rmse:0.01424	validation_1-rmse:31.66131
Model trained for sm_pivot_features
[0]	validation_0-rmse:89.69124	validation_1-rmse:91.15389
[5000]	validation_0-rmse:0.01583	validation_1-rmse:83.61971
[9999]	validation_0-rmse:0.01193	validation_1-rmse:83.61942
Model trained for ukm_pivot_features
[0]	validation_0-rmse:4.17562	validation_1-rmse:4.06982
[5000]	validation_0-rmse:0.01369	validation_1-rmse:4.07586
[9999]	validation_0-rmse:0.01101	validation_1-rmse:4.07524
Model trained for cvm_pivot_features
[0]	validation_0-rmse:128.28962	validation_1-rmse:104.00960
[5000]	validation_0-rmse:0.01795	validation_1-rmse:107.62530
[9999]	validation_0-rmse:

In [33]:
reg_predictions = {}

for name in reg_models.keys():
    model = reg_models[name]

    X_test = X_test_sets[name]
    test = test_sets[name]

    reg_predicted = model.predict(X_test)
    reg_store_predicted = pd.DataFrame(reg_predicted, index=test.index, columns=['forecast'])

    reg_predictions[name] = reg_store_predicted

In [34]:
for name, df in reg_predictions.items():
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=(f"{name} Actual", f"{name} Predicted"))

    fig.add_trace(go.Scatter(x=test_sets[name].index, y=test_sets[name]['target'], mode='lines', name='Actual'), row=1, col=1)
    fig.add_trace(go.Scatter(x=reg_predictions[name].index, y=reg_predictions[name]['forecast'], mode='lines', name='Predicted'), row=1, col=1)

    fig.update_layout(height=500, title_text=f"XGB {name} Actual vs. Predicted", template='plotly_white')

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=1)

    fig.show()

In [35]:
for name, df in reg_predictions.items():
    fi = pd.DataFrame(data=reg.feature_importances_,
                      index=reg.feature_names_in_,
                      columns=['importance'])
    fi = fi.sort_values('importance')
    fig = px.bar(fi, x='importance', y=fi.index, orientation='h', title=f'{name} Feature Importance')

    fig.show()

In [36]:
def calculate_mape(actual, predicted):
    return np.mean(np.abs((actual - predicted) / actual)) * 100

results = []

for name, df in reg_predictions.items():
    actual = test_sets[name]['target']
    predicted = df['forecast']

    mape = calculate_mape(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)

    results.append({
        'Model': 'XGBRegressor',
        'Dataset': name,
        'MAE': mae,
        'MAPE': mape,
        'MSE': mse,
        'RMSE': rmse,
    })
xgb_results = pd.DataFrame(results)
xgb_average_metrics = xgb_results[['MAE', 'MAPE', 'MSE', 'RMSE']].mean().to_dict()
xgb_average_metrics['Model'] = 'XGBRegressor'
xgb_results = pd.DataFrame([xgb_average_metrics])

In [37]:
xgb_results

Unnamed: 0,MAE,MAPE,MSE,RMSE,Model
0,42.242751,41.864156,4515.618191,56.320732,XGBRegressor


## **HistGradientBoosting**

In [38]:
# param_dist = {
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'max_iter': randint(100, 10001),
#     'max_depth': randint(3, 20),
#     'min_samples_leaf': randint(1, 10),
#     'max_bins': randint(2, 255),
#     'l2_regularization': [0, 1e-1, 1],
#     'loss': ['least_squares', 'least_absolute_deviation', 'poisson']
# }

# hgb_model = HistGradientBoostingRegressor()

# random_search_hgb = RandomizedSearchCV(estimator=hgb_model, param_distributions=param_dist,
#                                    n_iter=2, scoring='neg_mean_squared_error',
#                                    cv=5, verbose=1, n_jobs=-1, random_state=42)

# random_search_hgb.fit(X_full, y_full)

# # Print the best parameters and the corresponding score
# print("Best parameters found: ", random_search_hgb.best_params_)
# print("Best cross-validation RMSE: ", (-random_search_hgb.best_score_) ** 0.5)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found:  {'l2_regularization': 1, 'learning_rate': 0.05, 'loss': 'poisson', 'max_bins': 216, 'max_depth': 13, 'max_iter': 5678, 'min_samples_leaf': 8}
Best cross-validation RMSE:  19.006219810105655

In [39]:
models = {}
hgb_params = {
        'max_iter':5000,
        'l2_regularization':1,
        'learning_rate':0.05,
        'loss':'poisson',
        'max_bins':216,
        'max_depth':13,
        'min_samples_leaf':8,
        'verbose':1
}

for name in features_created_dataframes.keys():
    X_train = X_train_sets[name]
    y_train = y_train_sets[name]
    X_test = X_test_sets[name]
    y_test = y_test_sets[name]

    hgb = HistGradientBoostingRegressor(**hgb_params)

    hgb.fit(X_train, y_train)

    models[name] = hgb

    print(f"Model trained for {name}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[7/5000] 1 tree, 31 leaves, max depth = 13, in 0.008s
[8/5000] 1 tree, 31 leaves, max depth = 9, in 0.021s
[9/5000] 1 tree, 31 leaves, max depth = 10, in 0.009s
[10/5000] 1 tree, 31 leaves, max depth = 13, in 0.015s
[11/5000] 1 tree, 31 leaves, max depth = 10, in 0.017s
[12/5000] 1 tree, 31 leaves, max depth = 10, in 0.006s
[13/5000] 1 tree, 31 leaves, max depth = 13, in 0.008s
[14/5000] 1 tree, 31 leaves, max depth = 11, in 0.031s
[15/5000] 1 tree, 31 leaves, max depth = 10, in 0.017s
[16/5000] 1 tree, 31 leaves, max depth = 10, in 0.026s
[17/5000] 1 tree, 31 leaves, max depth = 12, in 0.010s
[18/5000] 1 tree, 31 leaves, max depth = 9, in 0.031s
[19/5000] 1 tree, 31 leaves, max depth = 10, in 0.017s
[20/5000] 1 tree, 31 leaves, max depth = 9, in 0.487s
[21/5000] 1 tree, 31 leaves, max depth = 10, in 0.005s
[22/5000] 1 tree, 31 leaves, max depth = 11, in 0.020s
[23/5000] 1 tree, 31 leaves, max depth = 12, in 0.012s
[24/50

In [40]:
hgb_predictions = {}

for name in models.keys():
    model = models[name]

    X_test = X_test_sets[name]
    test = test_sets[name]

    hgb_predicted = model.predict(X_test)
    hgb_store_predicted = pd.DataFrame(hgb_predicted, index=test.index, columns=['forecast'])

    hgb_predictions[name] = hgb_store_predicted

In [41]:
for name, df in hgb_predictions.items():
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=(f"{name} Actual", f"{name} Predicted"))

    fig.add_trace(go.Scatter(x=test_sets[name].index, y=test_sets[name]['target'], mode='lines', name='Actual'), row=1, col=1)
    fig.add_trace(go.Scatter(x=hgb_predictions[name].index, y=hgb_predictions[name]['forecast'], mode='lines', name='Predicted'), row=1, col=1)

    fig.update_layout(height=500, title_text=f"HistGradient {name} Actual vs. Predicted", template='plotly_white')

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=1)

    fig.show()

In [42]:
hgb_results = []

for name, df in hgb_predictions.items():
    actual = test_sets[name]['target']
    predicted = df['forecast']

    mape = calculate_mape(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)

    hgb_results.append({
        'Model': 'HistGradientBoost',
        'Dataset': name,
        'MAE': mae,
        'MAPE': mape,
        'MSE': mse,
        'RMSE': rmse,
    })
hgb_results_df = pd.DataFrame(hgb_results)
hgb_average_metrics = hgb_results_df[['MAE', 'MAPE', 'MSE', 'RMSE']].mean().to_dict()
hgb_average_metrics['Model'] = 'HistGradientBoost'
hgb_results = pd.DataFrame([hgb_average_metrics])

In [43]:
hgb_results

Unnamed: 0,MAE,MAPE,MSE,RMSE,Model
0,50.379407,39.588843,6832.447496,67.582202,HistGradientBoost


## **LightGBM**

In [44]:
# param_dist = {
#     'learning_rate': uniform(0.01, 0.5),
#     'n_estimators': randint(100, 1000),
#     'num_leaves': randint(3, 20),
#     'min_data_in_leaf': randint(1, 20),
#     'max_depth': randint(3, 10),
#     'min_split_gain': uniform(0, 1),
#     'feature_fraction': uniform(0.6, 1),
#     'bagging_fraction': uniform(0.6, 1),
#     'bagging_freq': randint(1, 5),
#     'lambda_l1': uniform(0, 1),
#     'lambda_l2': uniform(0, 1)
# }

# lgbm_model = lgb.LGBMRegressor()

# random_search_lgbm = RandomizedSearchCV(estimator=lgbm_model, param_distributions=param_dist,
#                                          n_iter=2, scoring='neg_mean_squared_error',  # Minimize MSE
#                                          cv=5, verbose=1, n_jobs=-1, random_state=42)

# random_search_lgbm.fit(X_full, y_full)

# print("Best parameters found: ", random_search_lgbm.best_params_)
# print("Best cross-validation RMSE: ", (-random_search_lgbm.best_score_) ** 0.5)

Best parameters found:  {'bagging_fraction': 0.7428668179219408, 'bagging_freq': 3, 'feature_fraction': 0.6205844942958024, 'lambda_l1': 0.9699098521619943, 'lambda_l2': 0.8324426408004217, 'learning_rate': 0.11616955533913807, 'max_depth': 6, 'min_data_in_leaf': 1, 'min_split_gain': 0.3042422429595377, 'n_estimators': 121, 'num_leaves': 14}
Best cross-validation RMSE:  19.218964602514756

In [45]:
lgbm_models = {}
lgbm_params = {
        'bagging_fraction': 0.7,
        'bagging_freq': 3,
        'feature_fraction': 0.6,
        'lambda_l1': 0.9,
        'lambda_l2': 0.8,
        'learning_rate': 0.1,
        'max_depth': 6,
        'min_data_in_leaf': 1,
        'min_split_gain': 0.3,
        'n_estimators': 121,
        'num_leaves': 14
}

for name in features_created_dataframes.keys():
    X_train = X_train_sets[name]
    y_train = y_train_sets[name]
    X_test = X_test_sets[name]
    y_test = y_test_sets[name]

    lgbm = lgb.LGBMRegressor(**lgbm_params)

    lgbm.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_test, y_test)])

    lgbm_models[name] = lgbm

    print(f"Model trained for {name}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000438 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 1140, number of used features: 10
[LightGBM] [Info] Start training from score 45.880759
Model trained for bm_pivot_features
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the train set: 1127, number of used features: 10
[LightGBM] [Info] Start training from score 117.515549
Model trained for sm_pivot_features
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1307
[LightGBM] [Info] Number of data points in the 

In [46]:
lgbm_predictions = {}

for name in lgbm_models.keys():
    model = lgbm_models[name]

    X_test = X_test_sets[name]
    test = test_sets[name]

    lgbm_predicted = model.predict(X_test)
    lgbm_store_predicted = pd.DataFrame(lgbm_predicted, index=test.index, columns=['forecast'])

    lgbm_predictions[name] = lgbm_store_predicted



In [47]:
for name, df in lgbm_predictions.items():
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=(f"{name} Actual", f"{name} Predicted"))

    fig.add_trace(go.Scatter(x=test_sets[name].index, y=test_sets[name]['target'], mode='lines', name='Actual'), row=1, col=1)
    fig.add_trace(go.Scatter(x=lgbm_predictions[name].index, y=lgbm_predictions[name]['forecast'], mode='lines', name='Predicted'), row=1, col=1)

    fig.update_layout(height=500, title_text=f"LGBM {name} Actual vs. Predicted", template='plotly_white')

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=1)

    fig.show()

In [48]:
lgbm_results = []

for name, df in lgbm_predictions.items():
    actual = test_sets[name]['target']
    predicted = df['forecast']

    mape = calculate_mape(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)

    lgbm_results.append({
        'Model': 'LightGBM',
        'Dataset': name,
        'MAE': mae,
        'MAPE': mape,
        'MSE': mse,
        'RMSE': rmse,
    })
lgbm_results_df = pd.DataFrame(lgbm_results)
lgbm_average_metrics = lgbm_results_df[['MAE', 'MAPE', 'MSE', 'RMSE']].mean().to_dict()
lgbm_average_metrics['Model'] = 'LightGBM'
lgbm_results = pd.DataFrame([lgbm_average_metrics])

In [49]:
lgbm_results

Unnamed: 0,MAE,MAPE,MSE,RMSE,Model
0,45.977778,40.741373,4748.450729,57.489708,LightGBM


## **RandomForest**

In [50]:
# param_dist = {
#     'n_estimators': randint(100, 10001),
#     'max_depth': randint(3, 20),
#     'min_samples_split': randint(2, 20),
#     'min_samples_leaf': randint(1, 10),
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'bootstrap': [True, False]
# }

# rfr_model = RandomForestRegressor()

# random_search_rfr = RandomizedSearchCV(estimator=rfr_model, param_distributions=param_dist,
#                                    n_iter=2, scoring='neg_mean_squared_error',
#                                    cv=5, verbose=1, n_jobs=-1, random_state=42)

# random_search_rfr.fit(X_full, y_full)

# # Print the best parameters and the corresponding score
# print("Best parameters found: ", random_search_rfr.best_params_)
# print("Best cross-validation RMSE: ", (-random_search_rfr.best_score_) ** 0.5)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found:  {
    'bootstrap': True,
    'max_depth': 13,
    'max_features': 'log2',
    'min_samples_leaf': 8,
    'min_samples_split': 5,
    'n_estimators': 8422}
Best cross-validation RMSE:  0.6512033302030092

In [51]:
rfr_models = {}
rfr_params = {
    'bootstrap': True,
    'max_depth': 13,
    'max_features': 'log2',
    'min_samples_leaf': 8,
    'min_samples_split': 5,
    'n_estimators': 8422
}

for name in features_created_dataframes.keys():
    X_train = X_train_sets[name]
    y_train = y_train_sets[name]
    X_test = X_test_sets[name]
    y_test = y_test_sets[name]

    rfr = RandomForestRegressor(**rfr_params)

    rfr.fit(X_train, y_train)

    rfr_models[name] = rfr

    print(f"Model trained for {name}")

Model trained for bm_pivot_features
Model trained for sm_pivot_features
Model trained for ukm_pivot_features
Model trained for cvm_pivot_features
Model trained for cvf_pivot_features


In [52]:
rfr_predictions = {}

for name in rfr_models.keys():
    model = rfr_models[name]

    X_test = X_test_sets[name]
    test = test_sets[name]

    rfr_predicted = model.predict(X_test)
    rfr_store_predicted = pd.DataFrame(rfr_predicted, index=test.index, columns=['forecast'])

    rfr_predictions[name] = rfr_store_predicted

In [53]:
for name, df in rfr_predictions.items():
    fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.1,
                        subplot_titles=(f"{name} Actual", f"{name} Predicted"))

    fig.add_trace(go.Scatter(x=test_sets[name].index, y=test_sets[name]['target'], mode='lines', name='Actual'), row=1, col=1)
    fig.add_trace(go.Scatter(x=rfr_predictions[name].index, y=rfr_predictions[name]['forecast'], mode='lines', name='Predicted'), row=1, col=1)

    fig.update_layout(height=500, title_text=f"LGBM {name} Actual vs. Predicted", template='plotly_white')

    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Value", row=1, col=1)
    fig.update_yaxes(title_text="Value", row=2, col=1)

    fig.show()

In [54]:
for name, df in rfr_predictions.items():
    fi = pd.DataFrame(data=rfr.feature_importances_,
                      index=rfr.feature_names_in_,
                      columns=['importance'])
    fi = fi.sort_values('importance')
    fig = px.bar(fi, x='importance', y=fi.index, orientation='h', title=f'{name} Feature Importance')

    fig.show()

In [55]:
rfr_results = []

for name, df in rfr_predictions.items():
    actual = test_sets[name]['target']
    predicted = df['forecast']

    mape = calculate_mape(actual, predicted)
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)

    rfr_results.append({
        'Model': 'RandomForestRegressor',
        'Dataset': name,
        'MAE': mae,
        'MAPE': mape,
        'MSE': mse,
        'RMSE': rmse,
    })
rfr_results_df = pd.DataFrame(rfr_results)
rfr_average_metrics = rfr_results_df[['MAE', 'MAPE', 'MSE', 'RMSE']].mean().to_dict()
rfr_average_metrics['Model'] = 'RandomForestRegressor'
rfr_results = pd.DataFrame([rfr_average_metrics])

In [56]:
rfr_results

Unnamed: 0,MAE,MAPE,MSE,RMSE,Model
0,43.426075,43.395131,4544.183519,55.836629,RandomForestRegressor


## **Models Evaluation**

In [57]:
results = pd.concat([xgb_results, hgb_results, lgbm_results, rfr_results])

In [58]:
results

Unnamed: 0,MAE,MAPE,MSE,RMSE,Model
0,42.242751,41.864156,4515.618191,56.320732,XGBRegressor
0,50.379407,39.588843,6832.447496,67.582202,HistGradientBoost
0,45.977778,40.741373,4748.450729,57.489708,LightGBM
0,43.426075,43.395131,4544.183519,55.836629,RandomForestRegressor


According to the average results, XGBRegressor came out as the best performing model. Therefore it will be selected as the main model.

# **Selected Model: XGBRegressor**

## **Repeat All Steps To All Kategori**

- kategori_health_products
- kategori_baby_products
- kategori_bakery
- kategori_drinks
- kategori_food_cupboard
- kategori_free_from
- kategori_fresh_food
- kategori_frozen
- kategori_home
- kategori_household
- kategori_pets

In [59]:
categories = {
    'health': 'kategori_health_products',
    'baby': 'kategori_baby_products',
    'bakery': 'kategori_bakery',
    'drinks': 'kategori_drinks',
    'food': 'kategori_food_cupboard',
    'free': 'kategori_free_from',
    'fresh': 'kategori_fresh_food',
    'frozen': 'kategori_frozen',
    'home': 'kategori_home',
    'household': 'kategori_household',
    'pets': 'kategori_pets'
}

dfs = {
    'bm': bm_pivot,
    'sm': sm_pivot,
    'ukm': ukm_pivot,
    'cvm': cvm_pivot,
    'cvf': cvf_pivot
}

health_dataframes = {}
baby_dataframes = {}
bakery_dataframes = {}
drinks_dataframes = {}
food_dataframes = {}
free_dataframes = {}
fresh_dataframes = {}
frozen_dataframes = {}
home_dataframes = {}
household_dataframes = {}
pets_dataframes = {}

category_dataframes = {
    'health': health_dataframes,
    'baby': baby_dataframes,
    'bakery': bakery_dataframes,
    'drinks': drinks_dataframes,
    'food': food_dataframes,
    'free': free_dataframes,
    'fresh': fresh_dataframes,
    'frozen': frozen_dataframes,
    'home': home_dataframes,
    'household': household_dataframes,
    'pets': pets_dataframes
}

for category, column_name in categories.items():
    for df_name, df in dfs.items():
        if (category == 'home' and df_name == 'cvm') or (category == 'free' and df_name == 'sm'):
            continue
        selected_df = select_and_rename_target_column(df, column_name)
        category_dataframes[category][f'{df_name}_pivot_{category}'] = selected_df

In [60]:
for name, df in health_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    health_dataframes[name] = df_handled

for name, df in baby_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    baby_dataframes[name] = df_handled

for name, df in bakery_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    bakery_dataframes[name] = df_handled

for name, df in drinks_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    drinks_dataframes[name] = df_handled

for name, df in food_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    food_dataframes[name] = df_handled

for name, df in free_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    free_dataframes[name] = df_handled

for name, df in fresh_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    fresh_dataframes[name] = df_handled

for name, df in frozen_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    frozen_dataframes[name] = df_handled

for name, df in home_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    home_dataframes[name] = df_handled

for name, df in household_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    household_dataframes[name] = df_handled

for name, df in pets_dataframes.items():
    df_handled, original_len, handled_len = handle_outliers(df, 'target')
    outliers_count = original_len - handled_len
    pets_dataframes[name] = df_handled

In [61]:
health_imputed = {}
for name, df in health_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_health', 'pivot_health_imputed')
    health_imputed[new_name] = df_imputed

baby_imputed = {}
for name, df in baby_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_baby', 'pivot_baby_imputed')
    baby_imputed[new_name] = df_imputed

bakery_imputed = {}
for name, df in bakery_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_bakery', 'pivot_bakery_imputed')
    bakery_imputed[new_name] = df_imputed

drinks_imputed = {}
for name, df in drinks_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_drinks', 'pivot_drinks_imputed')
    drinks_imputed[new_name] = df_imputed

food_imputed = {}
for name, df in food_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_food', 'pivot_food_imputed')
    food_imputed[new_name] = df_imputed

free_imputed = {}
for name, df in free_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_free', 'pivot_free_imputed')
    free_imputed[new_name] = df_imputed

fresh_imputed = {}
for name, df in fresh_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_fresh', 'pivot_fresh_imputed')
    fresh_imputed[new_name] = df_imputed

frozen_imputed = {}
for name, df in frozen_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_frozen', 'pivot_frozen_imputed')
    frozen_imputed[new_name] = df_imputed

home_imputed = {}
for name, df in home_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_home', 'pivot_home_imputed')
    home_imputed[new_name] = df_imputed

household_imputed = {}
for name, df in household_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_household', 'pivot_household_imputed')
    household_imputed[new_name] = df_imputed

pets_imputed = {}
for name, df in pets_dataframes.items():
    df_filled = fill_missing_dates(df)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_filled), columns=df_filled.columns, index=df_filled.index)
    new_name = name.replace('pivot_pets', 'pivot_pets_imputed')
    pets_imputed[new_name] = df_imputed

In [62]:
health_features_created = {}
for name, df in health_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_health_imputed', 'pivot_health_features')
    health_features_created[new_name] = df_model

baby_features_created = {}
for name, df in baby_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_baby_imputed', 'pivot_baby_features')
    baby_features_created[new_name] = df_model

bakery_features_created = {}
for name, df in bakery_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_bakery_imputed', 'pivot_bakery_features')
    bakery_features_created[new_name] = df_model

drinks_features_created = {}
for name, df in drinks_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_drinks_imputed', 'pivot_drinks_features')
    drinks_features_created[new_name] = df_model

food_features_created = {}
for name, df in food_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_food_imputed', 'pivot_food_features')
    food_features_created[new_name] = df_model

free_features_created = {}
for name, df in free_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_free_imputed', 'pivot_free_features')
    free_features_created[new_name] = df_model

fresh_features_created = {}
for name, df in fresh_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_fresh_imputed', 'pivot_fresh_features')
    fresh_features_created[new_name] = df_model

frozen_features_created = {}
for name, df in frozen_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_frozen_imputed', 'pivot_frozen_features')
    frozen_features_created[new_name] = df_model

home_features_created = {}
for name, df in home_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_home_imputed', 'pivot_home_features')
    home_features_created[new_name] = df_model

household_features_created = {}
for name, df in household_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_house_imputed', 'pivot_house_features')
    household_features_created[new_name] = df_model

pets_features_created = {}
for name, df in pets_imputed.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    new_name = name.replace('pivot_pets_imputed', 'pivot_pets_features')
    pets_features_created[new_name] = df_model

In [63]:
full_health_sets = {}
X_full_health_sets = {}
y_full_health_sets = {}
for name, df in health_features_created.items():
    df = df.sort_index()
    full_health = df
    full_health_sets[name] = full_health
for name in health_features_created.keys():
    X_full_health = full_health_sets[name].drop('target', axis=1)
    y_full_health = full_health_sets[name]['target']
    X_full_health_sets[name] = X_full_health
    y_full_health_sets[name] = y_full_health

full_baby_sets = {}
X_full_baby_sets = {}
y_full_baby_sets = {}
for name, df in baby_features_created.items():
    df = df.sort_index()
    full_baby = df
    full_baby_sets[name] = full_baby
for name in baby_features_created.keys():
    X_full_baby = full_baby_sets[name].drop('target', axis=1)
    y_full_baby = full_baby_sets[name]['target']
    X_full_baby_sets[name] = X_full_baby
    y_full_baby_sets[name] = y_full_baby

full_bakery_sets = {}
X_full_bakery_sets = {}
y_full_bakery_sets = {}
for name, df in bakery_features_created.items():
    df = df.sort_index()
    full_bakery = df
    full_bakery_sets[name] = full_bakery
for name in bakery_features_created.keys():
    X_full_bakery = full_bakery_sets[name].drop('target', axis=1)
    y_full_bakery = full_bakery_sets[name]['target']
    X_full_bakery_sets[name] = X_full_bakery
    y_full_bakery_sets[name] = y_full_bakery

full_drinks_sets = {}
X_full_drinks_sets = {}
y_full_drinks_sets = {}
for name, df in drinks_features_created.items():
    df = df.sort_index()
    full_drinks = df
    full_drinks_sets[name] = full_drinks
for name in drinks_features_created.keys():
    X_full_drinks = full_drinks_sets[name].drop('target', axis=1)
    y_full_drinks = full_drinks_sets[name]['target']
    X_full_drinks_sets[name] = X_full_drinks
    y_full_drinks_sets[name] = y_full_drinks

full_food_sets = {}
X_full_food_sets = {}
y_full_food_sets = {}
for name, df in food_features_created.items():
    df = df.sort_index()
    full_food = df
    full_food_sets[name] = full_food
for name in food_features_created.keys():
    X_full_food = full_food_sets[name].drop('target', axis=1)
    y_full_food = full_food_sets[name]['target']
    X_full_food_sets[name] = X_full_food
    y_full_food_sets[name] = y_full_food

full_free_sets = {}
X_full_free_sets = {}
y_full_free_sets = {}
for name, df in free_features_created.items():
    df = df.sort_index()
    full_free = df
    full_free_sets[name] = full_free
for name in free_features_created.keys():
    X_full_free = full_free_sets[name].drop('target', axis=1)
    y_full_free = full_free_sets[name]['target']
    X_full_free_sets[name] = X_full_free
    y_full_free_sets[name] = y_full_free

full_fresh_sets = {}
X_full_fresh_sets = {}
y_full_fresh_sets = {}
for name, df in fresh_features_created.items():
    df = df.sort_index()
    full_fresh = df
    full_fresh_sets[name] = full_fresh
for name in fresh_features_created.keys():
    X_full_fresh = full_fresh_sets[name].drop('target', axis=1)
    y_full_fresh = full_fresh_sets[name]['target']
    X_full_fresh_sets[name] = X_full_fresh
    y_full_fresh_sets[name] = y_full_fresh

full_frozen_sets = {}
X_full_frozen_sets = {}
y_full_frozen_sets = {}
for name, df in frozen_features_created.items():
    df = df.sort_index()
    full_frozen = df
    full_frozen_sets[name] = full_frozen
for name in frozen_features_created.keys():
    X_full_frozen = full_frozen_sets[name].drop('target', axis=1)
    y_full_frozen = full_frozen_sets[name]['target']
    X_full_frozen_sets[name] = X_full_frozen
    y_full_frozen_sets[name] = y_full_frozen

full_home_sets = {}
X_full_home_sets = {}
y_full_home_sets = {}
for name, df in home_features_created.items():
    df = df.sort_index()
    full_home = df
    full_home_sets[name] = full_home
for name in home_features_created.keys():
    X_full_home = full_home_sets[name].drop('target', axis=1)
    y_full_home = full_home_sets[name]['target']
    X_full_home_sets[name] = X_full_home
    y_full_home_sets[name] = y_full_home

full_household_sets = {}
X_full_household_sets = {}
y_full_household_sets = {}
for name, df in household_features_created.items():
    df = df.sort_index()
    full_household = df
    full_household_sets[name] = full_household
for name in household_features_created.keys():
    X_full_household = full_household_sets[name].drop('target', axis=1)
    y_full_household = full_household_sets[name]['target']
    X_full_household_sets[name] = X_full_household
    y_full_household_sets[name] = y_full_household

full_pets_sets = {}
X_full_pets_sets = {}
y_full_pets_sets = {}
for name, df in pets_features_created.items():
    df = df.sort_index()
    full_pets = df
    full_pets_sets[name] = full_pets
for name in pets_features_created.keys():
    X_full_pets = full_pets_sets[name].drop('target', axis=1)
    y_full_pets = full_pets_sets[name]['target']
    X_full_pets_sets[name] = X_full_pets
    y_full_pets_sets[name] = y_full_pets

## **Train Full Data**

In [64]:
health_models = {}
health_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_health_sets.keys():
    X_full_health = X_full_health_sets[name]
    y_full_health = y_full_health_sets[name]

    health = xgb.XGBRegressor(**health_params)

    health.fit(X_full_health, y_full_health,
               eval_set=[(X_full_health, y_full_health)],
               verbose=100)

    health_models[name] = health

    print(f"Model trained for {name}")

[0]	validation_0-rmse:43.94317
[100]	validation_0-rmse:16.85307
[149]	validation_0-rmse:12.50487
Model trained for bm_pivot_health_features
[0]	validation_0-rmse:27.05792
[100]	validation_0-rmse:14.86573
[149]	validation_0-rmse:11.64522
Model trained for sm_pivot_health_features
[0]	validation_0-rmse:89.08796
[100]	validation_0-rmse:40.17744
[149]	validation_0-rmse:30.25302
Model trained for ukm_pivot_health_features
[0]	validation_0-rmse:4.17325
[100]	validation_0-rmse:2.01035
[149]	validation_0-rmse:1.51371
Model trained for cvm_pivot_health_features
[0]	validation_0-rmse:126.67994
[100]	validation_0-rmse:51.87245
[149]	validation_0-rmse:37.99335
Model trained for cvf_pivot_health_features


In [65]:
baby_models = {}
baby_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_baby_sets.keys():
    X_full_baby = X_full_baby_sets[name]
    y_full_baby = y_full_baby_sets[name]

    baby = xgb.XGBRegressor(**baby_params)

    baby.fit(X_full_baby, y_full_baby,
               eval_set=[(X_full_baby, y_full_baby)],
               verbose=100)

    baby_models[name] = baby

    print(f"Model trained for {name}")

[0]	validation_0-rmse:12.18779
[100]	validation_0-rmse:5.01613
[149]	validation_0-rmse:4.06101
Model trained for bm_pivot_baby_features
[0]	validation_0-rmse:6.00909
[100]	validation_0-rmse:2.81898
[149]	validation_0-rmse:2.07992
Model trained for sm_pivot_baby_features
[0]	validation_0-rmse:6.37417
[100]	validation_0-rmse:2.60733
[149]	validation_0-rmse:1.92641
Model trained for ukm_pivot_baby_features
[0]	validation_0-rmse:3.20475
[100]	validation_0-rmse:1.58818
[149]	validation_0-rmse:1.18374
Model trained for cvm_pivot_baby_features
[0]	validation_0-rmse:6.82009
[100]	validation_0-rmse:2.69258
[149]	validation_0-rmse:2.08496
Model trained for cvf_pivot_baby_features


In [66]:
bakery_models = {}
bakery_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_bakery_sets.keys():
    X_full_bakery = X_full_bakery_sets[name]
    y_full_bakery = y_full_bakery_sets[name]

    bakery = xgb.XGBRegressor(**bakery_params)

    bakery.fit(X_full_bakery, y_full_bakery,
               eval_set=[(X_full_bakery, y_full_bakery)],
               verbose=100)

    bakery_models[name] = bakery

    print(f"Model trained for {name}")

[0]	validation_0-rmse:1.45297
[100]	validation_0-rmse:0.75055
[149]	validation_0-rmse:0.61973
Model trained for bm_pivot_bakery_features
[0]	validation_0-rmse:1.09888
[100]	validation_0-rmse:0.55465
[149]	validation_0-rmse:0.43455
Model trained for sm_pivot_bakery_features
[0]	validation_0-rmse:1.06561
[100]	validation_0-rmse:0.55332
[149]	validation_0-rmse:0.43838
Model trained for ukm_pivot_bakery_features
[0]	validation_0-rmse:0.80828
[100]	validation_0-rmse:0.41539
[149]	validation_0-rmse:0.30511
Model trained for cvm_pivot_bakery_features
[0]	validation_0-rmse:1.02009
[100]	validation_0-rmse:0.51295
[149]	validation_0-rmse:0.40107
Model trained for cvf_pivot_bakery_features


In [67]:
drinks_models = {}
drinks_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_drinks_sets.keys():
    X_full_drinks = X_full_drinks_sets[name]
    y_full_drinks = y_full_drinks_sets[name]

    drinks = xgb.XGBRegressor(**drinks_params)

    drinks.fit(X_full_drinks, y_full_drinks,
               eval_set=[(X_full_drinks, y_full_drinks)],
               verbose=100)

    drinks_models[name] = drinks

    print(f"Model trained for {name}")

[0]	validation_0-rmse:5.27711
[100]	validation_0-rmse:2.18771
[149]	validation_0-rmse:1.62045
Model trained for bm_pivot_drinks_features
[0]	validation_0-rmse:2.55081
[100]	validation_0-rmse:1.31028
[149]	validation_0-rmse:1.01257
Model trained for sm_pivot_drinks_features
[0]	validation_0-rmse:1.61491
[100]	validation_0-rmse:0.86947
[149]	validation_0-rmse:0.68259
Model trained for ukm_pivot_drinks_features
[0]	validation_0-rmse:1.44345
[100]	validation_0-rmse:0.70334
[149]	validation_0-rmse:0.53812
Model trained for cvm_pivot_drinks_features
[0]	validation_0-rmse:7.13773
[100]	validation_0-rmse:3.41201
[149]	validation_0-rmse:2.61956
Model trained for cvf_pivot_drinks_features


In [68]:
food_models = {}
food_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_food_sets.keys():
    X_full_food = X_full_food_sets[name]
    y_full_food = y_full_food_sets[name]

    food = xgb.XGBRegressor(**food_params)

    food.fit(X_full_food, y_full_food,
               eval_set=[(X_full_food, y_full_food)],
               verbose=100)

    food_models[name] = food

    print(f"Model trained for {name}")

[0]	validation_0-rmse:5.56386
[100]	validation_0-rmse:2.37812
[149]	validation_0-rmse:1.80556
Model trained for bm_pivot_food_features
[0]	validation_0-rmse:3.91211
[100]	validation_0-rmse:1.82059
[149]	validation_0-rmse:1.33196
Model trained for sm_pivot_food_features
[0]	validation_0-rmse:1.78280
[100]	validation_0-rmse:1.04473
[149]	validation_0-rmse:0.83536
Model trained for ukm_pivot_food_features
[0]	validation_0-rmse:0.96429
[100]	validation_0-rmse:0.47886
[149]	validation_0-rmse:0.36681
Model trained for cvm_pivot_food_features
[0]	validation_0-rmse:2.08958
[100]	validation_0-rmse:1.05283
[149]	validation_0-rmse:0.82252
Model trained for cvf_pivot_food_features


In [69]:
free_models = {}
free_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_free_sets.keys():
    X_full_free = X_full_free_sets[name]
    y_full_free = y_full_free_sets[name]

    free = xgb.XGBRegressor(**free_params)

    free.fit(X_full_free, y_full_free,
               eval_set=[(X_full_free, y_full_free)],
               verbose=100)

    free_models[name] = free

    print(f"Model trained for {name}")

[0]	validation_0-rmse:5.83683
[100]	validation_0-rmse:2.46543
[149]	validation_0-rmse:1.92225
Model trained for bm_pivot_free_features
[0]	validation_0-rmse:4.33811
[100]	validation_0-rmse:2.15585
[149]	validation_0-rmse:1.65884
Model trained for ukm_pivot_free_features
[0]	validation_0-rmse:4.65086
[100]	validation_0-rmse:2.30429
[149]	validation_0-rmse:1.70859
Model trained for cvm_pivot_free_features
[0]	validation_0-rmse:2.08104
[100]	validation_0-rmse:1.03275
[149]	validation_0-rmse:0.80718
Model trained for cvf_pivot_free_features


In [70]:
fresh_models = {}
fresh_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_fresh_sets.keys():
    X_full_fresh = X_full_fresh_sets[name]
    y_full_fresh = y_full_fresh_sets[name]

    fresh = xgb.XGBRegressor(**fresh_params)

    fresh.fit(X_full_fresh, y_full_fresh,
               eval_set=[(X_full_fresh, y_full_fresh)],
               verbose=100)

    fresh_models[name] = fresh

    print(f"Model trained for {name}")

[0]	validation_0-rmse:4.02851
[100]	validation_0-rmse:1.53814
[149]	validation_0-rmse:1.15073
Model trained for bm_pivot_fresh_features
[0]	validation_0-rmse:0.65287
[100]	validation_0-rmse:0.34456
[149]	validation_0-rmse:0.27425
Model trained for sm_pivot_fresh_features
[0]	validation_0-rmse:0.79608
[100]	validation_0-rmse:0.45552
[149]	validation_0-rmse:0.36304
Model trained for ukm_pivot_fresh_features
[0]	validation_0-rmse:0.71619
[100]	validation_0-rmse:0.37527
[149]	validation_0-rmse:0.29636
Model trained for cvm_pivot_fresh_features
[0]	validation_0-rmse:1.15132
[100]	validation_0-rmse:0.59656
[149]	validation_0-rmse:0.46752
Model trained for cvf_pivot_fresh_features


In [71]:
frozen_models = {}
frozen_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_frozen_sets.keys():
    X_full_frozen = X_full_frozen_sets[name]
    y_full_frozen = y_full_frozen_sets[name]

    frozen = xgb.XGBRegressor(**frozen_params)

    frozen.fit(X_full_frozen, y_full_frozen,
               eval_set=[(X_full_frozen, y_full_frozen)],
               verbose=100)

    frozen_models[name] = frozen

    print(f"Model trained for {name}")

[0]	validation_0-rmse:4.00530
[100]	validation_0-rmse:1.59245
[149]	validation_0-rmse:1.22788
Model trained for bm_pivot_frozen_features
[0]	validation_0-rmse:0.79020
[100]	validation_0-rmse:0.43460
[149]	validation_0-rmse:0.33488
Model trained for sm_pivot_frozen_features
[0]	validation_0-rmse:0.94177
[100]	validation_0-rmse:0.48144
[149]	validation_0-rmse:0.37086
Model trained for ukm_pivot_frozen_features
[0]	validation_0-rmse:0.70010
[100]	validation_0-rmse:0.34651
[149]	validation_0-rmse:0.26531
Model trained for cvm_pivot_frozen_features
[0]	validation_0-rmse:0.89938
[100]	validation_0-rmse:0.48460
[149]	validation_0-rmse:0.38626
Model trained for cvf_pivot_frozen_features


In [72]:
home_models = {}
home_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_home_sets.keys():
    X_full_home = X_full_home_sets[name]
    y_full_home = y_full_home_sets[name]

    home = xgb.XGBRegressor(**home_params)

    home.fit(X_full_home, y_full_home,
               eval_set=[(X_full_home, y_full_home)],
               verbose=100)

    home_models[name] = home

    print(f"Model trained for {name}")

[0]	validation_0-rmse:4.45709
[100]	validation_0-rmse:2.13101
[149]	validation_0-rmse:1.53215
Model trained for bm_pivot_home_features
[0]	validation_0-rmse:12.44778
[100]	validation_0-rmse:5.76104
[149]	validation_0-rmse:4.38234
Model trained for sm_pivot_home_features
[0]	validation_0-rmse:3.11072
[100]	validation_0-rmse:1.68455
[149]	validation_0-rmse:1.24927
Model trained for ukm_pivot_home_features
[0]	validation_0-rmse:7.82847
[100]	validation_0-rmse:3.50597
[149]	validation_0-rmse:2.66042
Model trained for cvf_pivot_home_features


In [73]:
household_models = {}
household_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_household_sets.keys():
    X_full_household = X_full_household_sets[name]
    y_full_household = y_full_household_sets[name]

    household = xgb.XGBRegressor(**household_params)

    household.fit(X_full_household, y_full_household,
               eval_set=[(X_full_household, y_full_household)],
               verbose=100)

    household_models[name] = household

    print(f"Model trained for {name}")

[0]	validation_0-rmse:13.05192
[100]	validation_0-rmse:5.25959
[149]	validation_0-rmse:3.99048
Model trained for bm_pivot_household_imputed
[0]	validation_0-rmse:12.38166
[100]	validation_0-rmse:5.32514
[149]	validation_0-rmse:4.09348
Model trained for sm_pivot_household_imputed
[0]	validation_0-rmse:9.41244
[100]	validation_0-rmse:4.19443
[149]	validation_0-rmse:3.13740
Model trained for ukm_pivot_household_imputed
[0]	validation_0-rmse:2.73193
[100]	validation_0-rmse:1.32831
[149]	validation_0-rmse:0.97075
Model trained for cvm_pivot_household_imputed
[0]	validation_0-rmse:4.84786
[100]	validation_0-rmse:2.35226
[149]	validation_0-rmse:1.77605
Model trained for cvf_pivot_household_imputed


In [74]:
pets_models = {}
pets_params = {
    'learning_rate': 0.07,
    'max_depth': 7,
    'min_child_weight': 1,
    'n_estimators': 150,
    'reg_alpha': 0.6,
    'reg_lambda': 5.2,
    'subsample': 0.9,
}

for name in full_pets_sets.keys():
    X_full_pets = X_full_pets_sets[name]
    y_full_pets = y_full_pets_sets[name]

    pets = xgb.XGBRegressor(**pets_params)

    pets.fit(X_full_pets, y_full_pets,
               eval_set=[(X_full_pets, y_full_pets)],
               verbose=100)

    pets_models[name] = pets

    print(f"Model trained for {name}")

[0]	validation_0-rmse:4.97184
[100]	validation_0-rmse:2.02226
[149]	validation_0-rmse:1.58528
Model trained for bm_pivot_pets_features
[0]	validation_0-rmse:3.07249
[100]	validation_0-rmse:1.62151
[149]	validation_0-rmse:1.28078
Model trained for sm_pivot_pets_features
[0]	validation_0-rmse:2.57913
[100]	validation_0-rmse:1.21880
[149]	validation_0-rmse:0.93273
Model trained for ukm_pivot_pets_features
[0]	validation_0-rmse:2.42914
[100]	validation_0-rmse:1.11293
[149]	validation_0-rmse:0.83065
Model trained for cvm_pivot_pets_features
[0]	validation_0-rmse:2.06820
[100]	validation_0-rmse:1.06819
[149]	validation_0-rmse:0.84417
Model trained for cvf_pivot_pets_features


In [75]:
def prepare_future_dataframe(truth_df, start_date, test_length, lags):
    future = pd.date_range(start=start_date, periods=test_length, freq='D')
    future_df = pd.DataFrame(index=future)

    future_df['isFuture'] = True
    truth_df['isFuture'] = False

    df_and_future = pd.concat([truth_df, future_df])
    df_and_future = create_features(df_and_future)
    df_and_future = create_lag_feature(df_and_future, 'target', lags)
    df_and_future = create_covid(df_and_future, '2021-08-01')

    return df_and_future

## **Fit Future Data Into Model**

In [76]:
full_health_sets_forecasted = {}

for name, df in full_health_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    health = health_models[name]
    future_df['forecast'] = health.predict(future_df[features])

    new_name = name.replace('pivot_health_features', 'pivot_health_forecasted')
    full_health_sets_forecasted[new_name] = future_df

In [77]:
full_baby_sets_forecasted = {}

for name, df in full_baby_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    baby = baby_models[name]
    future_df['forecast'] = baby.predict(future_df[features])

    new_name = name.replace('pivot_baby_features', 'pivot_baby_forecasted')
    full_baby_sets_forecasted[new_name] = future_df

In [78]:
full_bakery_sets_forecasted = {}

for name, df in full_bakery_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    bakery = bakery_models[name]
    future_df['forecast'] = bakery.predict(future_df[features])

    new_name = name.replace('pivot_bakery_features', 'pivot_bakery_forecasted')
    full_bakery_sets_forecasted[new_name] = future_df

In [79]:
full_drinks_sets_forecasted = {}

for name, df in full_drinks_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    drinks = drinks_models[name]
    future_df['forecast'] = drinks.predict(future_df[features])

    new_name = name.replace('pivot_drinks_features', 'pivot_drinks_forecasted')
    full_drinks_sets_forecasted[new_name] = future_df

In [80]:
full_food_sets_forecasted = {}

for name, df in full_food_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    food = food_models[name]
    future_df['forecast'] = food.predict(future_df[features])

    new_name = name.replace('pivot_food_features', 'pivot_food_forecasted')
    full_food_sets_forecasted[new_name] = future_df

In [81]:
full_free_sets_forecasted = {}

for name, df in full_free_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    free = free_models[name]
    future_df['forecast'] = free.predict(future_df[features])

    new_name = name.replace('pivot_free_features', 'pivot_free_forecasted')
    full_free_sets_forecasted[new_name] = future_df

In [82]:
full_fresh_sets_forecasted = {}

for name, df in full_fresh_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    fresh = fresh_models[name]
    future_df['forecast'] = fresh.predict(future_df[features])

    new_name = name.replace('pivot_fresh_features', 'pivot_fresh_forecasted')
    full_fresh_sets_forecasted[new_name] = future_df

In [83]:
full_frozen_sets_forecasted = {}

for name, df in full_frozen_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    frozen = frozen_models[name]
    future_df['forecast'] = frozen.predict(future_df[features])

    new_name = name.replace('pivot_frozen_features', 'pivot_frozen_forecasted')
    full_frozen_sets_forecasted[new_name] = future_df

In [84]:
full_home_sets_forecasted = {}

for name, df in full_home_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    home = home_models[name]
    future_df['forecast'] = home.predict(future_df[features])

    new_name = name.replace('pivot_home_features', 'pivot_home_forecasted')
    full_home_sets_forecasted[new_name] = future_df

In [85]:
full_household_sets_forecasted = {}

for name, df in full_household_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    household = household_models[name]
    future_df['forecast'] = household.predict(future_df[features])

    new_name = name.replace('pivot_household_features', 'pivot_household_forecasted')
    full_household_sets_forecasted[new_name] = future_df

In [86]:
full_pets_sets_forecasted = {}

for name, df in full_pets_sets.items():
    df_model = df.copy()
    df_model = create_features(df_model)
    df_model = create_lag_feature(df_model, 'target', lags)
    df_model = create_covid(df_model, '2021-08-01')

    start_date = df.index.max() + pd.Timedelta(days=1)
    test_length = 15
    future_df = prepare_future_dataframe(df, start_date, test_length, lags)

    future_df = future_df.drop(columns='isFuture')
    features = [col for col in future_df.columns if col != 'target']

    pets = pets_models[name]
    future_df['forecast'] = pets.predict(future_df[features])

    new_name = name.replace('pivot_pets_features', 'pivot_pets_forecasted')
    full_pets_sets_forecasted[new_name] = future_df

## **Plotting (health, baby, and bakery)**

In [91]:
def plot_forecast(df, title):
    df_filtered = df[(df.index >= '2024-02-14') & (df.index <= '2024-03-15')]
    df_actual = df_filtered[df_filtered.index <= '2024-02-29']
    df_forecast = df_filtered[df_filtered.index > '2024-02-29']

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_actual.index, y=df_actual['target'], mode='lines', name='Actual', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=df_forecast.index, y=df_forecast['forecast'], mode='lines', name='Forecast', line=dict(color='red')))
    fig.update_layout(title=title, xaxis_title='Date', yaxis_title='Value', legend_title='Legend', template='plotly_white')
    fig.show()

for name, df in full_health_sets_forecasted.items():
    plot_forecast(df, title=name)

In [92]:
def plot_forecast(df, title):
    df_filtered = df[(df.index >= '2024-02-14') & (df.index <= '2024-03-15')]
    df_actual = df_filtered[df_filtered.index <= '2024-02-29']
    df_forecast = df_filtered[df_filtered.index > '2024-02-29']

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_actual.index, y=df_actual['target'], mode='lines', name='Actual', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=df_forecast.index, y=df_forecast['forecast'], mode='lines', name='Forecast', line=dict(color='red')))
    fig.update_layout(title=title, xaxis_title='Date', yaxis_title='Value', legend_title='Legend', template='plotly_white')
    fig.show()

for name, df in full_baby_sets_forecasted.items():
    plot_forecast(df, title=name)

In [93]:
def plot_forecast(df, title):
    df_filtered = df[(df.index >= '2024-02-14') & (df.index <= '2024-03-15')]
    df_actual = df_filtered[df_filtered.index <= '2024-02-29']
    df_forecast = df_filtered[df_filtered.index > '2024-02-29']

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_actual.index, y=df_actual['target'], mode='lines', name='Actual', line=dict(color='blue')))
    fig.add_trace(go.Scatter(x=df_forecast.index, y=df_forecast['forecast'], mode='lines', name='Forecast', line=dict(color='red')))
    fig.update_layout(title=title, xaxis_title='Date', yaxis_title='Value', legend_title='Legend', template='plotly_white')
    fig.show()

for name, df in full_bakery_sets_forecasted.items():
    plot_forecast(df, title=name)

# **To CSV**

In [90]:
def save_dataframes_to_csv(dataframes_dict, category_name):
    for name, df in dataframes_dict.items():
        filename = f"{category_name}_{name}.csv"
        df.to_csv(filename)

categories = [
    (full_health_sets_forecasted, 'full_health_sets_forecasted'),
    (full_baby_sets_forecasted, 'full_baby_sets_forecasted'),
    (full_bakery_sets_forecasted, 'full_bakery_sets_forecasted'),
    (full_drinks_sets_forecasted, 'full_drinks_sets_forecasted'),
    (full_food_sets_forecasted, 'full_food_sets_forecasted'),
    (full_free_sets_forecasted, 'full_free_sets_forecasted'),
    (full_fresh_sets_forecasted, 'full_fresh_sets_forecasted'),
    (full_frozen_sets_forecasted, 'full_frozen_sets_forecasted'),
    (full_home_sets_forecasted, 'full_home_sets_forecasted'),
    (full_household_sets_forecasted, 'full_household_sets_forecasted'),
    (full_pets_sets_forecasted, 'full_pets_sets_forecasted')
]

# Save all dataframes to CSV files
for dataframes_dict, category_name in categories:
    save_dataframes_to_csv(dataframes_dict, category_name)