In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
import pickle

from tqdm import tqdm 
from downcast import reduce
import warnings
warnings.filterwarnings("ignore")

In [10]:
def take_last_n_days_from_sales(sales, n, n_info_cols=6):
    """Keep the leading product-info columns plus the last ``n`` day columns.

    Parameters
    ----------
    sales : pandas.DataFrame
        Wide table whose first ``n_info_cols`` columns are treated as product
        information and whose remaining columns are treated as day columns.
    n : int
        Number of trailing day columns to keep. If ``n`` is larger than the
        number of day columns, all of them are kept. ``0`` keeps none.
    n_info_cols : int, default 6
        Number of leading columns that are product information.

    Returns
    -------
    pandas.DataFrame
        A copy holding the product-info columns followed by the selected
        day columns.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    columns = list(sales.columns)
    product_info_col = columns[:n_info_cols]
    days_col = columns[n_info_cols:]

    # A plain ``days_col[-n:]`` returns *every* day when n == 0, so compute an
    # explicit start index instead.
    start = max(len(days_col) - n, 0)
    last_n_days = days_col[start:]

    return sales[product_info_col + last_n_days].copy()

def sales_label_encoder_test(sales, sales_label_encoders):
    """Strip the split suffix from ``id`` and encode the first six columns.

    Parameters
    ----------
    sales : pandas.DataFrame
        Frame with a string ``id`` column; its first six columns are the ones
        that get encoded.
    sales_label_encoders : mapping
        Maps each of those six column names to an object exposing
        ``transform(values)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns; the input frame is left
        untouched.

    Raises
    ------
    KeyError
        If an encoder is missing for one of the first six columns.
    """
    # Work on a copy so the caller's frame is not modified behind its back.
    sales = sales.copy()

    # regex=False: the suffixes are literal text, and the default value of
    # ``regex`` differs between pandas versions.
    sales['id'] = (
        sales['id']
        .str.replace('_evaluation', '', regex=False)
        .str.replace('_validation', '', regex=False)
    )

    for col in list(sales.columns[:6]):
        sales[col] = sales_label_encoders[col].transform(sales[col])
    return sales

def melt_sales(sales):
    """Reshape the wide sales table into long form, one row per (product, day).

    The first six columns are kept as identifiers. Every other column name is
    moved into a ``d`` column (with the ``d_`` text removed and cast to
    ``int16``) and its values into a ``sales`` column.
    """
    id_columns = sales.columns[:6].tolist()
    long_sales = pd.melt(sales, id_vars=id_columns, var_name='d', value_name='sales')
    day_numbers = long_sales['d'].str.replace('d_', '')
    long_sales['d'] = day_numbers.astype(np.int16)
    return long_sales

def merge_data(melted_sales, calendar, prices):
    """Left-join calendar and price information onto the long sales table.

    The calendar is joined on ``d``; prices are then joined on
    ``store_id``, ``item_id`` and ``wm_yr_wk``. Rows of ``melted_sales`` with
    no match keep missing values in the joined columns.

    NOTE(review): ``calendar['d']`` is not converted here, so it presumably
    has to arrive already in the same integer form as ``melted_sales['d']``
    -- confirm against the code that produces the calendar.
    """
    with_calendar = pd.merge(melted_sales, calendar, how='left', left_on='d', right_on='d')
    price_keys = ['store_id', 'item_id', 'wm_yr_wk']
    return pd.merge(with_calendar, prices, how='left', on=price_keys)

def add_lags(df, lag_days):
    """Add one ``lag_<k>`` column per entry ``k`` of ``lag_days``.

    Each column holds ``sales`` shifted by ``k`` rows within its ``id`` group.
    The columns are written onto ``df`` itself, which is also returned.
    """
    for n_back in lag_days:
        def _shifted(series, periods=n_back):
            return series.shift(periods)

        column_name = f"lag_{n_back}"
        df[column_name] = df.groupby(["id"])["sales"].transform(_shifted)
    return df

def add_rolling_mean(df, window, shift_days=28):
    """Add ``rolling_mean_<w>`` columns computed per ``id`` on shifted sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id`` and ``sales`` columns. It is modified in place.
    window : iterable of int
        Rolling window sizes; one column is added per size.
    shift_days : int, default 28
        Number of rows ``sales`` is shifted within each ``id`` group before
        the rolling mean is taken.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns attached.
    """
    # Use a distinct loop name: the original ``for window in window`` rebound
    # the parameter while iterating over it.
    for window_size in window:
        df[f"rolling_mean_{window_size}"] = df.groupby(["id"])["sales"].transform(
            lambda x: x.shift(shift_days).rolling(window_size).mean()
        )
    return df

def add_rolling_std(df, window, shift_days=28):
    """Add ``rolling_std_<w>`` columns computed per ``id`` on shifted sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id`` and ``sales`` columns. It is modified in place.
    window : iterable of int
        Rolling window sizes; one column is added per size.
    shift_days : int, default 28
        Number of rows ``sales`` is shifted within each ``id`` group before
        the rolling standard deviation is taken.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns attached.
    """
    # Use a distinct loop name: the original ``for window in window`` rebound
    # the parameter while iterating over it.
    for window_size in window:
        df[f"rolling_std_{window_size}"] = df.groupby(["id"])["sales"].transform(
            lambda x: x.shift(shift_days).rolling(window_size).std()
        )
    return df

def add_price_change(featurized_df):
    """Attach a ``weekly_price_change`` column and return the same frame.

    The column holds ``sell_price`` shifted down by seven rows of the frame.
    """
    # NOTE(review): despite its name the column is the raw shifted price, not
    # a difference, and the shift runs over the whole frame rather than per
    # id. Left as-is because previously saved models may expect this exact
    # feature -- confirm against the training code before changing it.
    price_seven_rows_back = featurized_df['sell_price'].shift(periods=7)
    featurized_df['weekly_price_change'] = price_seven_rows_back
    return featurized_df

In [11]:
# Load the pickled feature object. The context manager closes the file handle
# as soon as loading finishes (a bare open() leaves it open).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('pickled_files/x_features.pkl', 'rb') as features_file:
    features = pickle.load(features_file)

In [25]:
# Load the pickled artifacts. Each file is opened in a context manager so its
# handle is closed right after loading instead of being leaked.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('pickled_files/sales_label_encoders.pkl', 'rb') as pickle_file:
    sales_label_encoders = pickle.load(pickle_file)
with open('pickled_files/calendar_label_encoders.pkl', 'rb') as pickle_file:
    calendar_label_encoders = pickle.load(pickle_file)
calendar = pd.read_pickle('pickled_files/calendar.pkl')
prices = pd.read_pickle('pickled_files/prices.pkl')
with open('pickled_files/x_features.pkl', 'rb') as pickle_file:
    features = pickle.load(pickle_file)

# Read the wide sales table, drop the split suffix from the ids and keep only
# the 60 most recent day columns.
sales_validation = pd.read_csv(r'dataset/sales_train_validation.csv')
sales_validation.id = sales_validation.id.str.replace('_validation', '')
sales_validation.id = sales_validation.id.str.replace('_evaluation', '')
sales = take_last_n_days_from_sales(sales_validation, 60)


In [26]:
# Downcast the frame (less memory), then encode the product-info columns.
sales = sales_label_encoder_test(reduce(sales), sales_label_encoders)
# Wide -> long, then attach calendar and price columns.
melted_sales = melt_sales(sales)
featurized_df = merge_data(melted_sales, calendar, prices)

In [28]:
# Feature engineering: lags 1..29, rolling mean/std over five window sizes,
# and the weekly price feature. Each helper returns the frame it was given,
# so the steps can be chained.
featurized_df = (
    featurized_df
    .pipe(add_lags, range(1, 30))
    .pipe(add_rolling_mean, [7, 14, 21, 28, 30])
    .pipe(add_rolling_std, [7, 14, 21, 28, 30])
    .pipe(add_price_change)
)

In [36]:
# Load the 28 saved models F1..F28 into a dict keyed by their number.
# Each file is opened in a context manager so its handle is closed right
# away instead of leaking 28 open files.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
F = {}
for f in range(1, 29):
    with open(f'final_models/F{f}.pkl', 'rb') as model_file:
        F[f] = pickle.load(model_file)

In [40]:
# Collect one prediction array per model F1..F28, in that order.
# FIXME(review): `x` is not defined in any cell visible here, and the
# execution counts jump (28 -> 36 -> 40), so it presumably comes from a cell
# that was deleted or never saved. On a fresh kernel this cell raises
# NameError; the cell that builds `x` needs to be restored above.
forecasts = []
for f in tqdm(range(1, 29)):
    forecasts.append(F[f].predict(x))

100%|██████████| 28/28 [00:02<00:00, 10.52it/s]


In [None]:
# NOTE(review): rounded_forecast is not used by any later cell visible here.
rounded_forecast = np.around(forecasts).astype(int)

# Build the ground truth: the last 28 day columns of the evaluation file,
# encoded like the model input, downcast, and relabelled F1..F28.
sales_evaluation = pd.read_csv(r'dataset/sales_train_evaluation.csv')
sales_evaluation.id = (
    sales_evaluation.id
    .str.replace('_validation', '')
    .str.replace('_evaluation', '')
)
sales_eval = (
    take_last_n_days_from_sales(sales_evaluation, 28)
    .pipe(sales_label_encoder_test, sales_label_encoders)
    .pipe(reduce)
    .drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])
)
sales_eval.columns = ['id', *(f'F{i}' for i in range(1, 29))]

In [76]:
# Turn the list of per-model predictions into one row per product with
# columns id, F1..F28, ordered by id.
forecasts = pd.DataFrame(forecasts).T
forecasts.columns = [f'F{i}' for i in range(1, 29)]
forecasts.insert(0, 'id', x.id.values)  # put id in front of F1..F28
forecasts = forecasts.sort_values(by='id')
forecasts

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
1612,0,1.220954,1.042598,0.793215,0.754084,0.887791,1.050030,1.095591,0.950859,0.897496,...,0.663350,1.001743,0.899801,0.635461,0.583486,0.870002,0.786485,0.863207,1.016367,1.093811
4661,1,1.143503,0.771618,0.809818,0.823055,1.060465,1.420896,1.125353,0.853563,0.876169,...,1.032315,1.167088,0.972696,0.572097,0.815723,0.894453,0.913091,0.909038,1.142795,1.412287
7710,2,0.720245,0.603328,0.594611,0.682880,0.730159,0.803825,0.874214,0.769858,0.672779,...,0.336025,1.062026,0.820791,0.441477,0.564540,0.868736,0.766352,0.769595,1.086390,0.936718
10759,3,0.442793,0.336297,0.399868,0.436992,0.479224,0.520761,0.484444,0.460608,0.446604,...,0.335635,0.446395,0.511491,0.324636,0.371224,0.405127,0.320694,0.451744,0.457854,0.508922
13808,4,0.137699,0.135409,0.162306,0.183696,0.205618,0.179860,0.212497,0.202490,0.210265,...,0.202135,0.272469,0.242528,0.173913,0.224515,0.276798,0.226913,0.268384,0.263232,0.286195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16856,30485,0.264583,0.235483,0.270479,0.253457,0.337028,0.371787,0.391732,0.315810,0.289580,...,0.278756,0.354237,0.254634,0.192921,0.288313,0.280949,0.244491,0.261744,0.387016,0.393225
19905,30486,0.089912,0.076381,0.112245,0.116845,0.162224,0.196137,0.192952,0.146772,0.137223,...,0.154320,0.159382,0.136982,0.130711,0.169794,0.120102,0.112449,0.154843,0.155787,0.187649
22954,30487,0.052005,0.063756,0.061810,0.066045,0.094867,0.102570,0.101097,0.079010,0.078089,...,0.125202,0.120234,0.091586,0.088946,0.080758,0.077969,0.070426,0.108930,0.126696,0.102920
26003,30488,0.064801,0.066616,0.074621,0.076039,0.098895,0.109108,0.110808,0.093972,0.083755,...,0.096779,0.101054,0.092464,0.081634,0.068634,0.064070,0.069554,0.109914,0.073725,0.085883


In [77]:
# Ground truth ordered by id, the same ordering applied to the forecasts.
actual = sales_eval.sort_values(by='id')
actual

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
1612,0,2,0,0,0,0,1,1,0,6,...,2,3,1,0,0,0,1,0,0,0
4661,1,0,3,0,0,0,1,4,0,0,...,1,0,0,1,1,0,0,1,2,0
7710,2,1,0,1,0,8,1,0,0,1,...,1,2,2,0,0,1,0,3,2,2
10759,3,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,1,0,0,0
13808,4,0,0,1,0,1,0,1,1,1,...,1,1,1,1,5,0,2,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16856,30485,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
19905,30486,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,1,2
22954,30487,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
26003,30488,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0


In [80]:
# The metric pairs rows by position, so check that both frames list the same
# ids in the same order before scoring; otherwise the RMSE would be silently
# wrong.
if not np.array_equal(actual['id'].values, forecasts['id'].values):
    raise ValueError('actual and forecasts are not aligned on id')

test_rmse = np.sqrt(mean_squared_error(actual.drop('id', axis=1), forecasts.drop('id', axis=1)))
print('Test RMSE is: ', test_rmse)

Test RMSE is:  2.352291054120421
