In [1]:
# Print the notebook's working directory; the relative "../../" paths used in
# this notebook are resolved against it.
# NOTE(review): the saved output exposes an absolute local path — consider
# clearing it before sharing the notebook.
!pwd

/Users/ashish1610dhiman/data_projects/bestbuy/notebooks/ashish


In [2]:
import sys
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

# NOTE(review): bare expression in the middle of a cell — only a cell's last
# expression is displayed, so this line produces no output.
sys.version

# Add the directory two levels above the working directory to the import path,
# presumably so the `src` package imported below can be resolved.
sys.path.append("../../")

from src.ad_hmm import sku_predict
from src.utils import *

In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [4]:
# Version tag of the HMM results to analyse; it is interpolated into the
# results file name (hmm_result_{VERSION}.csv) when the HMM output is loaded.
VERSION = "v2"

### RMSE analysis for HMM

In [5]:
# Load the combined train/validation table and parse SALES_DATE as datetimes.
train_test = pd.read_csv("../../data/train_validation_marker.csv").assign(
    SALES_DATE=lambda frame: pd.to_datetime(frame["SALES_DATE"])
)
print(train_test.shape)

(846632, 15)


In [6]:
# Keep only rows flagged `validation_clean`, then split on the `validation` flag.
is_clean = train_test["validation_clean"] == True
is_validation = train_test["validation"] == True
is_train = train_test["validation"] == False

train = train_test[is_train & is_clean]
validation = train_test[is_validation & is_clean]
train.shape, validation.shape

((797490, 15), (3815, 15))

### y_actual

In [7]:
# Actual daily units of the validation rows, keyed by SKU and sales date.
actual_cols = ["Encoded_SKU_ID", "SALES_DATE", "DAILY_UNITS"]
y_act = validation.loc[:, actual_cols].rename(columns={"DAILY_UNITS": "actual"})
y_act.head()

Unnamed: 0,Encoded_SKU_ID,SALES_DATE,actual
100,96,2022-07-27,2
218,187,2022-07-29,0
420,297,2022-07-29,6
652,372,2022-07-27,1
831,412,2022-07-26,4


### y_pred null

In [8]:
# Load the null-prediction results, drop the "Unnamed: 0" column read from the
# CSV, and parse SALES_DATE as datetimes.
null_pred_path = "../../data/null_prediction_validation_result_v0.csv"
y_pred_null = (
    pd.read_csv(null_pred_path)
    .drop(columns="Unnamed: 0")
    .assign(SALES_DATE=lambda frame: pd.to_datetime(frame["SALES_DATE"]))
)
y_pred_null.head()

Unnamed: 0,Encoded_SKU_ID,SALES_DATE,predicted
0,1,2022-07-25,1
1,1,2022-07-26,1
2,1,2022-07-27,1
3,1,2022-07-28,1
4,1,2022-07-29,1


### y_pred HMM

In [9]:
# Load the HMM results for the configured VERSION; the first CSV column
# becomes the index.
hmm_result_path = f"../../data/hmm_result_{VERSION}.csv"
hmm_output = pd.read_csv(hmm_result_path, index_col=0)
hmm_output.head()

Unnamed: 0,predicted,predicted_30,predicted_45,predicted_60,predicted_90,predicted_max,Encoded_SKU_ID
2022-07-25,1.0,1.0,1.0,0.0,2.0,3.0,1
2022-07-26,1.0,1.0,2.0,3.0,0.45,2.0,1
2022-07-27,2.0,0.0,3.0,2.0,2.55,1.0,1
2022-07-28,0.0,1.0,0.0,0.45,0.45,2.45,1
2022-07-29,3.0,2.0,1.0,2.0,2.55,2.0,1


In [10]:
# The index was read from the CSV's first column (index_col=0); convert it to
# datetimes. Later cells expose it as the SALES_DATE column via reset_index.
hmm_output.index = pd.to_datetime(hmm_output.index)

In [11]:
# Restrict the null predictions to SKUs that appear in the HMM output.
SKUs_hmm = hmm_output["Encoded_SKU_ID"].unique()
has_hmm_prediction = y_pred_null["Encoded_SKU_ID"].isin(SKUs_hmm)
y_pred_null_subset = y_pred_null.loc[has_hmm_prediction]

In [12]:
# Number of distinct SKUs present in the HMM output.
len(SKUs_hmm)

352

In [13]:
# Baseline score: null predictions (restricted to SKUs with HMM output)
# against the validation actuals.
# `rmse` is not defined in this notebook — presumably it is provided by
# `from src.utils import *`; confirm there how the two frames are joined.
rmse(y_act,y_pred_null_subset)

1.5694165099864923

In [14]:
# Score every HMM prediction column (any column whose name contains "pred")
# against the validation actuals and print one line per column.
pred_cols = [name for name in hmm_output.columns if "pred" in name]
for col in pred_cols:
    y_pred_hmm = (
        hmm_output.reset_index(names="SALES_DATE")
        .loc[:, ["Encoded_SKU_ID", "SALES_DATE", col]]
        .rename(columns={col: "predicted"})
    )
    score = rmse(y_act, y_pred_hmm)
    print(f"For {col}, rmse(hmm) = {score:.4f}")

For predicted, rmse(hmm) = 1.6755
For predicted_30, rmse(hmm) = 1.7331
For predicted_45, rmse(hmm) = 1.8148
For predicted_60, rmse(hmm) = 1.7129
For predicted_90, rmse(hmm) = 1.8105
For predicted_max, rmse(hmm) = 1.8123


### Pick the best prediction period per SKU

In [15]:
# Build one table of per-SKU RMSE results with a separate set of columns for
# every HMM prediction column.
#
# The first prediction column seeds the table; every later one is merged in on
# the index, and its overlapping column names receive a suffix taken from the
# last "_"-separated token of the prediction column name (e.g. "_30", "_max").
#
# The position is counted among prediction columns only. Counting among all
# columns would silently depend on a prediction column being the very first
# column of `hmm_output`.
pred_cols = [col for col in hmm_output.columns if "pred" in col]

# Loop-invariant: expose the date index as SALES_DATE once instead of per column.
hmm_with_dates = hmm_output.reset_index(names="SALES_DATE")

rmse_sku_hmm_dates = pd.DataFrame()
for position, col in enumerate(pred_cols):
    y_pred_hmm = hmm_with_dates[["Encoded_SKU_ID", "SALES_DATE", col]].rename(
        columns={col: "predicted"}
    )
    rmse_sku_hmm_i = rmse_sku(y_act, y_pred_hmm)
    if position == 0:
        rmse_sku_hmm_dates = rmse_sku_hmm_i
    else:
        rmse_sku_hmm_dates = rmse_sku_hmm_dates.merge(
            rmse_sku_hmm_i,
            left_index=True,
            right_index=True,
            suffixes=("", f"_{col.split('_')[-1]}"),
        )

In [16]:
# Collect the names of all columns containing "rmse_du", in column order.
rmse_cols = list(filter(lambda name: "rmse_du" in name, rmse_sku_hmm_dates.columns))
rmse_cols

['rmse_du',
 'rmse_du_30',
 'rmse_du_45',
 'rmse_du_60',
 'rmse_du_90',
 'rmse_du_max']

In [17]:
# Per-row summary across the RMSE columns: the smallest RMSE, the position of
# that column within `rmse_cols`, and the column's name.
rmse_by_period = rmse_sku_hmm_dates[rmse_cols]

# axis=1 is required. Without it `.min()` reduces each column to a single value
# and returns a Series indexed by column names; assigning that Series aligns it
# against the row index, so every row would end up NaN.
rmse_sku_hmm_dates["min_rmse"] = rmse_by_period.min(axis=1)

# NOTE(review): np.argmin does not skip NaN, whereas DataFrame.min does; if a
# row can contain NaN RMSE values the two columns may disagree — confirm.
rmse_sku_hmm_dates["argmin_period"] = np.argmin(rmse_by_period.values, axis=1)

rmse_sku_hmm_dates["min_period"] = rmse_sku_hmm_dates["argmin_period"].apply(lambda x: rmse_cols[x])

In [18]:
# Map each per-SKU RMSE column name to the prediction column it was computed
# from; both names share the same suffix.
period_col_map = {
    f"rmse_du{suffix}": f"predicted{suffix}"
    for suffix in ("", "_30", "_45", "_60", "_90", "_max")
}

In [19]:
# For every SKU in the per-SKU RMSE table, take that SKU's rows from the HMM
# output and keep only the prediction column with the lowest RMSE, renamed to
# "predicted".
#
# The per-SKU frames are collected in a list and concatenated once. Growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration.
best_frames = []
for sku_id, best_period in zip(rmse_sku_hmm_dates.index, rmse_sku_hmm_dates["min_period"]):
    best_col = period_col_map[best_period]
    mask_sku = hmm_output.Encoded_SKU_ID == sku_id
    y_pred_hmm_sku = (
        hmm_output.loc[mask_sku]
        .reset_index(names="SALES_DATE")[["Encoded_SKU_ID", "SALES_DATE", best_col]]
        .rename(columns={best_col: "predicted"})
    )
    best_frames.append(y_pred_hmm_sku)

# pd.concat raises on an empty list, so fall back to an empty frame.
y_pred_hmm_best = pd.concat(best_frames) if best_frames else pd.DataFrame()

In [20]:
# Overall RMSE against the validation actuals, as a pair:
# (null predictions, HMM predictions using the best period per SKU).
# NOTE(review): the best period per SKU was chosen using RMSE against this
# same `y_act`, so the second value is an optimistic estimate; a separate
# hold-out would be needed for an unbiased comparison.
(
    rmse(y_act, y_pred_null_subset),
    rmse(y_act, y_pred_hmm_best),
)

(1.5694165099864923, 1.1607474395864474)

In [22]:
# Relative gap between the null RMSE and the best-period HMM RMSE, computed
# from the data rather than from numbers pasted out of an earlier cell output,
# so it stays correct when the inputs or VERSION change.
# NOTE(review): the gap is expressed relative to the HMM RMSE (how much larger
# the null RMSE is). Divide by `rmse_null` instead if the intended figure is
# the error reduction relative to the baseline.
rmse_null = rmse(y_act, y_pred_null_subset)
rmse_hmm_best = rmse(y_act, y_pred_hmm_best)
(rmse_null - rmse_hmm_best) / rmse_hmm_best

0.35207406578096434