In [2]:
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [6]:
def final_pipeline(ids, data_dir='.'):
    """
    Build the feature frame for one series id and return its 56-day forecast.

    The function reads the raw sales, calendar and price tables, appends
    zero-filled placeholder columns for days 1942-1969, derives calendar,
    lag, rolling and aggregate features, and scores days 1914-1969 with the
    pickled model.

    Parameters
    ----------
    ids : str
        Value to look up in the ``id`` column of ``sales_train_evaluation.csv``.
    data_dir : str or path-like, default '.'
        Directory holding ``sales_train_evaluation.csv``, ``sell_prices.csv``,
        ``calendar.csv``, ``event_label.pickle``, ``label_encoding.pickle`` and
        ``LightGBM_model.sav``. The default keeps the original behaviour of
        reading from the current working directory.

    Returns
    -------
    pandas.DataFrame
        One row with an ``id`` column followed by ``d_1914`` ... ``d_1969``,
        each holding the rounded, non-negative predicted sale.

    Raises
    ------
    ValueError
        If ``ids`` does not appear in ``sales_train_evaluation.csv``.
    """
    data_dir = Path(data_dir)

    # Read the sales table and keep only the requested series. The explicit
    # copy makes the column assignments below operate on an independent frame
    # instead of a slice of the full table.
    df = pd.read_csv(data_dir / 'sales_train_evaluation.csv')
    df = df[df['id'] == ids].copy()
    if df.empty:
        # Fail early with a clear message instead of a length-mismatch error
        # at the very end of the pipeline.
        raise ValueError("id %r not found in sales_train_evaluation.csv" % (ids,))

    # Add placeholder columns for the 28 days still to be predicted
    # (d_1942 ... d_1969), filled with zero.
    for day in range(1942, 1970):
        df['d_' + str(day)] = np.zeros(len(df), dtype=np.int16)

    # Melt the wide frame into one row per (id, day).
    df = pd.melt(df,
                 id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                 var_name='d',
                 value_name='sale').dropna()

    sell = pd.read_csv(data_dir / 'sell_prices.csv')
    cal = pd.read_csv(data_dir / 'calendar.csv')
    cal = cal.fillna(0).drop(columns=['wday', 'month'])
    df = pd.merge(df, cal, on='d', how='left')
    df = df.dropna()
    del cal

    # 'd_123' -> 123
    df['d'] = df['d'].map(lambda label: int(label.split('_')[1])).astype(np.int16)
    # Recode the calendar year as 1..6. Assigning the result back (rather than
    # calling replace(inplace=True) on the column selection) keeps this working
    # under pandas Copy-on-Write, where the chained in-place form is a no-op.
    year_codes = {2011: 1, 2012: 2, 2013: 3, 2014: 4, 2015: 5, 2016: 6}
    df['year'] = df['year'].replace(year_codes).astype(np.int16)
    # Weekend flag: 1 for Saturday/Sunday, 0 otherwise.
    df['day'] = df['weekday'].isin(['Saturday', 'Sunday']).astype(np.int8)
    df['quarter'] = pd.to_datetime(df['date']).dt.quarter.astype(np.int8)
    df = df.drop(columns=['weekday', 'date'])

    # Label-encode the event columns with the stored mapping.
    # NOTE(review): read_pickle / pickle.load execute arbitrary code from the
    # file; only use artifacts from a trusted source.
    le = pd.read_pickle(data_dir / 'event_label.pickle')
    for col in ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']:
        df = df.replace({col: le[col]})

    # Merge the sales frame with the sell_price frame.
    df = pd.merge(df, sell, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
    del sell
    df = df.fillna(0)

    # Label-encode every remaining object-typed column.
    le = pd.read_pickle(data_dir / 'label_encoding.pickle')
    for col in df.columns:
        if df[col].dtype.name == 'object':
            df = df.replace({col: le[col]})

    groupby_df = df.groupby(['id'], as_index=False)

    # Lag features. The order in which feature columns are created is kept
    # as-is because the saved model is fed the frame's columns in this order.
    for lag in [28, 35, 42, 49, 56, 63, 70, 77]:
        df['lag_' + str(lag)] = groupby_df['sale'].shift(lag)

    # Rolling mean / std over the series shifted by 28 days.
    # https://stackoverflow.com/questions/53339021/python-pandas-calculate-moving-average-within-group
    for window in [7, 14, 28]:
        df['roll_mean_' + str(window)] = groupby_df['sale'].transform(
            lambda x: x.shift(28).rolling(window).mean())
        df['roll_std_' + str(window)] = groupby_df['sale'].transform(
            lambda x: x.shift(28).rolling(window).std())

    # Other features: expanding mean, per-id mean/std of sales and price.
    df['expanding_mean_item'] = groupby_df['sale'].transform(
        lambda x: x.shift(28).expanding().mean())
    df['mean_id_sold'] = groupby_df['sale'].transform('mean').astype(np.float16)
    df['std_id_sold'] = groupby_df['sale'].transform('std').astype(np.float16)
    df['mean_id_price'] = groupby_df['sell_price'].transform('mean').astype(np.float16)
    df['std_id_price'] = groupby_df['sell_price'].transform('std').astype(np.float16)
    del groupby_df
    df = df.dropna(how='any')
    df = df.drop(columns=['wm_yr_wk'])

    # Keep only the days to score: 28 + 28 days of prediction.
    df = df[df['d'] > 1913].copy()

    cat = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year',
           'event_name_1', 'event_name_2', 'event_type_1', 'event_type_2',
           'snap_CA', 'snap_TX', 'snap_WI', 'day', 'quarter']
    for col in cat:
        df[col] = df[col].astype('category')

    # Context manager so the model file handle is always closed.
    with open(data_dir / 'LightGBM_model.sav', 'rb') as model_file:
        model = pickle.load(model_file)

    df['sale_pred'] = np.round(model.predict(df.drop(columns=['sale']))).clip(0).astype(np.int16)

    # One output row: the id followed by one column per predicted day.
    forecast = {'id': [ids]}
    for day in range(1914, 1970):
        forecast['d_' + str(day)] = df.loc[df['d'] == day, 'sale_pred'].tolist()

    return pd.DataFrame(forecast)
        
    

In [7]:
# Run the pipeline for one series id; the result is a one-row frame with an
# `id` column followed by the predicted columns d_1914 through d_1969.
df = final_pipeline('FOODS_3_125_WI_3_evaluation')

In [8]:
# Display the forecast frame returned by final_pipeline.
df.head()

Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,FOODS_3_125_WI_3_evaluation,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,1,1,2,2


# References

https://medium.com/@shantanuekhande19/m5-forecasting-accuracy-73343a873685 <br>
https://medium.com/analytics-vidhya/case-study-on-m5-forecasting-accuracy-kaggle-competition-893d7e124b54 <br>
https://towardsdatascience.com/m5-forecasting-accuracy-24d7f42130de<br>
https://github.com/aakashveera/M5-Accuracy/blob/master/Notebook%20M5%20Accuracy.ipynb <br>
https://www.kaggle.com/code/anshuls235/time-series-forecasting-eda-fe-modelling/notebook<br>
https://www.appliedaicourse.com/<br>