**Import packages**

In [1]:
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor
import math # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [6]:
 karo= pd.read_excel('~/Eczacibasi-kaggle/dataset/data.xlsx')

**Data Exploration**

In [7]:
customer_A=karo[karo.customer=='A'].item.unique()

In [8]:
customer_B=karo[karo.customer=='B'].item.unique()

In [9]:
customer_C=karo[karo.customer=='C'].item.unique()

In [10]:
karo['Year']=pd.to_datetime(karo.date).dt.year
karo['Month']=pd.to_datetime(karo.date).dt.month
karo['Date']=pd.to_datetime(karo.assign(Day=1).loc[:, ['Year','Month','Day']])
karo.Date=karo.Date.apply(lambda x: x.strftime("%Y-%m"))

In [11]:
karo_agg=pd.DataFrame(karo.groupby(['customer','item','Date']).sum()).reset_index()
karo_agg.drop(['Year','Month'],axis=1,inplace=True)
karo_agg=karo_agg.sort_values(by=['item','Date'])

In [12]:
karo_agg.head()

Unnamed: 0,customer,item,Date,order
0,A,0,2018-01,216.0
1,A,0,2018-02,576.0
2,A,0,2018-03,288.0
3,A,0,2018-05,216.0
4,A,0,2018-06,216.0


In [13]:
# Calculate train data size for each store & department combinations
period_length=[]
store_index=[]
for i in karo_agg.item.unique():
            store_index += [i]
            perleng=len(karo_agg[(karo_agg.item==i)])
            df=karo_agg[karo_agg.item==i]['order']
            period_length += [perleng]

In [14]:
pd.set_option('display.max_rows', None)
store_index = pd.DataFrame(store_index)
store_index=store_index.rename(columns={0:'Store'})
period_length =pd.DataFrame(period_length)
train_data_size=pd.concat([store_index,period_length],axis=1)
train_data_size=train_data_size.rename(columns={0:'Size'})

In [15]:
Dates=karo_agg[karo_agg.item==2][['Date']]

In [16]:
df=pd.DataFrame()
for i in karo_agg['item'].unique():
    order=karo_agg[karo_agg.item==i]
    df_inter=pd.merge(Dates,order,on='Date',how='left')
    df_inter['item']=df_inter['item'].fillna(i)
    df_inter['customer']=df_inter['customer'].fillna(np.array(df_inter['customer'].unique()[0]))
    df_inter['order']=df_inter['order'].fillna(df_inter['order'].median())
    df=df.append(df_inter, ignore_index=True)

In [17]:
karo_agg=df.copy()

In [18]:
train_date=np.array(Dates.Date[:25])
valid_date=np.array(Dates.Date[25:])

In [19]:
karo_train=karo_agg.loc[karo_agg['Date'].isin(train_date)].reset_index(drop=True)
karo_valid=karo_agg.loc[karo_agg['Date'].isin(valid_date)].reset_index(drop=True)

In [20]:
sample_9_train=karo_train[karo_train.item==79]
sample_9_valid=karo_valid[karo_valid.item==79]

**STATISTICAL FORECASTING - HOLT WINTERS**

In [21]:
def seasonal_factors_mul(s,d,slen,cols):
    for i in range(slen):
        idx=[x for x in range(cols) if x%slen==i] #compute indices that correspond to this season
        s[i]=np.mean(d[idx]) #compute season average
    s -=np.mean(s[:slen])
    return s

In [22]:
def triple_exp_smooth_mul_t(z,slen,extra_periods,alpha,beta,phi,gamma,item):
    m=z.Date
    d=z.order
    d=np.array(d) #transform the input into a numpy array
    cols=len(d) #historical period length
    d=np.append(d,[np.nan]*extra_periods) #append np.nan into the demand array to cover future periods
    m=np.append(m,[np.nan]*extra_periods)
    #components initialization
    f,a,b,s,z,e=np.full((6,cols+extra_periods),np.nan)
    s=seasonal_factors_mul(s,d,slen,cols)
    
    #level & trend initialization
    a[0]=d[0]-s[0]
    b[0]=(d[1]-s[1])-(d[0]-s[0])
    z[0]=0

    #create the forecast first season
    for t in range(1,slen):
        f[t]=a[t-1]+phi*b[t-1]+s[t]
        a[t]=alpha*(d[t]-s[t])+(1-alpha)*(a[t-1]+phi*b[t-1])
        b[t]=beta*(a[t]-a[t-1])+(1-beta)*phi*b[t-1]
        z[t]=0
        e[t]=i
    
    #create all the t+1 forecasts
    for  t in range(slen,cols):
        f[t]=a[t-1]+phi*b[t-1]+s[t-slen]
        a[t]=alpha*(d[t]-s[t-slen])+(1-alpha)*(a[t-1]+phi*b[t-1])
        b[t]=beta*(a[t]-a[t-1])+(1-beta)*phi*b[t-1]
        s[t]=gamma*(d[t]-a[t])+(1-gamma)*s[t-slen]
        z[t]=0
        e[t]=i

    #forecast for all extra periods
    for t in range(cols,cols+extra_periods):
        f[t]=a[t-1]+phi*b[t-1]+s[t-slen]      
        a[t]=f[t]-s[t-slen]
        b[t]=phi*b[t-1]
        s[t]=s[t-slen]
        z[t]=1
        e[t]=i
        
    df=pd.DataFrame.from_dict({'Item':i,'Date':m,'Demand':d,'Forecast':f,'Flag':z,'Level':a,'Trend':b,'Season':s,'Error':d-f})
    return df

In [23]:
import sklearn.metrics
from sklearn.metrics import mean_squared_error

In [25]:
import warnings
warnings.filterwarnings("ignore")

# Tuned Optimization: Triple Expo     
RMSE={}
forecast_model={}

alpha=[0.13,0.15,0.16]
beta=[0,0,0]
phi=[0,0,0]
gamma=[0.015,0.020,0.025]

min_error=float('inf')
min_alph_index=0
min_beta_index=0
min_phi_index=0
min_gamma_index=0
opt_index = 0

inc=0 #alpha iteration
ite=0 #beta iteration
ipe=0 #phi iteration
ige=0 #gamma iteration

outputs=[]
inc_i=0
for i in karo_agg.item.unique():
        #if i==9:
            df=karo_agg[(karo_agg.item==i)][['Date','order']]
            df=df.sort_values(by='Date',ascending=True)

            min_error=float('inf')

            if i in karo_agg['item'].unique():
                outputs=[]
                inc=0
                size=len(df)
                for t in alpha:
                    ite=0
                    for z in beta:
                        ipe=0
                        for r in phi:
                            ige=0
                            for w in gamma:
                                df_triple=triple_exp_smooth_mul_t(df,slen=12,extra_periods=6,alpha=t,beta=z,phi=r,gamma=w,item=i)
                                future_date=np.array(['2020-8', '2020-9', '2020-10', '2020-11', '2020-12', '2021-1'])
                                for h in range(0,6): # add dates of validation set
                                    df_triple.Date[size+h]=future_date[h]
                                outputs.append(df_triple[['Date','Item','Forecast']][-6:]) # append all dataframe for each store & dept combination
                                #rmse_score = math.sqrt(np.square(np.subtract(df_triple.Demand[size-6:size],df_triple.Forecast[size-6:size])).mean())
                                rmse_score = math.sqrt(sklearn.metrics.mean_squared_error(df_triple.Demand[size-6:size],df_triple.Forecast[size-6:size]))
                                if rmse_score < min_error:
                                    min_error=rmse_score
                                    min_alph_index=inc
                                    min_beta_index=ite
                                    min_phi_index=ipe
                                    min_gamma_index=ige
                                    opt_index = (min_alph_index*27 + min_beta_index*9 + min_phi_index*3 + min_gamma_index +1) -1 # record opt. index
                                ige=ige+1
                            ipe=ipe+1
                        ite=ite+1
                    inc=inc+1
                opt_alpha=alpha[min_alph_index]
                opt_beta=beta[min_beta_index]
                opt_phi=phi[min_phi_index]
                opt_gamma=gamma[min_gamma_index]
                forecast_model[i]=outputs[opt_index]
                RMSE[i,opt_alpha,opt_beta,opt_phi,opt_gamma]=round(rmse_score,4)
            else:   
                RMSE[i,0,0,0,0]=0

In [26]:
# Evaluate performance of the Multiplicative Triple Exponential Model
item=[(k[0]) for k,v in RMSE.items()]
item=pd.DataFrame(item,columns=['Item'])

alpha=[(k[1]) for k,v in RMSE.items()]
alpha=pd.DataFrame(alpha,columns=['Alpha_Triple'])

beta=[(k[2]) for k,v in RMSE.items()]
beta=pd.DataFrame(beta,columns=['Beta_Triple'])

phi=[(k[3]) for k,v in RMSE.items()]
phi=pd.DataFrame(phi,columns=['Phi'])

gamma=[(k[4]) for k,v in RMSE.items()]
gamma=pd.DataFrame(gamma,columns=['Gamma'])

RMSE_per_trip=[v for k,v in RMSE.items()]
RMSE_per_trip=pd.DataFrame(RMSE_per_trip,columns=['TripleExp_Result'])

triple_performance=pd.concat([item,alpha,beta,phi,gamma,RMSE_per_trip],axis=1)
triple_performance.dropna(subset=['TripleExp_Result'], how='all', inplace=True)
triple_performance.head()

Unnamed: 0,Item,Alpha_Triple,Beta_Triple,Phi,Gamma,TripleExp_Result
0,0.0,0.15,0,0,0.015,925.7702
1,1.0,0.16,0,0,0.015,839.6644
2,2.0,0.16,0,0,0.015,594.9351
3,3.0,0.13,0,0,0.015,1151.2425
4,4.0,0.13,0,0,0.015,469.5487


In [27]:
final_triple=triple_performance.copy()
final_triple['Method']='TripleExpo'

In [28]:
final_triple['TripleSquare']=final_triple.TripleExp_Result.apply(lambda x:x**2)
math.sqrt(final_triple['TripleSquare'].mean())

1228.2275378393128

In [29]:
final_triple['TripleExp_Result'].mean()

967.1602024096385

In [37]:
final_triple.sort_values(by='TripleExp_Result',ascending=False).head()

Unnamed: 0,Item,Alpha_Triple,Beta_Triple,Phi,Gamma,TripleExp_Result,Method,TripleSquare
70,70.0,0.16,0,0,0.015,3481.9822,TripleExpo,12124200.0
9,9.0,0.13,0,0,0.025,3478.6329,TripleExpo,12100890.0
11,11.0,0.13,0,0,0.015,2962.6256,TripleExpo,8777150.0
71,71.0,0.16,0,0,0.015,2615.6188,TripleExpo,6841462.0
79,79.0,0.13,0,0,0.015,2563.4714,TripleExpo,6571386.0


In [38]:
# Evaluate performance of the Multiplicative Triple Exponential Model
nrow=len(forecast_model.keys())
df_forecast_tuned=pd.DataFrame()
for i in range(nrow):
    df_inter=forecast_model[list(forecast_model.keys())[i]]
    df_forecast_tuned=df_forecast_tuned.append(df_inter)

In [39]:
df_forecast_tuned=df_forecast_tuned.reset_index(drop=True)

In [40]:
df_forecast_tuned.head()

Unnamed: 0,Date,Item,Forecast
0,2020-8,0.0,1904.25568
1,2020-9,0.0,2120.533156
2,2020-10,0.0,1127.196011
3,2020-11,0.0,878.173438
4,2020-12,0.0,1353.29425


------

In [41]:
final_submit=df_forecast_tuned

In [42]:
final_submit.reset_index(drop=True,inplace=True)

In [43]:
len(final_submit)

498

**Transaction ID**

In [46]:
id_data= pd.read_excel('~/Eczacibasi-kaggle/dataset/id.xlsx')

In [49]:
id_data.head()

Unnamed: 0,id,customer,item,date
0,1,A,0,2020-10
1,2,A,0,2020-11
2,3,A,0,2020-12
3,4,A,0,2020-8
4,5,A,0,2020-9


In [50]:
id_data.columns=['id','customer','Item','Date']
id_data.head()

Unnamed: 0,id,customer,Item,Date
0,1,A,0,2020-10
1,2,A,0,2020-11
2,3,A,0,2020-12
3,4,A,0,2020-8
4,5,A,0,2020-9


In [51]:
final=pd.merge(left=id_data,right=final_submit,on=['Date','Item'],how='right')
final=final[['id','Forecast']]
final=final.sort_values(by='id')

In [52]:
len(final)

498

In [54]:
final=final.reset_index(drop=True)
final.head()

Unnamed: 0,id,Forecast
0,1,1127.196011
1,2,878.173438
2,3,1353.29425
3,4,1904.25568
4,5,2120.533156


**Write csv**

In [56]:
final.to_csv('forecast.csv',index=False)

-----