In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
import datetime
import pmdarima as pm
from sklearn.metrics import mean_absolute_error as mae
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf,pacf
import scipy.stats as st
import itertools
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Display names of every candidate forecaster; later cells rely on this order
# when mapping a winner index back to a name.
models = [
    'ARIMA',
    'SARIMA',
    'Moving Average',
    'Random Forest',
    'XGBoost',
    'Average',
]

In [3]:
# Name of the column in `data` that holds the series being forecast.
d = 'P1'

In [4]:
# Number of leading rows of `data` used as the training window (see
# train_test_data_prep_ts). Read interactively, so a re-run needs the same answer.
training_periods = int(input('Enter the time periods for training: '))

Enter the time periods for training: 45


In [5]:
# Future horizon to forecast; also used as the lag distance of the regression
# features (Lag_p / Lag_p1). Read interactively.
forecast_periods = int(input('Enter the time periods for forecast: '))

Enter the time periods for forecast: 12


In [6]:
# Load the sales history; the 'Date' column is parsed as datetimes and becomes the index.
data = pd.read_csv(
    'P1.csv',
    index_col='Date',
    parse_dates=True,
)

In [7]:
def return_stationary(sales_orignal):
    """Difference a series until the ADF test rejects a unit root.

    The input is returned untouched (with a count of 0) unless its ADF p-value
    is above 0.05. Otherwise first differences are taken repeatedly, dropping
    the leading NaN each time, until the p-value is at most 0.05 or four
    rounds have been applied.

    Returns:
        (series, rounds): the possibly differenced series and the number of
        differencing rounds that were applied.
    """
    differencing_rounds = 0

    # Guard clause: nothing to do when the p-value is not above the 5% level.
    if not adfuller(sales_orignal)[1] > 0.05:
        return sales_orignal, differencing_rounds

    while differencing_rounds <= 3:
        differencing_rounds += 1
        sales_orignal = (sales_orignal - sales_orignal.shift(1)).dropna()
        if adfuller(sales_orignal)[1] <= 0.05:
            break

    return sales_orignal, differencing_rounds

In [8]:
# Difference the full dataset until the ADF test passes. `order_of_differencing`
# is reused later as the differencing term of the SARIMA order search.
stationary_series,order_of_differencing = return_stationary(data)

In [9]:
def train_test_data_prep_ts(data):
    """Split `data` positionally into (train, test) at the notebook-level `training_periods`."""
    split_at = training_periods
    return data[:split_at], data[split_at:]

In [10]:
def train_test_data_prep_reg(data,periods_of_forecast):
    """Build lag features for the regressors and split off the last rows as a test set.

    Two features are derived from column `d`: its value `forecast_periods`
    steps back and `forecast_periods + 1` steps back (both notebook globals).
    Rows made incomplete by the shift are dropped.

    Args:
        data: DataFrame containing column `d`.
        periods_of_forecast: number of trailing rows reserved for testing.

    Returns:
        X_train, X_test: 2-column feature arrays.
        y_train, y_test: target arrays shaped (n, 1).
    """
    lagged = data.copy(deep=True)
    lagged['Lag_p'] = lagged[d].shift(forecast_periods)
    lagged['Lag_p1'] = lagged[d].shift(forecast_periods + 1)
    lagged = lagged.dropna()

    # One row per observation, columns = (Lag_p, Lag_p1).
    features = np.column_stack((lagged['Lag_p'], lagged['Lag_p1']))
    target = np.array(lagged[d]).reshape(-1, 1)

    X_train, X_test = features[:-periods_of_forecast], features[-periods_of_forecast:]
    y_train, y_test = target[:-periods_of_forecast], target[-periods_of_forecast:]
    return X_train, X_test, y_train, y_test

In [11]:
def reg_model_data_generator(data,periods):
    """Prepare full-history training data and the feature rows for a future forecast.

    Args:
        data: DataFrame containing column `d` (notebook global).
        periods: forecast horizon; also the lag distance of the features.

    Returns:
        x_model: 2-column array of (lag `periods`, lag `periods + 1`) features.
        y_model: 1-D array of the matching target values.
        x_pred: `periods` feature rows built from the most recent observations,
            used to predict the next `periods` steps.
    """
    history = data.copy(deep=True)
    history['Lag_p'] = history[d].shift(periods)
    history['Lag_p1'] = history[d].shift(periods + 1)
    history = history.dropna()

    observed = history[d]
    x_model = np.column_stack((history['Lag_p'], history['Lag_p1']))
    y_model = np.array(observed)

    # For the next `periods` steps the lag-`periods` feature is the last
    # `periods` observations and the lag-`periods + 1` feature is the same
    # window shifted back by one.
    x_pred = np.column_stack((observed[-periods:], observed[-(periods + 1):-1]))
    return x_model, y_model, x_pred

In [12]:
# Positional split of the raw series into the training window and the held-out test window.
train_data,test_data = train_test_data_prep_ts(data)

In [13]:
# Lag-feature arrays for the tree models: train_*/test_* evaluate on the test
# window, while xmodel/ymodel/xpred refit on all data and forecast the future horizon.
train_X,test_X,train_Y,test_Y = train_test_data_prep_reg(data,len(test_data))
xmodel,ymodel,xpred = reg_model_data_generator(data,forecast_periods)

In [None]:
# (Removed) This cell built the model lookup before ar_prediction,
# sarima_prediction, rf_prediction and xgb_prediction were defined, so it raised
# NameError on a fresh "Restart & Run All". The same lookup is built in a later
# cell, after those definitions.

In [14]:
def metrics(predictions,targets):
    """Return (RMSE, MAE, MAPE) of `predictions` against `targets`, each rounded to 2 dp.

    The order matches the labels used for the metrics table and for
    `mapping_dict` (RMSE=0, MAE=1, MAPE=2); the previous implementation returned
    MAPE before MAE, so those two rows were mislabelled.

    Inputs are compared element by element in positional order (any pandas
    index is ignored). MAPE is expressed in percent and is infinite/NaN when a
    target equals zero.

    Args:
        predictions: array-like of forecast values.
        targets: array-like of actual values, same length as `predictions`.

    Returns:
        Tuple (rmse, mae, mape).
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    errors = predictions - targets

    rmse = np.sqrt(np.mean(errors ** 2))
    mae_value = np.mean(np.abs(errors))
    mape = np.mean(np.abs(errors / targets)) * 100
    return round(rmse, 2), round(mae_value, 2), round(mape, 2)

In [15]:
def visuals(model, prediction=None):
    """Plot a model's test-window fit and, optionally, its future forecast.

    Figure 1 shows the actual test values (`data_test_pred[d]`) against
    `data_test_pred[model]`. Figure 2 shows the full history `data[d]` plus
    `prediction` when one is available.

    Args:
        model: column name in `data_test_pred` holding the model's test forecast.
        prediction: optional future forecast to overlay on the history. When
            omitted, a notebook-level variable named `prediction` is used if it
            exists (the original body read that global unconditionally and
            raised NameError when it was missing).
    """
    if prediction is None:
        prediction = globals().get('prediction')

    plt.figure(figsize=(12,6))
    plt.plot(data_test_pred[d])
    plt.plot(data_test_pred[model])
    plt.legend(["Actual","Prediction"], loc ="lower right")

    plt.figure(figsize=(12,6))
    plt.plot(data[d])
    if prediction is not None:
        plt.plot(prediction)
        plt.legend(["Actual","Prediction"], loc ="upper right")
    else:
        plt.legend(["Actual"], loc ="upper right")

In [16]:
def ma12_prediction(sales,periods_of_forecast):
    """Forecast `periods_of_forecast` steps with a pure moving-average model, ARIMA(0, 0, 9).

    NOTE(review): the function name suggests a 12-term window but the fitted MA
    order is 9 - confirm which one is intended.
    """
    fitted = ARIMA(sales, order=(0,0,9)).fit()
    return fitted.forecast(steps=periods_of_forecast)

In [17]:
def ar_prediction(sales,periods_of_forecast):
    """Forecast with a non-seasonal ARIMA whose order is chosen by pmdarima's stepwise search.

    Args:
        sales: DataFrame containing column `d` (notebook global) to fit on.
        periods_of_forecast: number of future steps to predict.

    Returns:
        The model's predictions for the next `periods_of_forecast` steps.
    """
    # auto_arima returns the selected model already fitted on the series, so the
    # extra `.fit(sales[d])` the original performed was a redundant second fit.
    fitted_model = pm.auto_arima(
        sales[d],
        m=0, seasonal=False,
        start_p=0, start_q=0, max_order=2, test='adf',
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True, trace=True,
    )
    return fitted_model.predict(n_periods=periods_of_forecast)

In [18]:
def sarima_prediction(sales,periods_of_forecast):
    """Forecast with a SARIMAX model whose (p, d, q) is picked by lowest ARIMA AIC.

    Every (p, d, q) with p, q in 0..2 and d fixed to the notebook-level
    `order_of_differencing` is fitted as a plain ARIMA on `sales[d]`; the order
    with the smallest finite AIC is then reused for both the non-seasonal and
    the seasonal part (period 12) of a SARIMAX model.

    Fixes relative to the original:
      * the order search now fits on `sales` (the argument) instead of the
        global `data`, so the test window no longer leaks into model selection;
      * the best order is taken from the lowest-AIC candidate - the original
        sorted the table and then read label 0, i.e. always the first candidate;
      * AIC records are collected in a list (DataFrame.append no longer exists
        in pandas 2);
      * the forecast start is `len(series)` rather than `sales.count()[0]`.

    Args:
        sales: DataFrame containing column `d` to fit on.
        periods_of_forecast: number of out-of-sample steps to predict.

    Returns:
        Predictions for the `periods_of_forecast` steps after the end of `sales`.

    Raises:
        ValueError: if no candidate order produced a finite AIC.
    """
    series = sales[d]
    candidate_orders = itertools.product(range(0, 3), [order_of_differencing], range(0, 3))

    aic_records = []
    for order in candidate_orders:
        aic = ARIMA(series, order=order).fit().aic
        if np.isfinite(aic):
            aic_records.append((aic, order))
    if not aic_records:
        raise ValueError('No ARIMA candidate produced a finite AIC')

    # min() keeps the first candidate on ties, mirroring an ascending sort.
    best_order = min(aic_records, key=lambda record: record[0])[1]

    model = sm.tsa.statespace.SARIMAX(
        series,
        order=best_order,
        seasonal_order=(best_order[0], best_order[1], best_order[2], 12),
    )
    results = model.fit()

    n_obs = len(series)
    predictions = results.predict(start=n_obs, end=n_obs + periods_of_forecast - 1, dynamic=True)
    return predictions

In [19]:
def rf_prediction(X_training,y_training,X_prediction):
    """Fit a 100-tree random forest (seeded with random_state=1) and predict `X_prediction`."""
    forest = RandomForestRegressor(n_estimators=100, max_features=2, random_state=1)
    return forest.fit(X_training, y_training).predict(X_prediction)

In [20]:
def xgb_prediction(X_train,y_train,X_test):
    """Fit an XGBRegressor with default settings on the training data and predict `X_test`."""
    booster = XGBRegressor()
    return booster.fit(X_train, y_train).predict(X_test)

In [None]:
def average_model(predictions, model_columns=('ARIMA', 'SARIMA', 'Random Forest', 'Moving Average', 'XGBoost')):
    """Return the row-wise mean of the individual model forecasts.

    The original cell was an unfinished `def average_model` (a SyntaxError).
    NOTE(review): the signature is inferred from how the 'Average' column is
    computed elsewhere in this notebook - confirm it matches the intent.

    Args:
        predictions: DataFrame with one forecast column per model.
        model_columns: names of the columns to average.

    Returns:
        Series holding the mean across `model_columns` for each row.
    """
    return predictions[list(model_columns)].mean(axis=1)

In [None]:
# Test-window actuals side by side with every model's forecast for that window.
data_test_pred = test_data.copy(deep=True).assign(**{
    'ARIMA': ar_prediction(train_data, len(test_data)),
    'SARIMA': sarima_prediction(train_data, len(test_data)),
    'Random Forest': rf_prediction(train_X, train_Y, test_X),
    'Moving Average': ma12_prediction(train_data, len(test_data)),
    'XGBoost': xgb_prediction(train_X, train_Y, test_X),
})
# Ensemble forecast: plain mean of the five individual models.
data_test_pred['Average'] = data_test_pred[
    ['ARIMA', 'Random Forest', 'Moving Average', 'XGBoost', 'SARIMA']
].mean(axis=1)

In [None]:
# Display the list of candidate model names.
models

In [None]:
# Lookup from model name to its forecasting function. The original stored the
# already-computed forecasts (refitting all four models), yet a following cell
# calls an entry as `x(train_data, len(test_data))`, which needs a callable.
# NOTE: the name `dict` shadows the Python builtin; it is kept only because the
# following cells read `dict[...]`.
dict = {
    'ARIMA': ar_prediction,
    'SARIMA': sarima_prediction,
    'Random Forest': rf_prediction,
    'XGBOOST': xgb_prediction,
}

In [None]:
# Scratch: pull the 'ARIMA' entry out of the lookup.
PR = dict['ARIMA']

In [None]:
# Display the 'ARIMA' entry fetched in the previous cell.
PR

In [None]:
# Scratch: same lookup as `PR`, bound to `x` so it can be called in the next cell.
x = dict['ARIMA']

In [None]:
# NOTE(review): this call only works if dict['ARIMA'] holds a callable
# forecasting function rather than a precomputed forecast.
x(train_data,len(test_data))

In [None]:
def model_calling(model, *args, **kwargs):
    """Invoke `model` with the given arguments and return its result.

    The original assigned the result to a local and implicitly returned None,
    so callers could never see the prediction. Extra positional/keyword
    arguments are forwarded so forecasting functions that need inputs can be
    dispatched through this helper; calling with `model` alone still works.

    Args:
        model: callable to invoke.
        *args, **kwargs: forwarded to `model`.

    Returns:
        Whatever `model(*args, **kwargs)` returns.
    """
    return model(*args, **kwargs)

In [None]:
# One column per model, one row per metric; rows are labelled by `list_metric`.
list_metric = ['RMSE','MAE','MAPE']
metrics_table = pd.DataFrame(
    {
        model_name: list(metrics(data_test_pred[model_name], test_data[d]))
        for model_name in models
    },
    columns=models,
    index=pd.Index(list_metric, name='Metric'),
)

In [None]:
# Display the per-model error metrics on the test window.
metrics_table

In [None]:
# Maps each metric name to its row position in the per-model metric values.
mapping_dict = {"RMSE":0,"MAE":1,"MAPE":2}
# Ask which metric decides the winner. Surrounding whitespace and letter case
# are ignored, and an unknown name fails here with a clear message instead of
# surfacing later as a bare KeyError.
user_input_metric = input('Enter the metric: ').strip().upper()
if user_input_metric not in mapping_dict:
    raise ValueError(
        'Unknown metric {!r}; expected one of {}'.format(user_input_metric, list(mapping_dict))
    )

In [None]:
# Compares the user-preferred metric across models and returns the winner's index:
#   index 0 - ARIMA (auto)
#   index 1 - SARIMA
#   index 2 - Moving Average
#   index 3 - Random Forest
#   index 4 - XGBoost
#   index 5 - Average of all models
def winner_model(ar,sarima,ma,rf,xgb,avg):
    """Return the position of the model with the lowest value of the chosen metric.

    The metric is selected through the notebook-level `user_input_metric`,
    translated to a row position by `mapping_dict`. Each argument holds one
    model's metric values in that row order (a pandas Series or any sequence).
    On ties the earliest model wins.
    """
    position = mapping_dict[user_input_metric]

    def _score(model_metrics):
        # Series coming from the metrics table are indexed by metric *name*, so
        # a plain integer key relies on a deprecated positional fallback; use
        # .iloc when it is available.
        if hasattr(model_metrics, 'iloc'):
            return model_metrics.iloc[position]
        return model_metrics[position]

    scores = [_score(model_metrics) for model_metrics in (ar, sarima, ma, rf, xgb, avg)]
    return min(range(len(scores)), key=scores.__getitem__)

In [None]:
# Index of the best model; `models` lists the names in the same order as
# winner_model's parameters (ARIMA, SARIMA, Moving Average, Random Forest, XGBoost, Average).
winning_model = winner_model(*[metrics_table[model_name] for model_name in models])

In [None]:
# Re-declares the model names with the same values and order as the list defined near the top of the notebook.
models = ['ARIMA','SARIMA','Moving Average','Random Forest','XGBoost','Average']

In [None]:
# Winner index -> model name. (A comprehension is used rather than the `dict`
# constructor because an earlier cell rebinds the name `dict`.)
winner = {position: model_name for position, model_name in enumerate(models)}

In [None]:
def visualize(prediction_test,prediction_future):
    """Draw two figures: test-window fit, then full history with the future forecast.

    Args:
        prediction_test: forecast for the test window, plotted against
            `data_test_pred[d]`.
        prediction_future: forecast beyond the data, plotted with `data[d]`.
    """
    _, test_ax = plt.subplots(figsize=(12, 6))
    test_ax.plot(data_test_pred[d])
    test_ax.plot(prediction_test)
    test_ax.legend(["Actual","Prediction"], loc="lower right")

    _, history_ax = plt.subplots(figsize=(12, 6))
    history_ax.plot(data[d])
    history_ax.plot(prediction_future)
    history_ax.legend(["Actual","Prediction"], loc="upper right")

In [None]:
# Forecast the future horizon with the model that won on the test window.
# `winning_model` is the index returned by winner_model(), in the order
# ARIMA, SARIMA, Moving Average, Random Forest, XGBoost, Average.
# (The original branched on `winner_metric`, a name that is never assigned.)
if winning_model == 0:  # auto-ARIMA is the winner
    n_pred = ar_prediction(data,forecast_periods)
    visualize(data_test_pred['ARIMA'],n_pred)
elif winning_model == 1:  # SARIMA is the winner
    n_pred = sarima_prediction(data,forecast_periods)
    visualize(data_test_pred['SARIMA'],n_pred)
elif winning_model == 2:  # Moving Average is the winner
    n_pred = ma12_prediction(data,forecast_periods)
    visualize(data_test_pred['Moving Average'],n_pred)
elif winning_model == 3:  # Random Forest is the winner
    # Refit on the full history and predict from the most recent lags.
    n_pred=rf_prediction(xmodel,ymodel,xpred)
    visualize(data_test_pred['Random Forest'],n_pred)
elif winning_model == 4:  # XGBoost is the winner
    # Refit on the full history and predict from the most recent lags.
    n_pred=xgb_prediction(xmodel,ymodel,xpred)
    visualize(data_test_pred['XGBoost'],n_pred)
elif winning_model == 5:  # Average of all models is the winner
    overall_pred = pd.DataFrame(columns=models)
    overall_pred['ARIMA']=ar_prediction(data,forecast_periods)
    overall_pred['SARIMA']=sarima_prediction(data,forecast_periods)
    overall_pred['Random Forest']=rf_prediction(xmodel,ymodel,xpred)
    overall_pred['Moving Average']=ma12_prediction(data,forecast_periods)
    overall_pred['XGBoost']=xgb_prediction(xmodel,ymodel,xpred)
    # Average only the five model columns; the frame also carries the still
    # empty 'Average' column, which must not take part in its own mean.
    overall_pred['Average']= overall_pred[
        ['ARIMA','SARIMA','Random Forest','Moving Average','XGBoost']
    ].mean(axis=1)
    n_pred = overall_pred['Average']
    visualize(data_test_pred['Average'],n_pred)

In [None]:
# NOTE(review): `overall_pred` is only assigned in the branch where the Average
# model wins, so this display raises NameError for any other winner.
overall_pred

In [None]:
# Scratch check: Random Forest forecast for the future horizon, refit on all data.
rf_prediction(xmodel,ymodel,xpred)

In [None]:
# Scratch check: auto-ARIMA forecast for the future horizon, fit on all data.
ar_prediction(data,forecast_periods)

In [None]:
# NOTE(review): `ar_metric_return` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - confirm whether
# it belonged to a deleted cell.
ar_metric_return(data,10)