### Data preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import holidays

In [None]:
# Load the raw transaction log; process() below reads its 'TransactionDateTime' column.
df=pd.read_csv('/issue.csv')


In [None]:
def process(df):
  """Count the transactions recorded in every (date, hour) bucket.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw transactions with a 'TransactionDateTime' column holding day-first
      date strings (or anything ``pd.to_datetime`` accepts).

  Returns
  -------
  pandas.DataFrame
      One row per (date, hour) bucket, indexed by 'date'. Every remaining
      column, including 'volume', holds the number of non-null entries of
      that column inside the bucket.

  The input frame is left untouched: the helper columns are built on a derived
  frame instead of being written back into the caller's data.
  """
  stamps=pd.to_datetime(df['TransactionDateTime'],dayfirst=True)
  # 'volume' only has to be a non-null value per transaction: count() below
  # turns it into the number of transactions in each bucket.
  frame=df.drop(['TransactionDateTime'],axis=1).assign(
      date=stamps.dt.date,
      hour=stamps.dt.hour,
      volume=stamps.dt.minute,
  )
  hourly=frame.groupby(['date','hour']).count().reset_index()
  # NOTE(review): the hour is dropped here, so rows that share a date can only
  # be told apart by their (ascending-hour) order.
  return hourly.drop(['hour'],axis=1).set_index('date')

In [None]:
# Aggregate the raw transactions into per-(date, hour) counts.
univariate_df=process(df)

**After processing, the dataframe for univariate analysis looks like this:**

In [33]:
# Hourly volume series, keyed by its 'DateTime' column.
df1=pd.read_csv('/sarvatra.csv',index_col='DateTime')
df1.head()

Unnamed: 0_level_0,volume
DateTime,Unnamed: 1_level_1
2017-04-01 00:00:00,10
2017-04-01 01:00:00,12
2017-04-01 02:00:00,5
2017-04-01 03:00:00,5
2017-04-01 04:00:00,17


In [34]:
# Last rows of the hourly series, to check where the data ends.
df1.tail()

Unnamed: 0_level_0,volume
DateTime,Unnamed: 1_level_1
2018-07-31 19:00:00,545
2018-07-31 20:00:00,874
2018-07-31 21:00:00,386
2018-07-31 22:00:00,223
2018-07-31 23:00:00,98


**For the multivariate models we add some features engineered from the DateTime column**

In [None]:
# Build the calendar features used by the multivariate models from df1's timestamps.
df2=pd.DataFrame()
# Move the DateTime index into a column once; the guard keeps a re-run of this
# cell from resetting the index a second time and adding a stray 'index' column.
if 'DateTime' not in df1.columns:
    df1=df1.reset_index()
df1['DateTime']=pd.to_datetime(df1['DateTime'],dayfirst=True)
in_holidays=holidays.India()
df2['weekday']=df1['DateTime'].dt.weekday
df2['volume']=df1['volume']
df2['month']=df1['DateTime'].dt.month
df2['hour']=df1['DateTime'].dt.hour
df2['day']=df1['DateTime'].dt.day
# weekday is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend. The weekday column
# lives on df2 (df1 only has DateTime and volume), so it must be read from df2.
df2['is_weekend']=df2['weekday'].isin([5,6]).astype(int)
df2['start_of_month']=df1['DateTime'].dt.is_month_start.astype(int)
df2['end_of_month']=df1['DateTime'].dt.is_month_end.astype(int)
# 1 when the calendar date (the part of the timestamp before the space) is an Indian holiday
df2['holidays']=[1 if str(val).split()[0] in in_holidays else 0 for val in df1['DateTime']]

def daypart(hour):
    """Map an hour of the day to the name of the day part it belongs to.

    Hours 2-5 are "dawn", 6-9 "morning", 10-13 "noon", 14-17 "afternoon" and
    18-21 "evening"; every other value (22, 23, 0, 1, ...) is "midnight".
    """
    buckets = (
        ("dawn", (2, 3, 4, 5)),
        ("morning", (6, 7, 8, 9)),
        ("noon", (10, 11, 12, 13)),
        ("afternoon", (14, 15, 16, 17)),
        ("evening", (18, 19, 20, 21)),
    )
    for label, hours in buckets:
        if hour in hours:
            return label
    return "midnight"
# label every row's hour with its part of the day
raw_dayparts = df2['hour'].map(daypart)
# one-hot encode the labels and put the indicator columns in chronological order
dayparts = pd.get_dummies(raw_dayparts).loc[
    :, ['dawn','morning','noon','afternoon','evening','midnight']
]
# show the encoded frame
dayparts

In [None]:
# Append the one-hot day-part columns to the engineered features (column-wise).
df2=pd.concat([df2,dayparts],axis=1)

In [None]:
# df2 was built without a 'DateTime' column, so set_index('DateTime') would raise a
# KeyError. Use the timestamps of df1 instead; the two frames are row-aligned
# because every df2 column was derived from df1.
df2=df2.set_index(df1['DateTime'])

In [None]:
# Put the columns in their final order: calendar features, day-part flags, then the target.
df2=df2.loc[:, [
    'month','day','weekday','start_of_month','end_of_month','is_weekend','holidays','hour',
    'dawn','morning','noon','afternoon','evening','midnight',
    'volume',
]]

**After the above steps our data will look like this**

In [30]:
# Engineered multivariate frame, keyed by its 'DateTime' column.
df3=pd.read_csv('/presentationfinal.csv',index_col='DateTime')
df3.head()

Unnamed: 0_level_0,month,day,weekday,start_of_month,end_of_month,is_weekend,holidays,hour,dawn,morning,noon,afternoon,evening,midnight,volume
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-04-01 00:00:00,4,1,5,1,0,1,0,0,0,0,0,0,0,1,10
2017-04-01 01:00:00,4,1,5,1,0,1,0,1,0,0,0,0,0,1,12
2017-04-01 02:00:00,4,1,5,1,0,1,0,2,1,0,0,0,0,0,5
2017-04-01 03:00:00,4,1,5,1,0,1,0,3,1,0,0,0,0,0,5
2017-04-01 04:00:00,4,1,5,1,0,1,0,4,1,0,0,0,0,0,17


### Scaling and models

In [43]:
from pickle import dump,load

In [None]:
# Load the fitted scalers for the univariate and the multivariate models.
# The with-blocks close the files, which the inline open() calls never did.
# NOTE: unpickling runs code stored in the file, so only load trusted pickles.
with open('/univariatescaler.pkl', 'rb') as fh:
    scaler = load(fh)
with open('/multivariatescaler.pkl', 'rb') as fh:
    scaler1 = load(fh)

In [None]:
def split_sequence(sequence,n_steps):
  """Cut a sequence into sliding windows and the element that follows each one.

  Returns a pair of arrays (X, y): X[k] is sequence[k:k+n_steps] and y[k] is
  sequence[k+n_steps]. Both arrays are empty when the sequence is too short
  to hold a window plus its following element.
  """
  last=len(sequence)-1
  # a window may start at s only if the element right after it still exists
  starts=[s for s in range(len(sequence)) if s+n_steps<=last]
  windows=[sequence[s:s+n_steps] for s in starts]
  targets=[sequence[s+n_steps] for s in starts]
  return np.array(windows),np.array(targets)

## Forecasting

### Univariate

In [45]:
import tensorflow as tf
# Load the three trained univariate forecasters from their saved .h5 files.
# The forecasting cells feed lstm and cnn a (1, 168, 1) input and hybrid a (1, 2, 84, 1) input.
lstm=tf.keras.models.load_model('/lstm168final.h5')
hybrid=tf.keras.models.load_model('/combined2.h5')
cnn=tf.keras.models.load_model('/cnn3.h5')

**For ensemble forecasting we use the following code**

In [None]:
# Ensemble forecast: predict the 24 hours of one day, one hour at a time, letting the
# hybrid, CNN and LSTM models take turns. Every prediction is appended to the input
# window so that it feeds the following hour (recursive forecasting).
from numpy import array
# The window holds the 168 hourly values before the forecast day. 1 July starts at
# row 10944, so the window is rows 10776:10944; add 24 to both bounds for each later day.
# Only 'volume' is selected so the cell also works once df1's DateTime index has been
# reset into a column by the feature-engineering cell.
x_input=df1[['volume']].iloc[10776:10944]
x_input=scaler.transform(x_input)
temp_input=x_input.reshape(-1).tolist()
lst_output=[]
n_steps=2
n_length=84
n_st=168
# (model, input shape) in rotation order: the hybrid model takes the window as n_steps
# sub-sequences of n_length values, the other two as a single n_st-long sequence.
rotation=[(hybrid,(1,n_steps,n_length,1)),(cnn,(1,n_st,1)),(lstm,(1,n_st,1))]
turn=0
i=0
while i<24:  # i is the hour of the forecast day
    m1,input_shape=rotation[turn]
    x_input=np.array(temp_input[-n_st:]).reshape(input_shape)
    yhat=m1.predict(x_input,verbose=0)
    print("{} hour output {}".format(i,yhat))
    # slide the window: append the prediction, keep the latest n_st values
    temp_input.extend(yhat[0].tolist())
    temp_input=temp_input[-n_st:]
    lst_output.extend(yhat.tolist())
    # NOTE(review): the rotation only starts after the first hour, so hours 0 and 1 are
    # both predicted by the hybrid model. This keeps the model order this cell has
    # always produced; confirm whether hour 1 was meant to go to the CNN.
    if i>0:
        turn=(turn+1)%len(rotation)
    i=i+1
print(lst_output)

**For forecasting using only univariate lstm we use the following**

In [None]:
# LSTM-only forecast: predict the 24 hours of one day, one hour at a time. Every
# prediction is appended to the input window so that it feeds the following hour.
from numpy import array
# The window holds the 168 hourly values before the forecast day. 1 July starts at
# row 10944, so the window is rows 10776:10944; add 24 to both bounds for each later day.
# Only 'volume' is selected so the cell also works once df1's DateTime index has been
# reset into a column by the feature-engineering cell.
x_input=df1[['volume']].iloc[10776:10944]
x_input=scaler.transform(x_input)
temp_input=x_input.reshape(-1).tolist()
lst_output=[]
n_st=168
i=0
while i<24:  # i is the hour of the forecast day
    x_input=np.array(temp_input[-n_st:]).reshape((1,n_st,1))
    yhat=lstm.predict(x_input,verbose=0)
    print("{} hour output {}".format(i,yhat))
    # slide the window: append the prediction, keep the latest n_st values
    temp_input.extend(yhat[0].tolist())
    temp_input=temp_input[-n_st:]
    lst_output.extend(yhat.tolist())
    i=i+1
print(lst_output)

**After forecasting using any of the above methods we will display the results**

In [None]:
# Bring the scaled forecasts back to the original volume units.
y_pred=scaler.inverse_transform(np.array(lst_output))

# One row per forecast hour.
July=pd.DataFrame({'predicted': y_pred.reshape(-1)})


In [None]:
# Observed hourly volumes of the forecast day: rows 10944:10968 of the hourly series,
# i.e. the 24 rows right after the 168-row input window. They must come from df1 (the
# hourly series); df is the raw transaction log and has no hourly volume.
actual=df1.iloc[10944:10968]
actual

In [None]:
import plotly.graph_objects as go
# Overlay the observed and the forecast hourly volume for the chosen day.
fig = go.Figure()
# observed values
fig.add_trace(go.Scatter(x=actual.index, y=actual.volume,
                    mode='lines',
                    name='actual'))
# forecast values, plotted against the same 24 positions
fig.add_trace(go.Scatter(x=actual.index, y=July['predicted'],
                    mode='lines',
                    name='predicted'))

fig.update_layout(title='Hourly volume: actual vs predicted', yaxis_title='volume')
fig.update_xaxes(rangeslider_visible=True)
fig.show()

### Multivariate

In [None]:
import tensorflow as tf
# Load the trained multivariate LSTM; the forecasting cell feeds it (1, 168, 15) inputs.
model=tf.keras.models.load_model('/multilstmfinal.h5')

In [None]:
#For forecasting day-wise for the month of July
test=df3.iloc[10776:]#Everything from row 10776 on: the 168 rows before 1 July plus all later rows.
testsc=scaler1.transform(test)
# X_test[k] is a 168-row window of scaled rows and y_test[k] is the scaled row that follows it.
X_test,y_test=split_sequence(testsc,168)
# Last column of every target row (volume is the last column of df3), kept as a column vector.
y=y_test[:,-1].reshape(-1,1)

In [None]:
# Multivariate forecast: predict the 24 hours of one day, one hour at a time. After
# every step a new input row is built from that hour's known features and the
# predicted volume, and appended to the window so it feeds the following hour.
# The window holds the 168 rows before the forecast day. 1 July starts at row 10944,
# so the window is rows 10776:10944; add 24 to both bounds for each later day.
x_input=df3.iloc[10776:10944]
x_input=scaler1.transform(x_input)
num_features=15
n_steps=168
temp_input=x_input.reshape(n_steps,num_features).tolist()
lst_output=[]
i=0
while i<24:  # i is the hour of the forecast day
  x_input=np.array(temp_input[-n_steps:]).reshape((1,n_steps,num_features))
  yhat=model.predict(x_input,verbose=0)
  print("{} hour output {}".format(i,yhat))
  # scaled feature columns of hour i (all but the last column of y_test) + predicted volume
  next_row=np.hstack([y_test[i:i+1,:-1],yhat])
  # NOTE(review): this inverse_transform -> transform round trip looks like a no-op on
  # already-scaled values; it is kept because it would change the result if scaler1
  # was fitted with clipping enabled. Confirm and drop it if it is not needed.
  next_row=scaler1.transform(scaler1.inverse_transform(next_row))
  # slide the window: append the new row, keep the latest n_steps rows
  temp_input.extend(next_row.reshape(1,num_features).tolist())
  temp_input=temp_input[-n_steps:]
  lst_output.extend(yhat.tolist())
  i=i+1
print(lst_output)

**After forecasting we will display them**

In [None]:
import plotly.graph_objects as go
y_pred=np.array(lst_output)
# Rows of y_test that belong to the forecast day; i is the value the forecasting
# loop stopped at (24 for the first day).
day_rows=slice(i-24,i)
# Forecast: that day's feature columns plus the predicted volume, back in original units.
df_july24=pd.DataFrame(scaler1.inverse_transform(np.hstack([y_test[day_rows,:-1],y_pred])))
# Actual: every test row (features plus true volume), back in original units.
actual=pd.DataFrame(scaler1.inverse_transform(np.hstack([y_test[:,:-1],y])))

# Column 14 is the volume. Both traces are drawn against the same 24 row positions;
# using the full actual.index as x would pair 24 values with every test row.
hours=actual.index[day_rows]
fig = go.Figure()
fig.add_trace(go.Scatter(x=hours, y=actual[14][day_rows],
                    mode='lines',
                    name='actual'))
fig.add_trace(go.Scatter(x=hours, y=df_july24[14],
                    mode='lines',
                    name='predicted'))
fig.update_xaxes(rangeslider_visible=True)
fig.show()

### Regression

**Even for regression we will use the same features that were used for multivariate lstm**

In [47]:
# Same engineered frame as for the multivariate LSTM, keyed by its 'DateTime' column.
df4=pd.read_csv('/presentationfinal.csv',index_col='DateTime')
df4.head()

Unnamed: 0_level_0,month,day,weekday,start_of_month,end_of_month,is_weekend,holidays,hour,dawn,morning,noon,afternoon,evening,midnight,volume
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-04-01 00:00:00,4,1,5,1,0,1,0,0,0,0,0,0,0,1,10
2017-04-01 01:00:00,4,1,5,1,0,1,0,1,0,0,0,0,0,1,12
2017-04-01 02:00:00,4,1,5,1,0,1,0,2,1,0,0,0,0,0,5
2017-04-01 03:00:00,4,1,5,1,0,1,0,3,1,0,0,0,0,0,5
2017-04-01 04:00:00,4,1,5,1,0,1,0,4,1,0,0,0,0,0,17


In [48]:
# Features: every column except the last; target: the last column (volume).
# NOTE(review): this rebinds `y`, which the multivariate cells also use.
X=df4.iloc[:,:-1]
y=df4.iloc[:,-1]

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle = False)
# test_size=0.1 gives a 90:10 split, i.e. 90% of the rows for training and 10% for testing.
# shuffle=False keeps the rows in their original order, so the test set is the final 10% of the series.

In [51]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training features only, then apply the same scaling to the test features.
scaler3=MinMaxScaler()
X_train=scaler3.fit_transform(X_train)
X_test=scaler3.transform(X_test)

In [60]:
# Persist the fitted feature scaler; the with-block closes (and flushes) the file.
with open('regressionscaler.pkl', 'wb') as fh:
    dump(scaler3, fh)

In [52]:
from sklearn import datasets, ensemble
# define the model parameters
# 'loss' is deliberately left at its default, the squared-error loss. Older
# scikit-learn releases spell it 'ls' and newer ones 'squared_error' (the 'ls'
# alias was removed), so relying on the default trains the same model on both.
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01}
# instantiate and train the model
gb_reg = ensemble.GradientBoostingRegressor(**params)
gb_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=4,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=5,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [53]:
# Number of rows in the held-out test split.
len(X_test)

1169

**We have 1169 values in our test set, of which 744 are from July,
so the first 425 values are from June.**

In [59]:
# Offset of the first July row inside the test split (1169 test rows - 744 July hours).
# It is set here so the cell does not depend on an `i` left over from another cell.
i=425
# Timestamps of the 24 test rows starting at that offset.
df4.index.astype(str).tolist()[-len(X_test):][0+i:24+i]

['2018-07-01 00:00:00',
 '2018-07-01 01:00:00',
 '2018-07-01 02:00:00',
 '2018-07-01 03:00:00',
 '2018-07-01 04:00:00',
 '2018-07-01 05:00:00',
 '2018-07-01 06:00:00',
 '2018-07-01 07:00:00',
 '2018-07-01 08:00:00',
 '2018-07-01 09:00:00',
 '2018-07-01 10:00:00',
 '2018-07-01 11:00:00',
 '2018-07-01 12:00:00',
 '2018-07-01 13:00:00',
 '2018-07-01 14:00:00',
 '2018-07-01 15:00:00',
 '2018-07-01 16:00:00',
 '2018-07-01 17:00:00',
 '2018-07-01 18:00:00',
 '2018-07-01 19:00:00',
 '2018-07-01 20:00:00',
 '2018-07-01 21:00:00',
 '2018-07-01 22:00:00',
 '2018-07-01 23:00:00']

In [56]:
i=425  # offset of the first forecast hour inside the test split
julyregression=pd.DataFrame()
julyregression['predicted']=gb_reg.predict(X_test)[0+i:24+i]#gb_reg is the gradient boosting regressor, which is our machine learning model
#0+i and 24+i bound the 24 hours of the first forecast day.
#To predict for day 2 we would write 24+i and 48+i, and so on.

In [61]:
# Persist the trained regressor; the with-block closes (and flushes) the file.
with open('regressionmodel.pkl', 'wb') as fh:
    dump(gb_reg, fh)

In [None]:
# Load the saved regressor; the with-block closes the file after reading.
# NOTE(review): the model is dumped to the relative path 'regressionmodel.pkl' but read
# back from '/regressionmodel.pkl' - confirm the file is copied there. This also rebinds
# `model`, which earlier holds the multivariate LSTM.
# NOTE: unpickling runs code stored in the file, so only load trusted pickles.
with open('/regressionmodel.pkl', 'rb') as fh:
    model = load(fh)