In [102]:
import pandas as pd
import datetime as dt
import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import math
import torch.nn.functional as F

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt

In [108]:
# Load rawdata using Pandas
data = pd.read_csv('airfares.csv')
data2 = pd.read_csv('holidays.csv')

In [109]:
data.head()

Unnamed: 0,Date,ID,Price
0,2017-01-01,0,46.91
1,2017-01-01,1,68.12
2,2017-01-01,2,77.69
3,2017-01-01,3,57.44
4,2017-01-01,4,43.88


In [110]:
# Function to get year and month only from "Date" column
def spliter(data):
    a,b,c = data.split('-')
    return a+'-'+b

# Function to calculate average monthly price and add in "Term" data
def monthly_average_with_term(df1,df2):
    period = df1['Date'].apply(spliter)
    df1['Year-Month'] = period
    data = df1.groupby('Year-Month')
    data = data.mean()
    df2.columns=['Year-Month','Term']
    final = pd.merge(data, df2, on="Year-Month")
    del final['ID']
    return final

In [111]:
# New output with three fields only: year-month, average monthly price and term
df=monthly_average_with_term(data,data2)

print(df)
print('Length: ',len(df))

   Year-Month      Price  Term
0     2017-01  58.551110     0
1     2017-02  59.792207     0
2     2017-03  82.417306     1
3     2017-04  59.822363     0
4     2017-05  64.695732     0
5     2017-06  61.664843     2
6     2017-07  59.031855     0
7     2017-08  54.484890     0
8     2017-09  71.092687     3
9     2017-10  63.358058     0
10    2017-11  62.707230     0
11    2017-12  84.881152     4
12    2018-01  59.918052     0
13    2018-02  63.873096     0
14    2018-03  78.880468     1
15    2018-04  61.793867     0
16    2018-05  60.951487     0
17    2018-06  69.653710     2
18    2018-07  62.716990     0
19    2018-08  61.644003     0
20    2018-09  72.645490     3
21    2018-10  68.648984     0
22    2018-11  66.377553     0
23    2018-12  92.485945     4
24    2019-01  59.988210     0
25    2019-02  60.409186     0
26    2019-03  60.937274     0
27    2019-04  75.562997     1
28    2019-05  56.166297     0
29    2019-06  64.860803     2
30    2019-07  60.743268     0
31    20

In [165]:
scaler = MinMaxScaler()

aaa = np.array(list(df['Price']))
scaler.fit(aaa.reshape(-1,1)[:26])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [199]:
scaled_price = scaler.transform(aaa.reshape(-1,1))
scaled_price[:10]

array([[0.1070028 ],
       [0.13966236],
       [0.73504318],
       [0.14045592],
       [0.26869891],
       [0.18894089],
       [0.11965364],
       [0.        ],
       [0.43703514],
       [0.23349793]])

### Preparation of training data ####################

In [200]:
X = []

#To convert year-month into numbers as it is quantitative feature
# period_data = pd.to_datetime(dataset['Year-Month'])
# period = period_data.map(dt.datetime.toordinal)

# To convert 'Holidays Terms' into one hot vector as it is categorical data
# Then to combine with the above quantitative feature to form training data
for x,y in zip(df['Term'],scaled_price):#df['Price']):
    one_hot = [0]*5
    one_hot[x] = 1
    one_hot.append(y)
    X.append(one_hot)


In [201]:
dataset=[]
label=[]
for z,y in enumerate(X):
    if z<32:
        dataset.append(X[z:z+4])
        label.append(X[z+4][-1])
    

In [203]:
train_data = TensorDataset(torch.tensor(dataset[:26],dtype=torch.float32),
                           torch.tensor(label[:26],dtype=torch.float32))
test_data = TensorDataset(torch.tensor(dataset[26:],dtype=torch.float32),
                         torch.tensor(label[26:],dtype=torch.float32))

In [204]:
batch_size=2
train = DataLoader(train_data,batch_size=batch_size,shuffle=True)
test = DataLoader(test_data,batch_size=10,shuffle=False)

In [212]:
class myLSTM(nn.Module):
    def __init__(self,input_dim,hidden_dim,output_dim):
        super(myLSTM,self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout = nn.Dropout(0.5,inplace=True)
        self.LSTM = nn.LSTM(input_dim,hidden_dim,num_layers=2,bidirectional=True,batch_first=True,dropout=0.5)
        self.linear = nn.Linear(hidden_dim*2,1)
    
    def forward(self,data):
        output,(hidden,cell) = self.LSTM(data)
        hidden2 = self.dropout(torch.cat((hidden[-1,:,:],hidden[-2,:,:]),dim=1))
        out = self.linear(hidden2)
        
        return out
        
        

In [216]:
mymodel = myLSTM(6,3,1)

In [217]:
#loss_fun = F.mse_loss

loss_fun = nn.L1Loss(reduction='mean')
opt = torch.optim.Adam(mymodel.parameters(),lr=0.005)

In [218]:
epoch=2000
for n in range(epoch):
    losses=0
    mymodel.train()
    for x,y in train:
        opt.zero_grad()
        prediction = mymodel(x)
        loss = loss_fun(prediction,y.view(-1,1))
        losses+=loss.item()
        loss.backward()
        opt.step()
    if (n+1)%20==0:
        print('epoch {} training loss: {}'.format(n+1,losses))
    
    if (n+1)%40==0:
        mymodel.eval()
        for a,b in test:
            pred = mymodel(a)
            loss_test = loss_fun(pred,b)
            print('test_loss for epoch {} is {}'.format(n+1,loss_test))

epoch 20 training loss: 2.0222803130745888
epoch 40 training loss: 2.043647348880768
test_loss for epoch 40 is 0.18161650002002716
epoch 60 training loss: 1.768291749060154
epoch 80 training loss: 1.9493482932448387
test_loss for epoch 80 is 0.13469968736171722
epoch 100 training loss: 1.8477842509746552
epoch 120 training loss: 1.6590181812644005
test_loss for epoch 120 is 0.1031985878944397
epoch 140 training loss: 1.8600210100412369
epoch 160 training loss: 1.5221568420529366
test_loss for epoch 160 is 0.10615455359220505
epoch 180 training loss: 1.5318097174167633
epoch 200 training loss: 1.6389653570950031
test_loss for epoch 200 is 0.07475299388170242
epoch 220 training loss: 1.413439393043518
epoch 240 training loss: 1.5920233950018883
test_loss for epoch 240 is 0.08725061267614365
epoch 260 training loss: 1.3124487698078156
epoch 280 training loss: 1.4397313743829727
test_loss for epoch 280 is 0.0779285803437233
epoch 300 training loss: 1.370222620666027
epoch 320 training loss

In [220]:
###this is to check how model performing against unscaled test label
mymodel.eval()
for x,y in test:
    an=mymodel(x)
    ans=an.detach().numpy()
    
    ans = scaler.inverse_transform(ans.reshape(-1,1))
    print(ans)
    print(df.loc[30:,'Price'])

[[61.749924]
 [62.691334]
 [70.74741 ]
 [63.809563]
 [63.536026]
 [79.55868 ]]
30    60.743268
31    58.546726
32    68.253587
33    61.389658
34    62.868093
35    88.348990
Name: Price, dtype: float64


# To make forecast on future data

In [287]:
def feature_generator(x,y):
    dic = {3:1,6:2,9:3,12:4}
    list = [0]*6
    list[-1] = y
    try:
        list[dic[x]] =1
    except:
        list[0] = 1
    return list

In [323]:
current = X[-4:]
mymodel.eval()
year=2019

for m in range(0,2):
    year += 1
    month = 0
    for n in range(1,13):
        latest = current[-4:]
        input = torch.tensor(latest,dtype=torch.float32).view(1,4,6)
        pred = mymodel(input)
        pred_final = pred.detach().numpy()
        current.append(feature_generator(n,pred_final.squeeze(0)))
        month += 1
        forecast = scaler.inverse_transform(pred_final.reshape(-1,1)).squeeze(0)
        print('{}-{} with forecast price of: {}:'.format(year,month,forecast[0]))

    

2020-1 with forecast price of: 61.54224395751953:
2020-2 with forecast price of: 62.57268524169922:
2020-3 with forecast price of: 71.1337890625:
2020-4 with forecast price of: 62.41482162475586:
2020-5 with forecast price of: 63.55333709716797:
2020-6 with forecast price of: 63.33482360839844:
2020-7 with forecast price of: 62.057891845703125:
2020-8 with forecast price of: 61.48052978515625:
2020-9 with forecast price of: 72.29559326171875:
2020-10 with forecast price of: 63.85159683227539:
2020-11 with forecast price of: 63.7620849609375:
2020-12 with forecast price of: 80.38516998291016:
2021-1 with forecast price of: 61.542449951171875:
2021-2 with forecast price of: 62.624568939208984:
2021-3 with forecast price of: 75.51900482177734:
2021-4 with forecast price of: 62.339447021484375:
2021-5 with forecast price of: 63.52370071411133:
2021-6 with forecast price of: 63.372745513916016:
2021-7 with forecast price of: 62.04526901245117:
2021-8 with forecast price of: 61.4827957153320