# Flight Delay prediction

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import datetime
from torch.utils import data
from random import sample,seed

## Importing Flight Data

In [2]:
flights = pd.read_csv("./data/flights.csv",low_memory=False)
airlines = pd.read_csv("./data/airlines.csv")
airports = pd.read_csv("./data/airports.csv")

## Preprocessing of the data

In [3]:
airports = {ch:i for i,ch in enumerate(flights.ORIGIN_AIRPORT.unique())}
airlines = {ch:i for i,ch in enumerate(airlines.IATA_CODE)}
airports['10666'] =len(airports)

In [4]:
# one hot encoder for day of the week
def one_hot_encode(size,val):
    a = np.zeros((size,),dtype=int)
    a[(val-1)] = 1
    return a

In [5]:
# Date and time preprocess copied from
# link - https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial/data
#_________________________________________________________
# Function that convert the 'HHMM' string to datetime.time
def format_heure(chaine):
    if pd.isnull(chaine):
        return np.nan
    else:
        if chaine == 2400: chaine = 0
        chaine = "{0:04d}".format(int(chaine))
        heure = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
        return heure

In [6]:
flights['DATE'] = pd.to_datetime(flights[['YEAR','MONTH', 'DAY']])

In [7]:
# REFORMATING THE DATE VARIABLES
#__________________________________________________________________________________
flights['SCHEDULED_DEPARTURE'] = flights['SCHEDULED_DEPARTURE'].apply(format_heure) #create_flight_time(flights, 'SCHEDULED_DEPARTURE')
flights['DEPARTURE_TIME'] = flights['DEPARTURE_TIME'].apply(format_heure)
flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'].apply(format_heure)
flights['ARRIVAL_TIME'] = flights['ARRIVAL_TIME'].apply(format_heure)
#__________________________________________________________________________
flights.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
             'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_TIME,ARRIVAL_TIME,DEPARTURE_DELAY,ARRIVAL_DELAY
0,00:05:00,04:30:00,23:54:00,04:08:00,-11.0,-22.0
1,00:10:00,07:50:00,00:02:00,07:41:00,-8.0,-9.0
2,00:20:00,08:06:00,00:18:00,08:11:00,-2.0,5.0
3,00:20:00,08:05:00,00:15:00,07:56:00,-5.0,-9.0
4,00:25:00,03:20:00,00:24:00,02:59:00,-1.0,-21.0
5,00:25:00,06:02:00,00:20:00,06:10:00,-5.0,8.0


In [8]:
# DROPING IRRELEVANT COLUMNS
#_____________________________________________________________________________________
variables_to_remove = ['TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 
                       'DAY','DATE', 'AIR_SYSTEM_DELAY',
                       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
                       'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
                       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME']
flights.drop(variables_to_remove, axis = 1, inplace = True)
flights = flights[['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
        'SCHEDULED_TIME', 'ELAPSED_TIME','MONTH','DAY_OF_WEEK']]
flights[:5]

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,SCHEDULED_TIME,ELAPSED_TIME,MONTH,DAY_OF_WEEK
0,AS,ANC,SEA,00:05:00,23:54:00,-11.0,04:30:00,04:08:00,-22.0,205.0,194.0,1,4
1,AA,LAX,PBI,00:10:00,00:02:00,-8.0,07:50:00,07:41:00,-9.0,280.0,279.0,1,4
2,US,SFO,CLT,00:20:00,00:18:00,-2.0,08:06:00,08:11:00,5.0,286.0,293.0,1,4
3,AA,LAX,MIA,00:20:00,00:15:00,-5.0,08:05:00,07:56:00,-9.0,285.0,281.0,1,4
4,AS,SEA,ANC,00:25:00,00:24:00,-1.0,03:20:00,02:59:00,-21.0,235.0,215.0,1,4


In [9]:
# PROCESSING THE CATEGORICAL DATA
#___________________________________________________________________
flights['AIRLINE'] = flights['AIRLINE'].apply(lambda x: airlines[x])
flights['ORGIN_AIRPORT_V'] = flights['ORIGIN_AIRPORT'].apply(lambda x: airports[x])
flights['DESTINATION__AIRPORT_V'] = flights['DESTINATION_AIRPORT'].apply(lambda x: airports[x])
flights['DAY_OF_WEEK'] = flights['DAY_OF_WEEK'].apply(lambda x: one_hot_encode(7,x))
flights = flights[['AIRLINE','ORGIN_AIRPORT_V',"DESTINATION__AIRPORT_V",'ARRIVAL_DELAY','DAY_OF_WEEK','DEPARTURE_DELAY']]
flights.head()

Unnamed: 0,AIRLINE,ORGIN_AIRPORT_V,DESTINATION__AIRPORT_V,ARRIVAL_DELAY,DAY_OF_WEEK,DEPARTURE_DELAY
0,6,0,3,-22.0,"[0, 0, 0, 1, 0, 0, 0]",-11.0
1,1,1,77,-9.0,"[0, 0, 0, 1, 0, 0, 0]",-8.0
2,2,2,97,5.0,"[0, 0, 0, 1, 0, 0, 0]",-2.0
3,1,1,84,-9.0,"[0, 0, 0, 1, 0, 0, 0]",-5.0
4,6,3,0,-21.0,"[0, 0, 0, 1, 0, 0, 0]",-1.0


In [10]:
# CLEAN UP
#______________________________
flights.dropna(inplace =True)
flights = flights.reset_index()
flights = flights[['AIRLINE','ORGIN_AIRPORT_V',"DESTINATION__AIRPORT_V",'ARRIVAL_DELAY','DAY_OF_WEEK','DEPARTURE_DELAY']]
flights.head()

Unnamed: 0,AIRLINE,ORGIN_AIRPORT_V,DESTINATION__AIRPORT_V,ARRIVAL_DELAY,DAY_OF_WEEK,DEPARTURE_DELAY
0,6,0,3,-22.0,"[0, 0, 0, 1, 0, 0, 0]",-11.0
1,1,1,77,-9.0,"[0, 0, 0, 1, 0, 0, 0]",-8.0
2,2,2,97,5.0,"[0, 0, 0, 1, 0, 0, 0]",-2.0
3,1,1,84,-9.0,"[0, 0, 0, 1, 0, 0, 0]",-5.0
4,6,3,0,-21.0,"[0, 0, 0, 1, 0, 0, 0]",-1.0


In [12]:
# MAKING THE SEQUENCES OF INPUT DATA
#________________________________________________________________
spequence_length = 10000
output_length = 10
start = 0
data_batch = {}
decoder_input ={}
labels = {}
for i,x in enumerate(range(0,len(flights),spequence_length)):
#     print('id_{}:{}'.format(i,x))
#     y = x+spequence_length
    data_batch['id_{}'.format(i)] = [i for i in range(start,x)]
    air = flights.loc[x:x+output_length-1,'AIRLINE']
    f = flights.loc[x:x+output_length-1,'DEPARTURE_DELAY']
    decoder_input['id_{}'.format(i)] = air.to_list()
    labels['id_{}'.format(i)] = f.to_list()
    start = x

In [13]:
len(labels['id_0'])

10

In [14]:
decoder_input['id_0']

[6, 1, 2, 1, 6, 9, 7, 2, 1, 9]

In [15]:
labels.keys()

dict_keys(['id_0', 'id_1', 'id_2', 'id_3', 'id_4', 'id_5', 'id_6', 'id_7', 'id_8', 'id_9', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'id_39', 'id_40', 'id_41', 'id_42', 'id_43', 'id_44', 'id_45', 'id_46', 'id_47', 'id_48', 'id_49', 'id_50', 'id_51', 'id_52', 'id_53', 'id_54', 'id_55', 'id_56', 'id_57', 'id_58', 'id_59', 'id_60', 'id_61', 'id_62', 'id_63', 'id_64', 'id_65', 'id_66', 'id_67', 'id_68', 'id_69', 'id_70', 'id_71', 'id_72', 'id_73', 'id_74', 'id_75', 'id_76', 'id_77', 'id_78', 'id_79', 'id_80', 'id_81', 'id_82', 'id_83', 'id_84', 'id_85', 'id_86', 'id_87', 'id_88', 'id_89', 'id_90', 'id_91', 'id_92', 'id_93', 'id_94', 'id_95', 'id_96', 'id_97', 'id_98', 'id_99', 'id_100', 'id_101', 'id_102', 'id_103', 'id_104', 'id_105', 'id_106', 'id_107', 'id_108', 'id_109',

In [16]:
labels['id_81']

[-5.0, -2.0, 1.0, -5.0, -3.0, -9.0, -8.0, -2.0, 11.0, -2.0]

In [17]:
# DATA GENERATOR CLASS FOR EASY ACCESS AND PARALLELIZATION
#___________________________________________________________
class DataGen(data.Dataset):
    def __init__(self,list_id,labels):
        self.list_id = list_id
        self.labels = labels
    def __len__(self):
        return len(self.list_id)
    def __getitem__(self,index):
        ID = self.list_id[index]
        ind = data_batch[ID]
        X = flights.loc[ind,:'DAY_OF_WEEK']
        d_inp = torch.tensor(decoder_input[ID],dtype=torch.long)
        y = torch.tensor(labels[ID],dtype=torch.float)
        return d_inp,y,{'DAY_OF_WEEK':torch.tensor(X.DAY_OF_WEEK.to_list(),dtype=torch.float),
                  'ARRIVAL_DELAY':torch.tensor(X.ARRIVAL_DELAY.to_list(),dtype=torch.float),
                  'AIRLINE': torch.tensor(X.AIRLINE.to_list(),dtype=torch.long),
                  'ORGIN_AIRPORT_V': torch.tensor(X.ORGIN_AIRPORT_V.to_list(),dtype=torch.long),
                  'DESTINATION__AIRPORT_V': torch.tensor(X.DESTINATION__AIRPORT_V.to_list(),dtype=torch.long)}

In [18]:
# PARAMETER FOR THE GENERATOR
#_____________________________
params = {'batch_size': 2,
          'shuffle': False,
          'num_workers': 6}

In [19]:
# SPLITING UP THE DATA INTO TRAINING AND VALIDATION
#_____________________________________________________
seed(1234)
length = int(len(data_batch.keys())*0.75)
training_ids = sample(list(data_batch.keys()),length)
validation_ids = []
for i in list(data_batch.keys()):
    if i not in training_ids:
        validation_ids.append(i)

In [20]:
#CREATING DATALOADERS
#_______________________________________________________________
training_set = DataGen(training_ids, labels)
training_generator = data.DataLoader(training_set, **params)
validation_set = DataGen(validation_ids, labels)
validation_generator = data.DataLoader(validation_set, **params)

In [21]:
d_inp,y,x = next(iter(training_generator))# training_set.__getitem__(0)

In [22]:
# REQUIRED RESAHPE FOR CONCATINATE
# x['ARRIVAL_DELAY'] = x['ARRIVAL_DELAY'].reshape(params['batch_size'],spequence_length,1)
x

{'DAY_OF_WEEK': tensor([[[0., 1., 0.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          [0., 1., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.],
          [0., 0., 1.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          [0., 0., 0.,  ..., 1., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.],
          [0., 0., 0.,  ..., 0., 1., 0.]]]),
 'ARRIVAL_DELAY': tensor([[ -9., -16.,  14.,  ..., -21.,  -3.,  32.],
         [ 23., -12., -17.,  ..., -10.,   2.,  -1.]]),
 'AIRLINE': tensor([[10, 10, 10,  ...,  1,  6,  6],
         [ 5,  7,  0,  ...,  8,  8,  8]]),
 'ORGIN_AIRPORT_V': tensor([[340, 434, 419,  ..., 366, 327, 322],
         [  1,   4, 161,  ...,   5, 155, 155]]),
 'DESTINATION__AIRPORT_V': tensor([[461, 429, 354,  ..., 492, 380, 352],
         [ 10,  59,  38,  ...,   4,  

In [23]:
y.shape

torch.Size([2, 10])

In [26]:
d_inp.shape

torch.Size([2, 10])

In [27]:
embeds = nn.Embedding(len(airlines),4)

In [28]:
embeds(x['AIRLINE']).shape

torch.Size([2, 10000, 4])

# MODEL DEFINITION

In [50]:
# MODEL DEFINITION
#_________________
class model(nn.Module):
    
    def __init__(self,airline_embed_size,airport_embed_size,hidden_size,no_of_layers=2,dropout=0.25,linear_size=512):
        super(model,self).__init__()
        
        # DEFINING THE VARIABLES AND PARAMETERS
        self.airline_embed_size = airline_embed_size
        self.airport_embed_size = airport_embed_size
        self.hidden_size = hidden_size
        self.no_of_layers =no_of_layers
        
        # MODEL ELEMENTS
        self.airline_embed = nn.Embedding(len(airlines),airline_embed_size)
        self.airport_embed = nn.Embedding(len(airports),airport_embed_size) 
        self.lstm_encoder = nn.LSTM((airline_embed_size+airport_embed_size+7),hidden_size,no_of_layers,dropout=dropout,batch_first=True)
        self.lstm_decoder = nn.LSTM(airline_embed_size,hidden_size,no_of_layers,dropout=dropout,batch_first=True)
        self.linear_1 = nn.Linear(hidden_size,linear_size)
        self.linear_2 = nn.Linear(linear_size,linear_size)
        self.out = nn.Linear(linear_size,1)
        self.drop = nn.Dropout(p=dropout)
        
    def forward(self,x,hidden,decode):
        input_ =torch.cat((x['DAY_OF_WEEK'],self.airline_embed(x['AIRLINE']),(self.airport_embed(x['DESTINATION__AIRPORT_V'])-self.airport_embed(x['ORGIN_AIRPORT_V']))),2)
        # CONCATINATED DAY OF THE WEEK, DELAY, AIRLINE AND DESTINATION-ORIGIN(GRAPH REP)
        decode = self.airline_embed(decode)
        o, hidden = self.lstm_encoder(x,hidden)
        out,hidden = self.lstm_decoder(decode,hidden)
        return out, hidden
    def init_hidden(self,batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.no_of_layers,batch_size,self.hidden_size).zero_(),
               weight.new(self.no_of_layers,batch_size,self.hidden_size).zero_())

In [51]:
test = model(4,300,256)

In [52]:
test

model(
  (airline_embed): Embedding(14, 4)
  (airport_embed): Embedding(629, 300)
  (lstm_encoder): LSTM(311, 256, num_layers=2, batch_first=True, dropout=0.25)
  (lstm_decoder): LSTM(4, 256, num_layers=2, batch_first=True, dropout=0.25)
  (linear_1): Linear(in_features=256, out_features=512, bias=True)
  (linear_2): Linear(in_features=512, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=1, bias=True)
  (drop): Dropout(p=0.25, inplace=False)
)

In [53]:
h = test.init_hidden(2)
y,h = test(x,h,d_inp)

AttributeError: 'dict' object has no attribute 'size'