# Flight Delay prediction

In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn,optim
import torch.nn.functional as F
import datetime
from torch.utils import data
from random import sample,seed

## Importing Flight Data

In [2]:
# Lookup tables first, then the flights table itself.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
airlines = pd.read_csv("./data/airlines.csv")
airports = pd.read_csv("./data/airports.csv")
flights = pd.read_csv("./data/flights.csv", low_memory=False)

## Preprocessing of the data

In [3]:
# Integer vocabularies used later to index the embedding tables.
# NOTE: `airports` and `airlines` are rebound from DataFrames to dicts here, so this
# cell cannot be re-run without first re-running the loading cell.
#
# Airport codes are collected from BOTH endpoint columns: origins first (so every
# origin keeps the index it would get from the origin column alone), then any code
# that only ever appears as a destination. Building the vocabulary from origins
# alone raises KeyError when such a destination is encoded.
_airport_codes = dict.fromkeys(flights.ORIGIN_AIRPORT.unique())
_airport_codes.update(dict.fromkeys(flights.DESTINATION_AIRPORT.unique()))
airports = {code: idx for idx, code in enumerate(_airport_codes)}
airlines = {code: idx for idx, code in enumerate(airlines.IATA_CODE)}
# '10666' was registered by hand in the original notebook; keep it in the vocabulary
# even if it is absent from the data so the vocabulary size never shrinks.
airports.setdefault('10666', len(airports))

In [4]:
# one hot encoder for day of the week
def one_hot_encode(size, val):
    """Return a one-hot integer vector of length ``size`` for the 1-based value ``val``.

    Args:
        size: length of the returned vector.
        val: 1-based position to mark; ``val == 1`` sets element 0.

    Returns:
        numpy.ndarray of dtype int with exactly one element equal to 1.

    Raises:
        IndexError: if ``val`` is outside ``1..size``. Without this check a value
            of 0 or below would wrap around (negative indexing) and silently mark
            the wrong slot.
    """
    if not 1 <= val <= size:
        raise IndexError(
            "one_hot_encode: val={!r} is outside the valid range 1..{}".format(val, size)
        )
    a = np.zeros((size,), dtype=int)
    a[val - 1] = 1
    return a

In [5]:
# Date and time preprocess copied from
# link - https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial/data
#_________________________________________________________
# Convert an 'HHMM' clock value into a datetime.time
def format_heure(chaine):
    """Turn an HHMM clock value (e.g. ``5`` or ``1234.0``) into a ``datetime.time``.

    Args:
        chaine: numeric HHMM value, or a missing value.

    Returns:
        ``np.nan`` when ``chaine`` is null, otherwise ``datetime.time(HH, MM)``.
        The value 2400 is treated as midnight (00:00).
    """
    if pd.isnull(chaine):
        return np.nan
    # 2400 is not a valid hour for datetime.time, so map it to 0000
    hhmm = 0 if chaine == 2400 else chaine
    # zero-pad to four digits so the hour and minute can be sliced by position
    digits = f"{int(hhmm):04d}"
    return datetime.time(int(digits[:2]), int(digits[2:4]))

In [6]:
# Combine the separate YEAR / MONTH / DAY columns into a single datetime column
flights['DATE'] = pd.to_datetime(flights.loc[:, ['YEAR', 'MONTH', 'DAY']])

In [7]:
# REFORMATTING THE TIME-OF-DAY VARIABLES
#__________________________________________________________________________________
# Each of these columns is converted from an HHMM value to datetime.time
# (missing values stay NaN, see format_heure).
for time_column in ('SCHEDULED_DEPARTURE', 'DEPARTURE_TIME',
                    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME'):
    flights[time_column] = flights[time_column].apply(format_heure)
#__________________________________________________________________________
# Preview the converted columns next to the delay columns.
# .loc slicing is label-based and inclusive, so this shows rows 0 through 5.
flights.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
                 'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_TIME,ARRIVAL_TIME,DEPARTURE_DELAY,ARRIVAL_DELAY
0,00:05:00,04:30:00,23:54:00,04:08:00,-11.0,-22.0
1,00:10:00,07:50:00,00:02:00,07:41:00,-8.0,-9.0
2,00:20:00,08:06:00,00:18:00,08:11:00,-2.0,5.0
3,00:20:00,08:05:00,00:15:00,07:56:00,-5.0,-9.0
4,00:25:00,03:20:00,00:24:00,02:59:00,-1.0,-21.0
5,00:25:00,06:02:00,00:20:00,06:10:00,-5.0,8.0


In [8]:
# DROPPING IRRELEVANT COLUMNS
#_____________________________________________________________________________________
variables_to_remove = [
    'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF',
    'YEAR', 'DAY', 'DATE',
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
    'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME',
]
flights = flights.drop(columns=variables_to_remove)
# Fix the order of the columns that are kept
flights = flights[[
    'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    'SCHEDULED_TIME', 'ELAPSED_TIME', 'MONTH', 'DAY_OF_WEEK',
]]
flights.head(5)

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,SCHEDULED_TIME,ELAPSED_TIME,MONTH,DAY_OF_WEEK
0,AS,ANC,SEA,00:05:00,23:54:00,-11.0,04:30:00,04:08:00,-22.0,205.0,194.0,1,4
1,AA,LAX,PBI,00:10:00,00:02:00,-8.0,07:50:00,07:41:00,-9.0,280.0,279.0,1,4
2,US,SFO,CLT,00:20:00,00:18:00,-2.0,08:06:00,08:11:00,5.0,286.0,293.0,1,4
3,AA,LAX,MIA,00:20:00,00:15:00,-5.0,08:05:00,07:56:00,-9.0,285.0,281.0,1,4
4,AS,SEA,ANC,00:25:00,00:24:00,-1.0,03:20:00,02:59:00,-21.0,235.0,215.0,1,4


In [None]:
# PROCESSING THE CATEGORICAL DATA
#___________________________________________________________________
# Airline and airport codes become integer ids (dict lookups; an unknown code raises
# KeyError), and the day of the week becomes a length-7 one-hot vector per row.
# Only the columns used further down are kept.
flights = flights.assign(
    AIRLINE=flights['AIRLINE'].apply(lambda code: airlines[code]),
    ORGIN_AIRPORT_V=flights['ORIGIN_AIRPORT'].apply(lambda code: airports[code]),
    DESTINATION__AIRPORT_V=flights['DESTINATION_AIRPORT'].apply(lambda code: airports[code]),
    DAY_OF_WEEK=flights['DAY_OF_WEEK'].apply(lambda day: one_hot_encode(7, day)),
)[['AIRLINE', 'ORGIN_AIRPORT_V', 'DESTINATION__AIRPORT_V',
   'ARRIVAL_DELAY', 'DAY_OF_WEEK', 'DEPARTURE_DELAY']]
flights.head()

In [None]:
# CLEAN UP
#______________________________
# Remove rows with any missing value, then renumber the rows 0..n-1 so the
# positional windows built afterwards line up with the index labels.
flights = (
    flights
    .dropna()
    .reset_index(drop=True)
    [['AIRLINE', 'ORGIN_AIRPORT_V', 'DESTINATION__AIRPORT_V',
      'ARRIVAL_DELAY', 'DAY_OF_WEEK', 'DEPARTURE_DELAY']]
)
flights.head()

In [None]:
# MAKING THE SEQUENCES OF INPUT DATA
#________________________________________________________________
# Every `spequence_length`-th row is a prediction target. For target row `x` the
# input window is the rows [start, x), i.e. the block that ends right before it.
# The very first target (row 0) therefore gets an empty window.
spequence_length = 10000
output_length = 1
data_batch, decoder_input, labels = {}, {}, {}
start = 0
for seq_no, target_row in enumerate(range(0, len(flights), spequence_length)):
    key = 'id_{}'.format(seq_no)
    # .loc slices are inclusive on both ends, hence the -1
    target_rows = slice(target_row, target_row + output_length - 1)
    data_batch[key] = list(range(start, target_row))
    decoder_input[key] = flights.loc[target_rows, 'AIRLINE'].to_list()
    labels[key] = flights.loc[target_rows, 'DEPARTURE_DELAY'].to_list()
    start = target_row

In [None]:
# DATA GENERATOR CLASS FOR EASY ACCESS AND PARALLELIZATION
#___________________________________________________________
class DataGen(data.Dataset):
    """Dataset that yields one flight window per sequence id.

    Row indices and decoder inputs are looked up in the notebook-level
    ``data_batch`` and ``decoder_input`` dicts, and the feature rows are read
    from the notebook-level ``flights`` frame. Targets come from the ``labels``
    mapping given to the constructor.

    Args:
        list_id: sequence of ids (keys of ``data_batch``) served by this dataset.
        labels: mapping from id to the list of target values for that id.
    """

    def __init__(self,list_id,labels):
        self.list_id = list_id
        self.labels = labels

    def __len__(self):
        """Number of sequence ids served by this dataset."""
        return len(self.list_id)

    def __getitem__(self,index):
        """Return ``(decoder_input, target, features)`` for the id at ``index``.

        ``features`` is a dict of tensors keyed by column name, one entry per
        row of the window.
        """
        ID = self.list_id[index]
        ind = data_batch[ID]
        # label-based column slice: every column up to and including DAY_OF_WEEK
        X = flights.loc[ind,:'DAY_OF_WEEK']
        d_inp = torch.tensor(decoder_input[ID],dtype=torch.long)
        # read the targets from the mapping passed to __init__, not from a global
        y = torch.tensor(self.labels[ID],dtype=torch.float)
        return d_inp,y,{'DAY_OF_WEEK':torch.tensor(X.DAY_OF_WEEK.to_list(),dtype=torch.float),
                  'AIRLINE': torch.tensor(X.AIRLINE.to_list(),dtype=torch.long),
                  'ORGIN_AIRPORT_V': torch.tensor(X.ORGIN_AIRPORT_V.to_list(),dtype=torch.long),
                  'DESTINATION__AIRPORT_V': torch.tensor(X.DESTINATION__AIRPORT_V.to_list(),dtype=torch.long)}

In [None]:
# PARAMETERS FOR THE GENERATOR
#_____________________________
# Keyword arguments forwarded to the training DataLoader.
params = dict(
    batch_size=64,
    shuffle=False,
    num_workers=6,
)

In [None]:
# SPLITTING UP THE DATA INTO TRAINING AND VALIDATION
#_____________________________________________________
seed(1234)  # fixed seed so the split is reproducible
# The first id is excluded from both splits: its input window is empty
# (it is built from range(0, 0)), so it cannot be batched with full windows.
data_ids = list(data_batch.keys())[1:]
length = int(len(data_ids)*0.75)
training_ids = sample(data_ids,length)
# Everything that was eligible but not drawn for training is used for validation.
# Iterating over `data_ids` (not over all keys of `data_batch`) keeps the excluded
# first id out; the set makes each membership test O(1).
_training_lookup = set(training_ids)
validation_ids = [seq_id for seq_id in data_ids if seq_id not in _training_lookup]

In [None]:
# CREATING DATALOADERS
#_______________________________________________________________
# Both datasets share the same labels mapping and differ only in the ids they serve.
training_set = DataGen(list_id=training_ids, labels=labels)
validation_set = DataGen(list_id=validation_ids, labels=labels)
# Training uses the settings from `params`; validation is loaded two windows at a time.
training_generator = data.DataLoader(training_set, **params)
validation_generator = data.DataLoader(validation_set, batch_size=2, shuffle=False)

# MODEL DEFINITION

In [None]:
# Query GPU availability once and derive the torch device from it
cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
cuda

In [None]:
# MODEL DEFINITION
#_________________
class model(nn.Module):
    """Convolutional regressor producing one value per input window.

    Each window is described by per-flight features (one-hot day of week,
    airline embedding, and destination-minus-origin airport embedding). The
    flights of a window are used as the *channels* of a 1-D convolution stack,
    the result is flattened, concatenated with the embedding of the target
    flight's airline and passed through a small fully connected head.

    Args:
        airline_embed_size: width of the airline embedding.
        airport_embed_size: width of the airport embedding.
        hidden_size: input width of the first linear layer. It must equal the
            flattened conv output plus the flattened decoder embedding. With a
            7-wide day vector and the default embedding sizes the feature width
            is 7 + 4 + 300 = 311, which the two stride-2 convolutions reduce to
            79; adding one 4-wide airline embedding gives the default of 83.
        no_of_layers: stored on the instance; not used by any layer here.
        dropout: dropout probability used after each hidden linear layer.
        linear_size: width of the hidden linear layers.
    """

    def __init__(self,airline_embed_size=4,airport_embed_size=300,hidden_size=83,no_of_layers=2,dropout=0.25,linear_size=512):
        super(model,self).__init__()

        # DEFINING THE VARIABLES AND PARAMETERS
        self.airline_embed_size = airline_embed_size
        self.airport_embed_size = airport_embed_size
        self.hidden_size = hidden_size
        self.no_of_layers =no_of_layers

        # MODEL ELEMENTS
        # Vocabulary sizes come from the notebook-level `airlines` / `airports` dicts.
        self.airline_embed = nn.Embedding(len(airlines),airline_embed_size)
        self.airport_embed = nn.Embedding(len(airports),airport_embed_size)
        # conv1 takes 10000 input channels: one per flight in a window, so the
        # window length used when building the sequences must stay at 10000.
        self.conv1 = nn.Conv1d(10000,2048,kernel_size=1)
        self.conv2 = nn.Conv1d(2048,1024,kernel_size=2,stride=2,padding=1)
        self.conv3 = nn.Conv1d(1024,1024,kernel_size=2,stride=2,padding=1)
        self.conv4 = nn.Conv1d(1024,2048,1)
        self.conv5 = nn.Conv1d(2048,1,1)
        # HIDDEN SIZE IS CALCULATED BASED ON DOWN SIZING THROUGH CONVOLUTION
        # NOTE(review): one BatchNorm1d instance is applied after both linear_1 and
        # linear_2 in forward(), so the two positions share affine parameters and
        # running statistics. Confirm this is intended before splitting it, since
        # doing so changes the state_dict keys.
        self.bnorm = nn.BatchNorm1d(num_features=linear_size)
        self.linear_1 = nn.Linear(hidden_size,linear_size)
        self.linear_2 = nn.Linear(linear_size,linear_size)
        self.out = nn.Linear(linear_size,1)
        self.drop = nn.Dropout(p=dropout)

    def forward(self,x,decode):
        """Compute the prediction for a batch of windows.

        Args:
            x: dict with the tensors 'DAY_OF_WEEK' (float), 'AIRLINE',
                'DESTINATION__AIRPORT_V' and 'ORGIN_AIRPORT_V' (integer ids).
                The dict is only read; it is not modified.
            decode: integer airline ids of the target flight(s).

        Returns:
            Tensor with one column per row of the batch (output of the final
            ``nn.Linear(linear_size, 1)``).
        """
        # Run on whatever device the module's weights live on, so the inputs always
        # match the parameters regardless of any notebook-level device variable.
        target = self.airline_embed.weight.device

        # Work on local tensors so the caller's batch dict keeps its raw id tensors.
        day_of_week = x['DAY_OF_WEEK'].to(target)
        airline = self.airline_embed(x['AIRLINE'].to(target))
        destination = self.airport_embed(x['DESTINATION__AIRPORT_V'].to(target))
        origin = self.airport_embed(x['ORGIN_AIRPORT_V'].to(target))
        # CONCATENATED DAY OF THE WEEK, AIRLINE AND DESTINATION-ORIGIN (GRAPH REP)
        input_ = torch.cat((day_of_week, airline, destination - origin), 2)

        decode = self.airline_embed(decode.to(target))
        out = F.relu(self.conv1(input_))
        out = F.relu(self.conv2(out))
        out = F.relu(self.conv3(out))
        out = F.relu(self.conv4(out))
        out = F.relu(self.conv5(out))
        # flatten both branches to (batch, features) before joining them
        out = out.view(out.shape[0],-1)
        decode = decode.view(decode.shape[0],-1)
        out = torch.cat((out,decode),1)
        out = self.drop(F.relu(self.bnorm(self.linear_1(out))))
        out = self.drop(F.relu(self.bnorm(self.linear_2(out))))
        out = self.out(out)

        return out
    

In [None]:
# LOSS AND OPTIMISATION FUNCTIONS
conv = model()
# Smooth L1 loss: a squared term when the absolute error is below 1,
# an absolute term otherwise.
criterian = nn.SmoothL1Loss()
optimizer = optim.Adam(params=conv.parameters(), lr=1e-3)

# Training

In [None]:
epoch = 25
conv.to(device)

for e in range(epoch):
    conv.train()
    running_loss = 0
    count = 0
    for decode, y, x in training_generator:
        # decoder input and targets are moved to the device here;
        # the feature dict `x` is moved inside the model's forward pass
        decode, y = decode.to(device), y.to(device)
        out = conv(x, decode)
        loss = criterian(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        count += 1
    # report the mean batch loss on every fifth epoch
    if (e + 1) % 5 == 0:
        print(f'The Training loss = {running_loss/count}')
