# Flight Delay prediction

In [1]:
import numpy as np
import pandas as pd
import torch
import datetime
from torch.utils import data
from random import sample,seed

## Importing Flight Data

In [2]:
flights = pd.read_csv("./data/flights.csv")
airlines = pd.read_csv("./data/airlines.csv")
airports = pd.read_csv("./data/airports.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Preprocessing of the data

In [3]:
airports = {ch:i for i,ch in enumerate(flights.ORIGIN_AIRPORT.unique())}
airlines = {ch:i for i,ch in enumerate(airlines.IATA_CODE)}
airports['10666'] =len(airports)

In [4]:
# one hot encoder for day of the week
def one_hot_encode(size,val):
    a = np.zeros((size,),dtype=int)
    a[(val-1)] = 1
    return a

In [5]:
# Date and time preprocess copied from
# link - https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial/data
#_________________________________________________________
# Function that convert the 'HHMM' string to datetime.time
def format_heure(chaine):
    if pd.isnull(chaine):
        return np.nan
    else:
        if chaine == 2400: chaine = 0
        chaine = "{0:04d}".format(int(chaine))
        heure = datetime.time(int(chaine[0:2]), int(chaine[2:4]))
        return heure

In [6]:
flights['DATE'] = pd.to_datetime(flights[['YEAR','MONTH', 'DAY']])

In [7]:
# REFORMATING THE DATE VARIABLES
#__________________________________________________________________________________
flights['SCHEDULED_DEPARTURE'] = flights['SCHEDULED_DEPARTURE'].apply(format_heure) #create_flight_time(flights, 'SCHEDULED_DEPARTURE')
flights['DEPARTURE_TIME'] = flights['DEPARTURE_TIME'].apply(format_heure)
flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'].apply(format_heure)
flights['ARRIVAL_TIME'] = flights['ARRIVAL_TIME'].apply(format_heure)
#__________________________________________________________________________
flights.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME',
             'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']]

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_TIME,ARRIVAL_TIME,DEPARTURE_DELAY,ARRIVAL_DELAY
0,00:05:00,04:30:00,23:54:00,04:08:00,-11.0,-22.0
1,00:10:00,07:50:00,00:02:00,07:41:00,-8.0,-9.0
2,00:20:00,08:06:00,00:18:00,08:11:00,-2.0,5.0
3,00:20:00,08:05:00,00:15:00,07:56:00,-5.0,-9.0
4,00:25:00,03:20:00,00:24:00,02:59:00,-1.0,-21.0
5,00:25:00,06:02:00,00:20:00,06:10:00,-5.0,8.0


In [8]:
# DROPING IRRELEVANT COLUMNS
#_____________________________________________________________________________________
variables_to_remove = ['TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 
                       'DAY','DATE', 'AIR_SYSTEM_DELAY',
                       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
                       'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
                       'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME']
flights.drop(variables_to_remove, axis = 1, inplace = True)
flights = flights[['AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
        'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
        'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
        'SCHEDULED_TIME', 'ELAPSED_TIME','MONTH','DAY_OF_WEEK']]
flights[:5]

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,SCHEDULED_TIME,ELAPSED_TIME,MONTH,DAY_OF_WEEK
0,AS,ANC,SEA,00:05:00,23:54:00,-11.0,04:30:00,04:08:00,-22.0,205.0,194.0,1,4
1,AA,LAX,PBI,00:10:00,00:02:00,-8.0,07:50:00,07:41:00,-9.0,280.0,279.0,1,4
2,US,SFO,CLT,00:20:00,00:18:00,-2.0,08:06:00,08:11:00,5.0,286.0,293.0,1,4
3,AA,LAX,MIA,00:20:00,00:15:00,-5.0,08:05:00,07:56:00,-9.0,285.0,281.0,1,4
4,AS,SEA,ANC,00:25:00,00:24:00,-1.0,03:20:00,02:59:00,-21.0,235.0,215.0,1,4


In [None]:
# PROCESSING THE CATEGORICAL DATA
#___________________________________________________________________
flights['AIRLINE'] = flights['AIRLINE'].apply(lambda x: airlines[x])
flights['ORGIN_AIRPORT_V'] = flights['ORIGIN_AIRPORT'].apply(lambda x: airports[x])
flights['DESTINATION__AIRPORT_V'] = flights['DESTINATION_AIRPORT'].apply(lambda x: airports[x])
flights['DAY_OF_WEEK'] = flights['DAY_OF_WEEK'].apply(lambda x: one_hot_encode(7,x))
flights = flights[['AIRLINE','ORGIN_AIRPORT_V',"DESTINATION__AIRPORT_V",'ARRIVAL_DELAY','DAY_OF_WEEK','DEPARTURE_DELAY']]
flights.head()

In [13]:
# flights = flights[['AIRLINE', 'ORIGIN_AIRPORT_V', 'DESTINATION_AIRPORT_V','ARRIVAL_DELAY','DAY_OF_WEEK','DEPARTURE_DELAY']]

In [14]:
flights.head()


Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,SCHEDULED_TIME,ELAPSED_TIME,MONTH,DAY_OF_WEEK,ORGIN_AIRPORT_V,DESTINATION__AIRPORT_V
0,6,ANC,SEA,00:05:00,23:54:00,-11.0,04:30:00,04:08:00,-22.0,205.0,194.0,1,"[1, 0, 0, 0, 0, 0, 1]",0,3
1,1,LAX,PBI,00:10:00,00:02:00,-8.0,07:50:00,07:41:00,-9.0,280.0,279.0,1,"[1, 0, 0, 0, 0, 0, 1]",1,77
2,2,SFO,CLT,00:20:00,00:18:00,-2.0,08:06:00,08:11:00,5.0,286.0,293.0,1,"[1, 0, 0, 0, 0, 0, 1]",2,97
3,1,LAX,MIA,00:20:00,00:15:00,-5.0,08:05:00,07:56:00,-9.0,285.0,281.0,1,"[1, 0, 0, 0, 0, 0, 1]",1,84
4,6,SEA,ANC,00:25:00,00:24:00,-1.0,03:20:00,02:59:00,-21.0,235.0,215.0,1,"[1, 0, 0, 0, 0, 0, 1]",3,0


In [16]:
# flights = flights[['AIRLINE','ORGIN_AIRPORT_V',"DESTINATION__AIRPORT_V",'ARRIVAL_DELAY','DAY_OF_WEEK','DEPARTURE_DELAY']]

In [17]:
# CLEAN UP
#______________________________
flights.dropna(inplace =True)
flights = flights.reset_index()
flights[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,index,AIRLINE,ORGIN_AIRPORT_V,DESTINATION__AIRPORT_V,ARRIVAL_DELAY,DAY_OF_WEEK,DEPARTURE_DELAY
0,0,6,0,3,-22.0,"[1, 0, 0, 0, 0, 0, 1]",-11.0
1,1,1,1,77,-9.0,"[1, 0, 0, 0, 0, 0, 1]",-8.0
2,2,2,2,97,5.0,"[1, 0, 0, 0, 0, 0, 1]",-2.0
3,3,1,1,84,-9.0,"[1, 0, 0, 0, 0, 0, 1]",-5.0
4,4,6,3,0,-21.0,"[1, 0, 0, 0, 0, 0, 1]",-1.0


In [18]:
# MAKING THE SEQUENCES OF INPUT DATA
#________________________________________________________________
spequence_length = 200
start = 0
data_batch = {}
labels = {}
for i,x in enumerate(range(0,len(flights),spequence_length)):
#     print('id_{}:{}'.format(i,x))
    data_batch['id_{}'.format(i)] = [i for i in range(start,x)]
    labels['id_{}'.format(i)] = flights.loc[x,'DEPARTURE_DELAY']
    start = x

In [53]:
# DATA GENERATOR CLASS FOR EASY ACCESS AND PARALLELIZATION
#___________________________________________________________
class DataGen(data.Dataset):
    def __init__(self,list_id,labels):
        self.list_id = list_id
        self.labels = labels
    def __len__(self):
        return len(self.list_id)
    def __getitem__(self,index):
        ID = self.list_id[index]
        ind = data_batch[ID]
        X = flights.loc[ind,:'DAY_OF_WEEK']
        y = labels[ID]
        return y,{'DAY_OF_WEEK':torch.tensor(X.DAY_OF_WEEK.to_list()),
                  'ARRIVAL_DELAY':torch.tensor(X.ARRIVAL_DELAY.to_list(),dtype=torch.long)}

In [54]:
# PARAMETER FOR THE GENERATOR
#_____________________________
params = {'batch_size': 3,
          'shuffle': False,
          'num_workers': 6}

In [55]:
# SPLITING UP THE DATA INTO TRAINING AND VALIDATION
#_____________________________________________________
seed(1234)
length = int(len(data_batch.keys())*0.75)
training_ids = sample(list(data_batch.keys()),length)
validation_ids = []
for i in list(data_batch.keys()):
    if i not in training_ids:
        validation_ids.append(i)

In [56]:
#CREATING DATALOADERS
#_______________________________________________________________
training_set = DataGen(training_ids, labels)
training_generator = data.DataLoader(training_set, **params)
validation_set = DataGen(validation_ids, labels)
validation_generator = data.DataLoader(validation_set, **params)

In [57]:
y,x = next(iter(training_generator))# training_set.__getitem__(0)

In [58]:
x

{'DAY_OF_WEEK': tensor([[[1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          ...,
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1]],
 
         [[1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          ...,
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1]],
 
         [[1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          ...,
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1],
          [1, 0, 0,  ..., 0, 0, 1]]]),
 'ARRIVAL_DELAY': tensor([[ 21, -15, -12,  -7, -17,  16,  -7, -18, -13,   5,  -6,  -5, -18, -16,
          -17,   8,   7,  -5, -21, -22,  -8,  -9,  14,  -8, -10, -18,  85,  -7,
           11,  10, -22,  -8,  -1, -12, -24, -21,  -6, -17,  -8,  -4, -12, -27,
            0, -19, -15,  

In [60]:
x['ARRIVAL_DELAY'].shape

torch.Size([3, 200])

In [61]:
x['ARRIVAL_DELAY'] = x['ARRIVAL_DELAY'].reshape(3,200,1)

In [62]:
torch.cat((x['DAY_OF_WEEK'][0],x['ARRIVAL_DELAY'][0]),1)

tensor([[  1,   0,   0,  ...,   0,   1,  21],
        [  1,   0,   0,  ...,   0,   1, -15],
        [  1,   0,   0,  ...,   0,   1, -12],
        ...,
        [  1,   0,   0,  ...,   0,   1,   2],
        [  1,   0,   0,  ...,   0,   1,  -5],
        [  1,   0,   0,  ...,   0,   1,  -7]])

In [65]:
x['ARRIVAL_DELAY'][0].shape

torch.Size([200, 1])

In [66]:
x['DAY_OF_WEEK'][0].shape

torch.Size([200, 7])

In [67]:
torch.cat((torch.tensor([[1,2],[2,1],[3,1]]),torch.tensor([[1,2,3],[1,2,3],[1,2,3]])),1)

tensor([[1, 2, 1, 2, 3],
        [2, 1, 1, 2, 3],
        [3, 1, 1, 2, 3]])

In [68]:
torch.tensor([[1],[2],[3]]).shape

torch.Size([3, 1])

In [69]:
torch.tensor([[1,2,3],[1,2,3],[1,2,3]]).shape

torch.Size([3, 3])

In [80]:
torch.cat((x['DAY_OF_WEEK'],x['ARRIVAL_DELAY']),2)

tensor([[[  1,   0,   0,  ...,   0,   1,  21],
         [  1,   0,   0,  ...,   0,   1, -15],
         [  1,   0,   0,  ...,   0,   1, -12],
         ...,
         [  1,   0,   0,  ...,   0,   1,   2],
         [  1,   0,   0,  ...,   0,   1,  -5],
         [  1,   0,   0,  ...,   0,   1,  -7]],

        [[  1,   0,   0,  ...,   0,   1,  16],
         [  1,   0,   0,  ...,   0,   1,  28],
         [  1,   0,   0,  ...,   0,   1,   5],
         ...,
         [  1,   0,   0,  ...,   0,   1,  17],
         [  1,   0,   0,  ...,   0,   1,   8],
         [  1,   0,   0,  ...,   0,   1,  -5]],

        [[  1,   0,   0,  ...,   0,   1, -16],
         [  1,   0,   0,  ...,   0,   1, 134],
         [  1,   0,   0,  ...,   0,   1, -13],
         ...,
         [  1,   0,   0,  ...,   0,   1,   6],
         [  1,   0,   0,  ...,   0,   1,  19],
         [  1,   0,   0,  ...,   0,   1,  -3]]])

In [73]:
x['ARRIVAL_DELAY'][:1].shape

torch.Size([1, 200, 1])

In [74]:
x['DAY_OF_WEEK'][:1].shape

torch.Size([1, 200, 7])