In [2]:
import pandas as pd
import numpy as np
from plotnine import *
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [3]:
## temporal (currently disabled): aggregates the data set by month.
## Adds MnPrFt, the average number of delayed minutes per arrival
## (arr_delay / arr_flights), and adj_MPF, the same ratio computed from adj_delay.
# def temporal(lowT, hiT, df): 
#     ## enumeration of time (easy to work with)
#     df['time'] = (df['year']-2007)*12+df['month']
#     ## date as a string object.
#     df['date'] = df.year.astype(str)+'/'+df.month.astype(str)
#     ## date as a datetime object
#     df['datetime']= pd.to_datetime(df['date'])
#     df['adj_delay'] = df['arr_delay']-df['late_aircraft_delay']
#     df = df.groupby(['datetime']).agg({'arr_flights': sum, 'arr_delay':sum, 'adj_delay': sum}).reset_index()
#     df['MnPrFt'] = df['arr_delay']/df['arr_flights']
#     df['adj_MPF'] = df['adj_delay']/df['arr_flights']
#     return df
##Plotting functions

def plotTLinead(df):
    """Plot total arrival-delay minutes against date.

    Draws the 'arr_delay' column of ``df`` over its 'datetime' column as
    small points joined by a line coloured by the delay value (legend
    hidden), and returns the assembled ggplot object.
    """
    axes = aes(x='datetime', y='arr_delay')
    return (
        ggplot(df, axes)
        + geom_point(size=0.25)
        + geom_line(aes(color='arr_delay'), show_legend=False)
        + theme(text=element_text(size=5))
        + labs(x='Date', y='Minutes delayed')
        + ggtitle('Arrival delays over time')
    )

def plotTLineaf(df):
    """Plot the number of arriving flights against date.

    Draws the 'arr_flights' column of ``df`` over its 'datetime' column as
    small points joined by a line coloured by the flight count (legend
    hidden), and returns the assembled ggplot object.
    """
    axes = aes(x='datetime', y='arr_flights')
    return (
        ggplot(df, axes)
        + geom_point(size=0.25)
        + geom_line(aes(color='arr_flights'), show_legend=False)
        + theme(text=element_text(size=5))
        + labs(x='Date', y='Number of arriving flights')
        + ggtitle('Flight arrivals over time')
    )

def plotTLinempf(df):
    """Plot average delay minutes per arrival against date.

    Draws the 'MnPrFt' column of ``df`` over its 'datetime' column as small
    points joined by a line coloured by the plotted value (legend hidden),
    and returns the assembled ggplot object.
    """
    axes = aes(x='datetime', y='MnPrFt')
    return (
        ggplot(df, axes)
        + geom_point(size=0.25)
        + geom_line(aes(color='MnPrFt'), show_legend=False)
        + theme(text=element_text(size=5))
        + labs(x='Date', y='Minutes delayed per flight arrival')
        + ggtitle('Average minutes delayed per arrival over time')
    )

def plotTLadjmpf(df):
    """Plot adjusted delay minutes per arrival against date.

    Draws the 'adjmpf' column of ``df`` over its 'datetime' column as small
    points joined by a line coloured by the plotted value (legend hidden),
    and returns the assembled ggplot object.

    NOTE(review): the disabled ``temporal`` helper in this notebook names its
    adjusted ratio 'adj_MPF', not 'adjmpf' -- confirm which column the caller
    actually provides.
    """
    axes = aes(x='datetime', y='adjmpf')
    return (
        ggplot(df, axes)
        + geom_point(size=0.25)
        + geom_line(aes(color='adjmpf'), show_legend=False)
        + theme(text=element_text(size=5))
        + labs(x='Date', y='(Adjusted) minutes delayed per flight arrival')
        + ggtitle('Average adjusted minutes delayed per arrival over time')
    )
    
def plotTLadjad(df):
    """Plot adjusted arrival-delay minutes against date.

    Draws the 'adj_delay' column of ``df`` over its 'datetime' column as
    small points joined by a line coloured by the delay value (legend
    hidden), and returns the assembled ggplot object.
    """
    axes = aes(x='datetime', y='adj_delay')
    return (
        ggplot(df, axes)
        + geom_point(size=0.25)
        + geom_line(aes(color='adj_delay'), show_legend=False)
        + theme(text=element_text(size=5))
        + labs(x='Date', y='(Adjusted) minutes of delay')
        + ggtitle('Adjusted arrival delays over time')
    )
    

In [4]:
##functions to print a plot of totals with given intervals
##lower and upper determine an (inclusive) integer interval of years (from 2007)
## uses years so it's a bit easier to use than the above

## We don't end up using it as other methods work better.





# def plotyearMnPrFt(df, lower, upper):
#     if (lower > upper): 
#         pass
#     if (lower == upper):
#         return(ggplot(df[(upper)*12:(upper+1)*12], aes(x='time', y='MnPrFt'))+geom_point(size =0.25) +geom_line(aes(color='MnPrFt')))
#     for i in [lower, upper]:
#         return(ggplot(df[(i)*12:(upper+1)*12], aes(x='time', y='MnPrFt'))+geom_point(size =0.25) +geom_line(aes(color='MnPrFt')))
#     pass

# def plotyeararr_delay(df, lower, upper):
#     if (lower > upper): 
#         pass
#     if (lower == upper):
#         return(ggplot(df[(upper)*12:(upper+1)*12], aes(x='time', y='arr_delay'))+geom_point(size =0.25) +geom_line(aes(color='arr_delay')))
#     for i in [lower, upper]:
#         return(ggplot(df[(i)*12:(upper+1)*12], aes(x='time', y='arr_delay'))+geom_point(size =0.25)+geom_line(aes(color='arr_delay')))
#     pass


# def plotyeararr_flights(df, lower, upper):
#     if (lower > upper): 
#         pass
#     if (lower == upper):
#         return(ggplot(df[(upper)*12:(upper+1)*12], aes(x='time', y='arr_flights'))+geom_point(size =0.25) +geom_line(aes(color='arr_flights')))
#     for i in [lower, upper]:
#         return(ggplot(df[(i*12):(upper+1)*12], aes(x='time', y='arr_flights'))+geom_point(size = 0.25) +geom_line(aes(color='arr_flights')))
#     pass


In [5]:
## Data filtering 
#####################

############ airports
## returns airport specific dataframes over given time interval [9,188]
def airports(lowT, hiT, df):
    """Aggregate flight and delay totals per airport over a time window.

    Parameters
    ----------
    lowT, hiT : int
        Inclusive bounds on the month counter ``time``, defined as
        ``(year - 2007) * 12 + month``.
    df : pandas.DataFrame
        Must contain the columns 'year', 'month', 'airport', 'arr_flights',
        'arr_delay' and 'late_aircraft_delay'.

    Returns
    -------
    pandas.DataFrame
        One row per airport with the summed 'arr_flights', 'arr_delay' and
        'adj_delay' of the rows inside the window, plus
        'mpf' = adj_delay / arr_flights.

    Notes
    -----
    ``df`` is modified in place: the columns 'date', 'datetime' and
    'adj_delay' are (re)written on every call, and 'time' is added when it
    is missing. This side effect is kept so that code relying on those
    columns keeps working.
    """
    # The window filter below needs 'time'; derive it (same formula as in
    # ``carriers``) when the caller passes a frame that does not have it yet,
    # instead of failing with a KeyError.
    if 'time' not in df.columns:
        df['time'] = (df['year'] - 2007) * 12 + df['month']
    ## date as a string object.
    df['date'] = df.year.astype(str) + '/' + df.month.astype(str)
    ## date as a datetime object
    df['datetime'] = pd.to_datetime(df['date'])
    ## delay minutes excluding those attributed to a late incoming aircraft
    df['adj_delay'] = df['arr_delay'] - df['late_aircraft_delay']

    in_window = (df['time'] <= hiT) & (df['time'] >= lowT)
    # Use the string 'sum' rather than the builtin ``sum``: passing the
    # builtin is deprecated in pandas, and the string pins the NaN-skipping
    # groupby sum.
    totals = (
        df.loc[in_window]
        .groupby(['airport'])
        .agg({'arr_flights': 'sum', 'arr_delay': 'sum', 'adj_delay': 'sum'})
        .reset_index()
    )
    totals['mpf'] = totals['adj_delay'] / totals['arr_flights']
    return totals

## Use to define Top N (by number of flight arrivals)
## We use this as people are most likely to fly to these airports
def airTopN(lowT, HiT, df, N):
    """Return the N airports with the most arrivals in the window [lowT, HiT].

    The per-airport totals come from ``airports``; they are ordered by
    'arr_flights' from largest to smallest and the first N rows are returned.
    """
    per_airport = airports(lowT, HiT, df)
    ranked = per_airport.sort_values(by='arr_flights', ascending=False)
    return ranked.head(N)

## Watch evolution of top values corresponding to airports over time
def airovertime(lowT, hiT, df, N):
    """Track the top-N airports month by month.

    Parameters
    ----------
    lowT, hiT : int
        Inclusive range of the month counter ``time``
        (``(year - 2007) * 12 + month``).
    df : pandas.DataFrame
        Frame handed on unchanged to ``airTopN`` for every month.
    N : int
        Number of airports (by arrivals in that month) kept per month.

    Returns
    -------
    pandas.DataFrame
        The monthly ``airTopN`` tables stacked on top of each other, with
        the extra columns 'time', 'date' (a 'month/year' string) and
        'datetime'. The per-month row labels end up in an 'index' column
        because of the final ``reset_index()``.

    Raises
    ------
    ValueError
        If ``lowT > hiT`` (there would be nothing to stack).
    """
    if lowT > hiT:
        raise ValueError(
            'empty time interval: lowT=%r is greater than hiT=%r' % (lowT, hiT)
        )

    # Collect the monthly tables first and concatenate once, instead of
    # growing a DataFrame inside the loop.
    monthly = [
        airTopN(t, t, df, N).assign(time=t)  ## the N airports with most arrivals that month
        for t in range(lowT, hiT + 1)
    ]
    overtime = pd.concat(monthly, axis=0)

    # time runs 1..12 within a year, so shift by one before splitting it into
    # month and year; without the shift in the year term every December
    # (time = 12, 24, ...) would be dated one year too late.
    month = (overtime['time'] - 1) % 12 + 1
    year = 2007 + (overtime['time'] - 1) // 12
    overtime['date'] = month.astype(str) + '/' + year.astype(str)
    overtime['datetime'] = pd.to_datetime(overtime['date'], format='%m/%Y')
    return overtime.reset_index()

############# carriers
## These give insight into which large carriers are frequently late
## Function below gives carrier specific dataframes over time interval [9,188]
def carriers(lowT, hiT, df):
    """Aggregate flight and adjusted-delay totals per carrier over a time window.

    Parameters
    ----------
    lowT, hiT : int
        Inclusive bounds on the month counter ``time``.
    df : pandas.DataFrame
        Must contain the columns 'year', 'month', 'carrier', 'arr_flights',
        'arr_delay' and 'late_aircraft_delay'.

    Returns
    -------
    pandas.DataFrame
        One row per carrier with the summed 'arr_flights' and 'adj_delay' of
        the rows inside the window, plus 'mpf' = adj_delay / arr_flights.

    Notes
    -----
    ``df`` is modified in place: the columns 'time' and 'adj_delay' are
    (re)written on every call. This side effect is kept on purpose because
    other helpers in this notebook read ``df['time']``.
    """
    ## enumeration of months: January 2007 -> 1, December 2007 -> 12, ...
    df['time'] = (df['year'] - 2007) * 12 + df['month']
    ## delay minutes excluding those attributed to a late incoming aircraft
    df['adj_delay'] = df['arr_delay'] - df['late_aircraft_delay']

    in_window = (df['time'] <= hiT) & (df['time'] >= lowT)
    # Use the string 'sum' rather than the builtin ``sum``: passing the
    # builtin is deprecated in pandas, and the string pins the NaN-skipping
    # groupby sum.
    totals = (
        df.loc[in_window]
        .groupby(['carrier'])
        .agg({'arr_flights': 'sum', 'adj_delay': 'sum'})
        .reset_index()
    )
    totals['mpf'] = totals['adj_delay'] / totals['arr_flights']
    return totals

def carTopN(lowT, HiT, df, N):
    """Return the N carriers with the most arrivals in the window [lowT, HiT].

    The per-carrier totals come from ``carriers``; they are ordered by
    'arr_flights' from largest to smallest and the first N rows are returned.
    """
    per_carrier = carriers(lowT, HiT, df)
    ranked = per_carrier.sort_values(by='arr_flights', ascending=False)
    return ranked.head(N)
## Watch evolution of top values over time corresponding to carriers
def carovertime(lowT, hiT, df, N):
    """Track the top-N carriers month by month.

    Parameters
    ----------
    lowT, hiT : int
        Inclusive range of the month counter ``time``
        (``(year - 2007) * 12 + month``).
    df : pandas.DataFrame
        Frame handed on unchanged to ``carTopN`` for every month.
    N : int
        Number of carriers (by arrivals in that month) kept per month.

    Returns
    -------
    pandas.DataFrame
        The monthly ``carTopN`` tables stacked on top of each other, with
        the extra columns 'time', 'date' (a 'month/year' string) and
        'datetime'. The per-month row labels end up in an 'index' column
        because of the final ``reset_index()``.

    Raises
    ------
    ValueError
        If ``lowT > hiT`` (there would be nothing to stack).
    """
    if lowT > hiT:
        raise ValueError(
            'empty time interval: lowT=%r is greater than hiT=%r' % (lowT, hiT)
        )

    # Keep N rows per month. The previous version asked for max(50, N) rows
    # and then called head() without an argument, which silently kept only
    # the first 5 carriers regardless of N.
    monthly = [
        carTopN(t, t, df, N).assign(time=t)
        for t in range(lowT, hiT + 1)
    ]
    overtime = pd.concat(monthly, axis=0)

    # time runs 1..12 within a year, so shift by one before splitting it into
    # month and year; without the shift in the year term every December
    # (time = 12, 24, ...) would be dated one year too late.
    month = (overtime['time'] - 1) % 12 + 1
    year = 2007 + (overtime['time'] - 1) // 12
    overtime['date'] = month.astype(str) + '/' + year.astype(str)
    overtime['datetime'] = pd.to_datetime(overtime['date'], format='%m/%Y')
    return overtime.reset_index()

In [6]:
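## Bar plots of the top N airports/carriers (ranked by total arrivals)
## The helpers request max(300, N) airports (max(50, N) carriers) from the
## ranking and then keep only the first N rows.

## Below, N is always the number of rows that get plotted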

############# Airports
def plotairTopNaf(lowT, hiT, df, N):
    """Bar chart of arrival counts for the top N airports in [lowT, hiT].

    Asks ``airTopN`` for max(300, N) airports, keeps the first N rows and
    returns a ggplot with one bar per airport showing 'arr_flights'.
    """
    ranked = airTopN(lowT, hiT, df, max(300, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='airport', y='arr_flights', fill='airport'))
        + geom_bar(stat='identity')
        + labs(x='Airport', y='Number of arrivals')
        + ggtitle('Arrivals at each airport over entire dataset')
    )
    
def plotairTopNad(lowT, hiT, df, N):
    """Bar chart of adjusted delay minutes for the top N airports in [lowT, hiT].

    Asks ``airTopN`` for max(300, N) airports, keeps the first N rows and
    returns a ggplot with one bar per airport showing 'adj_delay'.
    """
    ranked = airTopN(lowT, hiT, df, max(300, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='airport', y='adj_delay', fill='airport'))
        + geom_bar(stat='identity')
        + labs(x='Airport', y='Minutes of delay')
        + ggtitle('Adjusted minutes delayed at each airport')
    )
 
def plotairTopNmpf(lowT, hiT, df, N):
    """Bar chart of adjusted delay per arrival for the top N airports in [lowT, hiT].

    Asks ``airTopN`` for max(300, N) airports, keeps the first N rows and
    returns a ggplot with one bar per airport showing 'mpf'.
    """
    ranked = airTopN(lowT, hiT, df, max(300, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='airport', y='mpf', fill='airport'))
        + geom_bar(stat='identity')
        + labs(x='Airport', y='Adjusted minutes of delay per arrival')
        + ggtitle('Adjusted minutes of delay per arrival for airports')
    )

## Line plots with given time interval
def plotairOTaf(lowT, hiT, df, N):
    """Line plot of monthly arrivals for the top-N airports over [lowT, hiT].

    Uses the table built by ``airovertime`` and draws 'arr_flights' against
    'datetime', with one thin line per airport; returns the ggplot object.
    """
    monthly = airovertime(lowT, hiT, df, N)
    per_airport = aes(group='airport', color='airport')
    return (
        ggplot(monthly, aes(x='datetime', y='arr_flights', fill='airport'))
        + geom_point(size=2)
        + geom_line(per_airport, size=0.1)
        + theme(text=element_text(size=5))
        + ggtitle('Arrivals at airports over time')
        + labs(y='Number of arrivals', x='Date')
    )
def plotairOTad(lowT, hiT, df, N):
    """Line plot of monthly adjusted delay minutes for the top-N airports.

    Uses the table built by ``airovertime`` for the window [lowT, hiT] and
    draws 'adj_delay' against 'datetime', with one thin line per airport;
    returns the ggplot object.
    """
    monthly = airovertime(lowT, hiT, df, N)
    per_airport = aes(group='airport', color='airport')
    return (
        ggplot(monthly, aes(x='datetime', y='adj_delay', fill='airport'))
        + geom_point(size=2)
        + geom_line(per_airport, size=0.1)
        + theme(text=element_text(size=5))
        # The title used to be a copy of the arrivals plot's title even
        # though this figure shows adjusted delay minutes.
        + ggtitle('Adjusted minutes delayed at airports over time')
        + labs(y='Adjusted minutes delayed', x='Date')
    )
def plotairOTmpf(lowT, hiT, df, N):
    """Line plot of monthly adjusted delay per arrival for the top-N airports.

    Uses the table built by ``airovertime`` for the window [lowT, hiT] and
    draws 'mpf' against 'datetime', with one thin line per airport; returns
    the ggplot object.
    """
    monthly = airovertime(lowT, hiT, df, N)
    per_airport = aes(group='airport', color='airport')
    return (
        ggplot(monthly, aes(x='datetime', y='mpf', fill='airport'))
        + geom_point(size=2)
        + geom_line(per_airport, size=0.1)
        + theme(text=element_text(size=5))
        + ggtitle('Minutes delayed per arrival at airports over time')
        + labs(y='Adjusted minutes delayed per arrival', x='Date')
    )


In [7]:
############### Carriers

## Bar plots with given time interval
def plotcarTopNaf(lowT, hiT, df, N):
    """Bar chart of arrival counts for the top N carriers in [lowT, hiT].

    Asks ``carTopN`` for max(50, N) carriers, keeps the first N rows and
    returns a ggplot with one bar per carrier showing 'arr_flights'.
    """
    ranked = carTopN(lowT, hiT, df, max(50, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='carrier', y='arr_flights', fill='carrier'))
        + geom_bar(stat='identity')
        + labs(x='Carrier', y='Number of arrivals')
        + ggtitle('Number of arrivals for carriers')
    )
    
def plotcarTopNad(lowT, hiT, df, N):
    """Bar chart of adjusted delay minutes for the top N carriers in [lowT, hiT].

    Asks ``carTopN`` for max(50, N) carriers, keeps the first N rows and
    returns a ggplot with one bar per carrier showing 'adj_delay'.
    """
    ranked = carTopN(lowT, hiT, df, max(50, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='carrier', y='adj_delay', fill='carrier'))
        + geom_bar(stat='identity')
        + labs(x='Carrier', y='Adjusted minutes delayed')
        + ggtitle('Adjusted minutes delayed for carriers')
    )
 
def plotcarTopNmpf(lowT, hiT, df, N):
    """Bar chart of adjusted delay per arrival for the top N carriers in [lowT, hiT].

    Asks ``carTopN`` for max(50, N) carriers, keeps the first N rows and
    returns a ggplot with one bar per carrier showing 'mpf'.
    """
    ranked = carTopN(lowT, hiT, df, max(50, N))
    top = ranked.head(N)
    return (
        ggplot(top, aes(x='carrier', y='mpf', fill='carrier'))
        + geom_bar(stat='identity')
        + labs(x='Carrier', y='Adjusted minutes delayed per arrival')
        + ggtitle('Adjusted minutes delayed per arrival for carriers')
    )

## Line plots with given time interval

def plotcarOTaf(lowT, hiT, df, N):
    """Line plot of monthly arrivals for the top carriers over [lowT, hiT].

    Uses the table built by ``carovertime`` and draws 'arr_flights' against
    'datetime', with one thin line per carrier; returns the ggplot object.
    """
    monthly = carovertime(lowT, hiT, df, N)
    per_carrier = aes(group='carrier', color='carrier')
    return (
        ggplot(monthly, aes(x='datetime', y='arr_flights', fill='carrier'))
        + geom_point(size=2)
        + geom_line(per_carrier, size=0.1)
        + theme(text=element_text(size=5))
        + ggtitle('Arrivals of carriers over time')
        + labs(y='Number of arrivals', x='Date')
    )

    
def plotcarOTad(lowT, hiT, df, N):
    """Line plot of monthly adjusted delay minutes for the top carriers.

    Uses the table built by ``carovertime`` for the window [lowT, hiT] and
    draws 'adj_delay' against 'datetime', with one thin line per carrier;
    returns the ggplot object.
    """
    monthly = carovertime(lowT, hiT, df, N)
    per_carrier = aes(group='carrier', color='carrier')
    return (
        ggplot(monthly, aes(x='datetime', y='adj_delay', fill='carrier'))
        + geom_point(size=2)
        + geom_line(per_carrier, size=0.1)
        + theme(text=element_text(size=5))
        + ggtitle('Delays of carriers in minutes over time')
        + labs(y='Adjusted minutes delayed', x='Date')
    )

 
def plotcarOTmpf(lowT, hiT, df, N):
    """Line plot of monthly adjusted delay per arrival for the top carriers.

    Uses the table built by ``carovertime`` for the window [lowT, hiT] and
    draws 'mpf' against 'datetime', with one thin line per carrier; returns
    the ggplot object.
    """
    monthly = carovertime(lowT, hiT, df, N)
    per_carrier = aes(group='carrier', color='carrier')
    return (
        ggplot(monthly, aes(x='datetime', y='mpf', fill='carrier'))
        + geom_point(size=2)
        + geom_line(per_carrier, size=0.1)
        + theme(text=element_text(size=5))
        + ggtitle('Minutes delayed per arrival of carriers over time')
        # 'mpf' is a per-arrival ratio, so label the axis accordingly
        # (the label used to read 'Adjusted minutes delayed').
        + labs(y='Adjusted minutes delayed per arrival', x='Date')
    )

