In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Carrier lookup table (IATA code -> airline name); displayed in full
# because it only has a handful of rows.
airlines = pd.read_csv(filepath_or_buffer='data/airlines.csv')
airlines

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [3]:
# Airport reference table. Headers are lower-cased in the same step as the
# load so the frame is never left in a half-normalised state.
airports = pd.read_csv('data/airports.csv').rename(columns=str.lower)
airports

Unnamed: 0,iata_code,airport,city,state,country,latitude,longitude
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


In [20]:
# Load the flight records and attach origin-airport metadata.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference can leave one column holding a
# mix of ints and strings and emits a DtypeWarning when it does.
# NOTE(review): the truncated warning in this cell's saved output looks like
# that DtypeWarning -- confirm it disappears after this change.
flights = pd.read_csv('data/flights.csv', low_memory=False)
flights.columns = flights.columns.str.lower()

# Left join keeps every flight, including those whose origin code has no match
# in `airports` (their airport fields become NaN). validate= fails fast if
# `airports` ever contains a duplicated iata_code, which would otherwise
# silently duplicate flight rows.
flights = flights.merge(
    right=airports,
    how='left',
    left_on='origin_airport',
    right_on='iata_code',
    validate='many_to_one',
)
flights

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,year,month,day,day_of_week,airline,flight_number,tail_number,origin_airport,destination_airport,scheduled_departure,...,airline_delay,late_aircraft_delay,weather_delay,iata_code,airport,city,state,country,latitude,longitude
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,,,,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,,,,LAX,Los Angeles International Airport,Los Angeles,CA,USA,33.94254,-118.40807
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,,,,SFO,San Francisco International Airport,San Francisco,CA,USA,37.61900,-122.37484
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,,,,LAX,Los Angeles International Airport,Los Angeles,CA,USA,33.94254,-118.40807
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,,,,SEA,Seattle-Tacoma International Airport,Seattle,WA,USA,47.44898,-122.30931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,2015,12,31,4,B6,688,N657JB,LAX,BOS,2359,...,,,,LAX,Los Angeles International Airport,Los Angeles,CA,USA,33.94254,-118.40807
5819075,2015,12,31,4,B6,745,N828JB,JFK,PSE,2359,...,,,,JFK,John F. Kennedy International Airport (New Yor...,New York,NY,USA,40.63975,-73.77893
5819076,2015,12,31,4,B6,1503,N913JB,JFK,SJU,2359,...,,,,JFK,John F. Kennedy International Airport (New Yor...,New York,NY,USA,40.63975,-73.77893
5819077,2015,12,31,4,B6,333,N527JB,MCO,SJU,2359,...,,,,MCO,Orlando International Airport,Orlando,FL,USA,28.42889,-81.31603


In [21]:
# Inspect the dtype pandas assigned to each column of the merged flights frame.
flights.dtypes

year                     int64
month                    int64
day                      int64
day_of_week              int64
airline                 object
flight_number            int64
tail_number             object
origin_airport          object
destination_airport     object
scheduled_departure      int64
departure_time         float64
departure_delay        float64
taxi_out               float64
wheels_off             float64
scheduled_time         float64
elapsed_time           float64
air_time               float64
distance                 int64
wheels_on              float64
taxi_in                float64
scheduled_arrival        int64
arrival_time           float64
arrival_delay          float64
diverted                 int64
cancelled                int64
cancellation_reason     object
air_system_delay       float64
security_delay         float64
airline_delay          float64
late_aircraft_delay    float64
weather_delay          float64
iata_code               object
airport 

In [34]:
# Predictor matrix: every column of `flights` except the labels listed below.
# Index.difference hands back the remaining labels in sorted order, so the
# columns of X come out alphabetically.
X = flights.loc[:, flights.columns.difference([
    # --- response, constants and identifiers ---
    'departure_delay',      # the response variable
    'year',                 # every row is 2015
    'flight_number',        # considered irrelevant
    'tail_number',          # considered irrelevant

    # --- timing fields the author treats as collinear with other columns ---
    'departure_time',       # scheduled_departure + departure_delay
    'wheels_off',           # departure_time + taxi_out
    'elapsed_time',         # scheduled_time + departure_delay
    'air_time',             # elapsed_time - taxi_out - taxi_in
    'wheels_on',            # arrival_time - taxi_in
    'arrival_delay',        # collinear with departure_delay
    # (arrival_time is deliberately kept, because of time changes)

    # --- events that occur after the delay, so cannot predict it ---
    'diverted',
    'cancelled',

    # --- cancellation / delay-cause columns (dropped without a stated reason) ---
    'cancellation_reason',
    'air_system_delay',
    'security_delay',
    'airline_delay',
    'late_aircraft_delay',
    'weather_delay',

    # --- redundant airport metadata brought in by the merge ---
    'iata_code',            # same value as origin_airport
    'airport',              # airport is already identified by origin_airport
    'country',              # every row is USA
])]

In [37]:
# Response vector: the departure_delay column of the flights frame.
y = flights['departure_delay']

In [36]:
# Preview the first five rows of the predictor matrix.
X.head(n=5)

Unnamed: 0,airline,arrival_time,city,day,day_of_week,destination_airport,distance,latitude,longitude,month,origin_airport,scheduled_arrival,scheduled_departure,scheduled_time,state,taxi_in,taxi_out
0,AS,408.0,Anchorage,1,4,SEA,1448,61.17432,-149.99619,1,ANC,430,5,205.0,AK,4.0,21.0
1,AA,741.0,Los Angeles,1,4,PBI,2330,33.94254,-118.40807,1,LAX,750,10,280.0,CA,4.0,12.0
2,US,811.0,San Francisco,1,4,CLT,2296,37.619,-122.37484,1,SFO,806,20,286.0,CA,11.0,16.0
3,AA,756.0,Los Angeles,1,4,MIA,2342,33.94254,-118.40807,1,LAX,805,20,285.0,CA,8.0,15.0
4,AS,259.0,Seattle,1,4,ANC,1448,47.44898,-122.30931,1,SEA,320,25,235.0,WA,5.0,11.0


In [40]:
# Preview the first five values of the response.
y.head(n=5)

0   -11.0
1    -8.0
2    -2.0
3    -5.0
4    -1.0
Name: departure_delay, dtype: float64

In [8]:
# Correlation of every numeric column with departure_delay, largest first
# (NaN results, e.g. for a constant column, sort to the end).
#
# The numeric columns are selected explicitly: from pandas 2.0 on, corrwith
# defaults to numeric_only=False and no longer drops object columns such as
# airline or tail_number on its own, so the bare call fails on this frame.
(
    flights
    .select_dtypes(include='number')
    .corrwith(flights['departure_delay'])
    .sort_values(ascending=False)
)

departure_delay        1.000000
arrival_delay          0.944672
airline_delay          0.621296
late_aircraft_delay    0.554802
weather_delay          0.243532
departure_time         0.171723
wheels_off             0.162582
scheduled_departure    0.110149
scheduled_arrival      0.097566
air_system_delay       0.095917
wheels_on              0.058674
taxi_out               0.058515
arrival_time           0.049236
cancelled              0.033099
longitude              0.031857
elapsed_time           0.030805
scheduled_time         0.027799
distance               0.024106
air_time               0.023495
diverted               0.022963
taxi_in                0.012784
security_delay         0.011877
latitude              -0.003755
flight_number         -0.008665
day_of_week           -0.011510
year                        NaN
dtype: float64

In [9]:
# Baseline OLS fit of departure delay.
#
# month and day are wrapped in C() so they are always treatment-coded as
# categoricals. The saved summary shows month[T.*] dummy terms, but the
# columns load as int64, so without C() a fresh run would fit each of them as
# a single linear term and produce a different model from the one displayed.
# NOTE(review): day being categorical is inferred from "Df Model: 44" in the
# saved summary -- confirm that is the intended specification.
# NOTE(review): the feature-selection cell describes departure_time as
# scheduled_departure + departure_delay, so it may leak the response here --
# confirm whether it belongs in this formula.
model = smf.ols(
    formula=(
        'departure_delay ~ departure_time + taxi_out + longitude'
        ' + scheduled_time + C(month) + C(day)'
    ),
    data=flights,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,departure_delay,R-squared:,0.04
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,4968.0
Date:,"Tue, 28 Feb 2023",Prob (F-statistic):,0.0
Time:,21:28:38,Log-Likelihood:,-26329000.0
No. Observations:,5241724,AIC:,52660000.0
Df Residuals:,5241679,BIC:,52660000.0
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.8920,0.149,-46.249,0.000,-7.184,-6.600
month[T.10],2.533e-12,5.18e-14,48.876,0.000,2.43e-12,2.64e-12
month[T.11],-2.8187,0.077,-36.755,0.000,-2.969,-2.668
month[T.12],1.9017,0.076,24.923,0.000,1.752,2.051
month[T.2],1.8620,0.079,23.471,0.000,1.707,2.018
month[T.3],-0.1708,0.075,-2.264,0.024,-0.319,-0.023
month[T.4],-2.0195,0.076,-26.571,0.000,-2.168,-1.871
month[T.5],-0.2455,0.076,-3.250,0.001,-0.394,-0.097
month[T.6],4.0600,0.075,53.805,0.000,3.912,4.208

0,1,2,3
Omnibus:,7122559.352,Durbin-Watson:,1.876
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3535538747.685
Skew:,7.599,Prob(JB):,0.0
Kurtosis:,129.321,Cond. No.,2260000000000000.0


In [11]:
# Show the row labelled 0 as a Series, to eyeball every field of one record.
flights.loc[0]

year                                                          2015
month                                                            1
day                                                              1
day_of_week                                                      4
airline                                                         AS
flight_number                                                   98
tail_number                                                 N407AS
origin_airport                                                 ANC
destination_airport                                            SEA
scheduled_departure                                              5
departure_time                                              2354.0
departure_delay                                              -11.0
taxi_out                                                      21.0
wheels_off                                                    15.0
scheduled_time                                               2