### Importation of Libraries 

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt

### Data Reading and Wrangling

##### Airlines and Airport Datasets

In [2]:
airlines = pd.read_csv('data/airlines.csv')
airlines

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways
5,OO,Skywest Airlines Inc.
6,AS,Alaska Airlines Inc.
7,NK,Spirit Air Lines
8,WN,Southwest Airlines Co.
9,DL,Delta Air Lines Inc.


In [3]:
airports = pd.read_csv('data/airports.csv')
airports.columns = airports.columns.str.lower()
airports # This dataset will be merged with the flights dataset since it contains information 
         # that is of importance to our analysis and ultimate regression model that will be based 
         # off of the flights dataset. 

Unnamed: 0,iata_code,airport,city,state,country,latitude,longitude
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447
...,...,...,...,...,...,...,...
317,WRG,Wrangell Airport,Wrangell,AK,USA,56.48433,-132.36982
318,WYS,Westerly State Airport,West Yellowstone,MT,USA,44.68840,-111.11764
319,XNA,Northwest Arkansas Regional Airport,Fayetteville/Springdale/Rogers,AR,USA,36.28187,-94.30681
320,YAK,Yakutat Airport,Yakutat,AK,USA,59.50336,-139.66023


##### Flights Dataset (and Merging of aforementioned dataset with Airports dataset)

In [None]:
flights = pd.read_csv('data/flights.csv')
flights.columns = flights.columns.str.lower()
flights = flights.merge(right=airports, how='left', left_on='origin_airport', right_on='iata_code')

  flights = pd.read_csv('data/flights.csv')


In [None]:
X = flights[flights.columns.difference([
    'year', # all are 2015
    'flight_number', # irrelevant
    'tail_number', # irrelevant
    'departure_time', # collinearity: scheduled_departure + departure_delay
    'wheels_off', # co: departure_time + taxi_out
    'elapsed_time', # co: scheduled_time + departure_delay
    'air_time', # co: elapsed_time - taxi_out - taxi_in
    'wheels_on', # co: arrival_time - taxi_in
    # leave in arrival_time because of time changes
    'arrival_delay', # co: departure_delay
    'diverted', # occurs after delay, can't be used to predict delay time
    'cancelled', # occurs after delay,
    'cancellation_reason',
    'air_system_delay',
    'security_delay',
    'airline_delay',
    'late_aircraft_delay',
    'weather_delay',
    'iata_code', # same as origin_airport
    'airport', # airport already described in origin_airport
    'country' # all are USA
])]

### Data Cleaning and Preparation

In [None]:
X.iloc[:,7:9].astype('category')
flights_sample = X.sample(n=10000)
flights_sample.dtypes

In [None]:
flights_sample.columns

### Statistical Analysis 

#### Correleation Analysis

In [None]:
# Create correlation matrix
corr_matrix = flights_sample.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.30
highcorr_matrix = upper[(upper > 0.45)]

highcorr_mat = highcorr_matrix.dropna(how='all')
highcorr_mat

#### Analysis of Potential Variable Interactions and Transformations 

In [None]:
flights_sample.columns

##### Plots of Predictors Against Response Variable

In [None]:
#Jittering points to better see the density of points in any given region of the plot
def jitter(values,j):
    return values + np.random.normal(j,0.02,values.shape)

for i in flights_sample.columns:
    if (flights_sample[str(i)].dtype == 'int64' or flights_sample[str(i)].dtype == 'float64') and (str(i) != 'departure_delay') :
        plt.figure(figsize=(12,9), layout = 'constrained')
        plt.scatter(jitter(flights_sample[i],0),jitter(flights_sample['departure_delay'],0))
        plt.title(str(i) + ' vs. Departure Delay')
        plt.xlabel(str(i))
        plt.ylabel('Departure Delay')
        # Show/save figure as desired.
        plt.show()

##### Residual Plots of Predictors Against Response Variable

##### Distribution of Predictors

In [None]:
#Jittering points to better see the density of points in any given region of the plot
def jitter(values,j):
    return values + np.random.normal(j,0.02,values.shape)

for i in flights_sample.columns:
    if (flights_sample[str(i)].dtype == 'int64' or flights_sample[str(i)].dtype == 'float64'):
        sns.set(rc = {'figure.figsize':(20,12)})
        sns.set(font_scale = 2)
        sns.histplot(flights_sample[str(i)],kde=True)
        plt.show()


##### 2D Histogram Binning 

In [None]:
import plotly.graph_objects as go

import numpy as np

x = np.random.randn(10)
y = np.random.randn(10)+1

fig = go.Figure(go.Histogram2d(x=x, y=y, histnorm='probability',
        autobinx=False,
        xbins=dict(start=-3, end=3, size=0.1),
        autobiny=False,
        ybins=dict(start=-2.5, end=4, size=0.1),
        colorscale=[[0, 'rgb(12,51,131)'], [0.25, 'rgb(10,136,186)'], [0.5, 'rgb(242,211,56)'], [0.75, 'rgb(242,143,56)'], [1, 'rgb(217,30,30)']]
    ))
fig.show()

### Linear Regression Model Generation

In [None]:
model = smf.ols(formula='departure_delay ~  taxi_out + longitude + scheduled_time + month + day', data=flights).fit()
model.summary()

In [None]:
flights_sample.columns

In [None]:
model = smf.ols(formula= 'departure_delay ~  month*day + I(taxi_out**2) + I(taxi_in**2) + distance + I(distance**2) + scheduled_departure + longitude + origin_airport', data=flights_sample).fit()
model.summary()

In [None]:
flights_sample.origin_airport.astype('category')

In [None]:
flights_sample

In [None]:
flights_sample.dtypes