## `Flight Delay Prediction`
### **Part 2:** Data Cleaning and Feature Selection

Ali Bahrami

In [32]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

%matplotlib inline


In [None]:
# Read the extracted evaluation (test) flights from CSV; the path is relative to the
# notebook's working directory.
df = pd.read_csv("data/flights_test.csv")

In [35]:
# Apply to the evaluation data the same operations that were performed on the training data.
# To save time, df_flights in this notebook is taken from flights_test.
# Work on a copy so the frame loaded into `df` is not modified by the steps below.
df_flights = df.copy()
df_flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [36]:
# Carriers are given only as IATA codes; build a code -> full airline name lookup
# from the reference file.
airlines = pd.read_csv("data/airlines.csv")
abbr_airlines = dict(zip(airlines['IATA_CODE'], airlines['AIRLINE']))

# Add the full airline name as a new column (replace leaves codes that are not in
# the lookup unchanged).
df_flights['carrier'] = df_flights['mkt_unique_carrier'].replace(abbr_airlines)

# Parse the flight date, then derive the month and the day of week (Monday=0).
df_flights['fl_date'] = pd.to_datetime(df_flights['fl_date'], format='%Y-%m-%d')
df_flights['month'] = df_flights['fl_date'].dt.month
df_flights['day_of_week'] = df_flights['fl_date'].dt.dayofweek




In [37]:

def bin_hours(time):
    """Bin a time given in HHMM format into its hour of the day.

    Examples: 1810 -> 18, 915 -> 9, 45 -> 0 (12am), 2359 -> 23 (11pm).
    The value is only integer-divided by 100, so an input of 2400 yields 24.

    Parameters
    ----------
    time : int, str, numpy scalar or numpy.ndarray
        Time in HHMM format. Anything accepted by ``int()`` works for scalars;
        arrays are converted with ``astype(int)`` and binned element-wise.

    Returns
    -------
    int or numpy.ndarray
        The hour bin as a Python ``int`` for scalar input, or an integer array
        of hour bins for array input.
    """
    # The previous check compared type(time) with the *string* 'numpy.ndarray',
    # which is never equal, so array input was never handled. isinstance is the
    # correct test; arrays are binned in one vectorised step.
    if isinstance(time, np.ndarray) and time.ndim > 0:
        return time.astype(int) // 100

    # Scalar path: integer division by 100 drops the minutes (0..59 -> hour 0).
    return int(time) // 100

# Replace the scheduled departure and arrival times (HHMM) with their hour bins
for time_col in ('crs_dep_time', 'crs_arr_time'):
    df_flights.loc[:, time_col] = df_flights[time_col].apply(bin_hours)



In [38]:
# Load the historical mean departure / arrival delay tables from CSV.
# They are joined onto the flights on crs_dep_time and crs_arr_time respectively.
mean_hist_dep_delay = pd.read_csv("data/mean_hist_dep_delay.csv")
mean_hist_arr_delay = pd.read_csv("data/mean_hist_arr_delay.csv")

In [39]:
# Attach the historical mean delays by scheduled departure hour, then by scheduled
# arrival hour. Left joins keep every flight row.
# validate='many_to_one' makes pandas raise a MergeError if a lookup table contains
# duplicate keys; without it such duplicates would silently duplicate flight rows.
merged = pd.merge(df_flights, mean_hist_dep_delay, on=['crs_dep_time'],
                  how='left', validate='many_to_one')
df_flights = pd.merge(merged, mean_hist_arr_delay, on=['crs_arr_time'],
                      how='left', validate='many_to_one')

In [40]:
# Preview the first rows to check the columns added so far (carrier, month, day_of_week, mean delays).
df_flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_arr_time,dup,crs_elapsed_time,flights,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,...,19,N,95,1,363,Southwest Airlines,1,2,10.983196,5.387877
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,...,13,N,90,1,363,Southwest Airlines,1,2,4.860123,-0.927422
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,...,21,N,70,1,333,Southwest Airlines,1,2,11.688123,5.265583
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,...,14,N,75,1,333,Southwest Airlines,1,2,6.750271,0.053946
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,...,10,N,80,1,333,Southwest Airlines,1,2,3.246698,-3.180434


In [41]:
# air_time was included among the features by mistake. The evaluation data loaded
# above has no air_time column, so to complete the assignment every row is given a
# mean value as a stop-gap.
# TODO: this needs to be fixed properly (remove air_time from the features).
# NOTE(review): presumably this is the mean air_time of the training data - confirm.
MEAN_AIR_TIME = 107.6973116856836

# DataFrame.insert raises ValueError when the column already exists, so guard the
# call to keep this cell safe to re-run.
if "air_time" not in df_flights.columns:
    df_flights.insert(6, "air_time", MEAN_AIR_TIME)

In [42]:
# Keep only the feature columns.
# .copy() turns the selection into an independent frame; the label-encoding cells
# further down assign into its columns, which on a bare selection can trigger
# SettingWithCopyWarning.
df_flights = df_flights[[
    'origin', 'dest', 'crs_dep_time', 'crs_arr_time',
    'crs_elapsed_time', 'air_time', 'distance', 'carrier', 'month',
    'day_of_week', 'mean_hist_dep_delay', 'mean_hist_arr_delay',
]].copy()

df_flights.head()

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,ONT,SFO,18,19,95,107.697312,363,Southwest Airlines,1,2,10.983196,5.387877
1,ONT,SFO,11,13,90,107.697312,363,Southwest Airlines,1,2,4.860123,-0.927422
2,ONT,SJC,20,21,70,107.697312,333,Southwest Airlines,1,2,11.688123,5.265583
3,ONT,SJC,13,14,75,107.697312,333,Southwest Airlines,1,2,6.750271,0.053946
4,ONT,SJC,9,10,80,107.697312,333,Southwest Airlines,1,2,3.246698,-3.180434


## Label Encode

In [43]:
# Origin
# NOTE(review): the encoder is fit on the evaluation data itself. The integer codes
# only agree with the training encoding if both data sets contain exactly the same
# set of labels - confirm, or reuse the encoder fitted on the training data.
le_origin = LabelEncoder()
# Plain column assignment replaces the column, so it takes the integer dtype of the
# codes. Assigning through .loc[:, col] writes into the existing object column on
# pandas 2.x and would leave the dtype as object.
df_flights["origin"] = le_origin.fit_transform(df_flights["origin"])

In [44]:
# Dest
# NOTE(review): the encoder is fit on the evaluation data itself. The integer codes
# only agree with the training encoding if both data sets contain exactly the same
# set of labels - confirm, or reuse the encoder fitted on the training data.
le_dest = LabelEncoder()
# Plain column assignment replaces the column, so it takes the integer dtype of the
# codes. Assigning through .loc[:, col] writes into the existing object column on
# pandas 2.x and would leave the dtype as object.
df_flights["dest"] = le_dest.fit_transform(df_flights["dest"])

In [45]:
# Carrier
# NOTE(review): the encoder is fit on the evaluation data itself. The integer codes
# only agree with the training encoding if both data sets contain exactly the same
# set of labels - confirm, or reuse the encoder fitted on the training data.
le_carrier = LabelEncoder()
# Plain column assignment replaces the column, so it takes the integer dtype of the
# codes. Assigning through .loc[:, col] writes into the existing object column on
# pandas 2.x and would leave the dtype as object.
df_flights["carrier"] = le_carrier.fit_transform(df_flights["carrier"])

In [48]:
# Check that every column is numeric after label encoding.
df_flights.dtypes

origin                   int64
dest                     int64
crs_dep_time             int64
crs_arr_time             int64
crs_elapsed_time         int64
air_time               float64
distance                 int64
carrier                  int64
month                    int64
day_of_week              int64
mean_hist_dep_delay    float64
mean_hist_arr_delay    float64
dtype: object

In [49]:
# Preview the encoded features: origin, dest and carrier now hold integer codes.
df_flights.head()

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,246,312,18,19,95,107.697312,363,7,1,2,10.983196,5.387877
1,246,312,11,13,90,107.697312,363,7,1,2,4.860123,-0.927422
2,246,318,20,21,70,107.697312,333,7,1,2,11.688123,5.265583
3,246,318,13,14,75,107.697312,333,7,1,2,6.750271,0.053946
4,246,318,9,10,80,107.697312,333,7,1,2,3.246698,-3.180434


In [50]:
# (rows, columns) of the feature frame.
df_flights.shape

(150623, 12)

## Scale the data and prepare it for modeling

In [51]:
# Take a separate copy of the encoded features for the modeling steps.
df_model = df_flights.copy()

In [54]:
# Feature matrix passed to the scaler. This binds X to the same object as df_model (no copy is made).
X = df_model

In [55]:
# Standardise every feature column.
# fit_transform fits the scaler and transforms X in one call; the separate
# scaler.fit(X) that used to precede it fitted the same data a second time for
# no effect and has been dropped.
# NOTE(review): the scaler is fit on the evaluation data, so its means and scales
# come from this data set rather than from training - confirm whether the scaler
# fitted on the training data should be reused here instead.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(X)

In [56]:
# List the feature column names, in order.
df_flights.columns

Index(['origin', 'dest', 'crs_dep_time', 'crs_arr_time', 'crs_elapsed_time',
       'air_time', 'distance', 'carrier', 'month', 'day_of_week',
       'mean_hist_dep_delay', 'mean_hist_arr_delay'],
      dtype='object')

In [57]:
# Wrap the scaled NumPy array back into a DataFrame.
# The column labels come from X, the frame that was scaled, instead of a hand-copied
# list, so the labels cannot drift out of sync with the data if the feature
# selection changes.
flights_scaled = pd.DataFrame(data=scaled_df, columns=X.columns)

In [58]:
# Display the scaled features (pandas truncates the rendering of large frames).
flights_scaled

Unnamed: 0,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,air_time,distance,carrier,month,day_of_week,mean_hist_dep_delay,mean_hist_arr_delay
0,0.701911,1.378212,1.002153,0.847341,-0.660447,-2.842171e-14,-0.717995,0.775433,0.0,-0.495615,1.205286,1.138352
1,0.701911,1.378212,-0.420511,-0.298153,-0.728836,-2.842171e-14,-0.717995,0.775433,0.0,-0.495615,-0.331726,-0.534287
2,0.701911,1.439675,1.408629,1.229172,-1.002392,-2.842171e-14,-0.768505,0.775433,0.0,-0.495615,1.382236,1.105962
3,0.701911,1.439675,-0.014036,-0.107237,-0.934003,-2.842171e-14,-0.768505,0.775433,0.0,-0.495615,0.142738,-0.274366
4,0.701911,1.439675,-0.826987,-0.870900,-0.865614,-2.842171e-14,-0.768505,0.775433,0.0,-0.495615,-0.736727,-1.131008
...,...,...,...,...,...,...,...,...,...,...,...,...
150618,-0.803562,-0.014950,0.798916,0.847341,-0.359536,-2.842171e-14,-0.472176,-0.592137,0.0,-0.991589,0.980384,1.138352
150619,-0.373427,0.189926,-1.436700,-1.443647,-0.332181,-2.842171e-14,-0.302124,-0.592137,0.0,-0.991589,-1.738414,-1.716762
150620,-1.612626,1.849429,0.798916,0.656425,-0.304825,-2.842171e-14,-0.337481,-0.592137,0.0,-0.991589,0.980384,0.922617
150621,1.848939,-1.612989,1.002153,1.229172,-0.400570,-2.842171e-14,-0.337481,-0.592137,0.0,-0.991589,1.205286,1.105962


In [59]:
# Save the scaled test data; this is the file the model will generate predictions on.
# index=False keeps the row index out of the CSV.
flights_scaled.to_csv("data/test_eval.csv", index=False)