## Part I: Data preparation and feature engineering

#### Data preparation 

In [24]:
# Import libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

#### Part 1: Training Data

(This data was extracted earlier from a database; the training set is a small sample of it.)

In [25]:
# Load the raw flight records; the training table built below is derived from this frame.
flight_delay = pd.read_csv("flight_information_dates.csv")

In [26]:
# A flight must have a positive recorded air time to be usable.
# Rows whose air_time is missing fail the comparison and are excluded as well.
flew = flight_delay['air_time'] > 0

# Cancelled or diverted flights are not considered.
interrupted = (flight_delay['cancelled'] == 1) | (flight_delay['diverted'] == 1)

# Apply both conditions in a single selection.
# (Very early arrivals are filtered later, when the model table is built.)
flight_delay = flight_delay[flew & ~interrupted]

In [27]:

# Columns carried forward into the model table
FEATURE_COLUMNS = ['fl_date', 'mkt_unique_carrier', 'crs_dep_time', 'crs_arr_time',
                   'origin', 'dest', 'arr_delay', 'distance']

# Keep only flights that arrived LESS than 60 minutes early; anything earlier is
# treated as suspect (the departure may have been misrecorded).
# Rows with a missing arr_delay fail the comparison and are dropped too.
not_too_early = flight_delay['arr_delay'] > -60

model_df = (
    flight_delay.loc[not_too_early, FEATURE_COLUMNS]
    # Early and on-time arrivals count as zero delay: where() keeps the values that
    # satisfy the condition and replaces all the others with 0.
    .assign(arr_delay=lambda frame: frame['arr_delay'].where(frame['arr_delay'] > 0, 0))
    # Every remaining missing value, in any column, becomes 0.
    .fillna(0)
)

# Integer-dividing the scheduled times by 100 keeps only the hour part
model_df['dep_hr'] = model_df['crs_dep_time'] // 100
model_df['arr_hr'] = model_df['crs_arr_time'] // 100
model_df = model_df.drop(columns=['crs_dep_time', 'crs_arr_time'])

# Bucket each hour into a part of the day:
#   05-11 -> 'morn', 12-16 -> 'aft', 17 onwards or before 05 -> 'eve'
for hr_col, part_col in (('dep_hr', 'dep_time_of_day'), ('arr_hr', 'arr_time_of_day')):
    hour = model_df[hr_col]
    # Start from the hour written as text; it only survives for a value that
    # matches none of the three buckets below (i.e. a missing hour).
    part = hour.astype('str')
    part = part.mask((hour >= 5) & (hour < 12), 'morn')
    part = part.mask((hour >= 12) & (hour < 17), 'aft')
    part = part.mask((hour >= 17) | (hour < 5), 'eve')
    model_df[part_col] = part


# Extract month (1-12) and day of week (0 = Monday) from the flight dates.
# The dates are parsed once and reused for both columns.
flight_dates = pd.DatetimeIndex(model_df['fl_date'])
model_df['month'] = flight_dates.month
model_df['weekday'] = flight_dates.weekday
model_df = model_df.drop(columns=['fl_date'])

# Aggregate into season, based on months.
# A lookup table is used instead of copying the integer month column and then
# overwriting it with strings through .loc: writing text into an integer column is
# an incompatible-dtype assignment that recent pandas versions warn about and
# newer ones reject.
SEASON_BY_MONTH = {
    12: 'win', 1: 'win', 2: 'win',
    3: 'spr', 4: 'spr', 5: 'spr',
    6: 'sum', 7: 'sum', 8: 'sum',
    9: 'aut', 10: 'aut', 11: 'aut',
}
model_df['season'] = model_df['month'].map(SEASON_BY_MONTH)

# and make ready for categorical: replace the numeric codes with short names
MONTH_NAMES = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
               7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}
WEEKDAY_NAMES = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}

model_df['month'] = model_df['month'].map(MONTH_NAMES)
model_df['weekday'] = model_df['weekday'].map(WEEKDAY_NAMES)


# Group airlines into "large" and "small" (membership chosen from the exploratory analysis).
# A carrier that appears in neither list keeps its own code as its 'carrier_size' value.
LARGE_CARRIERS = ['UA', 'AA', 'WN', 'DL', 'AS']
SMALL_CARRIERS = ['VX', 'B6', 'HA', 'F9', 'G4', 'NK']

carrier = model_df['mkt_unique_carrier']
model_df['carrier_size'] = (
    carrier
    .mask(carrier.isin(LARGE_CARRIERS), 'large')
    .mask(carrier.isin(SMALL_CARRIERS), 'small')
)


# Distance: take log10, then min-max scale the logged values.
# NOTE(review): missing values were replaced by 0 earlier in this cell, and log10(0)
# is -inf, which the scaler cannot fit - confirm that distance is always positive.
norm_dist = np.log10(model_df['distance']).to_numpy().reshape(-1, 1)  # 2-D column for the scaler

# `scaler` stays available after this cell, fitted on the training distances.
scaler = MinMaxScaler().fit(norm_dist)
model_df['norm_dist'] = scaler.transform(norm_dist).ravel()

# The raw distance is replaced by its scaled version.
model_df = model_df.drop(columns='distance')

# Cast the hour columns to strings. The time-of-day buckets were already assigned
# earlier in this cell, so this is presumably to have the hours treated as
# categorical labels rather than numbers - TODO confirm.
model_df = model_df.astype({'dep_hr': 'str', 'arr_hr': 'str'})

#If needed, we can also create a 'medium' category: F9, B6, NK, AS
 
# # You know what, while we're here, let's categorize flight distance too 
# model_df['distance_cat'] = model_df['distance']
# model_df['distance'] = pd.to_numeric(model_df['distance'] ) # Needs to be an integer for our filtering below

# model_df.loc[(model_df['distance'] < 400) , 'distance_cat' ] = 'short'
# model_df.loc[(model_df['distance'] >= 400) & (model_df['distance'] < 800), 'distance_cat'] = 'med'
# model_df.loc[(model_df['distance'] >= 800) , 'distance_cat' ] = 'long'
# model_df.drop('distance', axis=1, inplace=True)



In [28]:
# Persist the prepared training table. The row index is written as well (the
# pandas default, spelled out here), so the file starts with an unnamed index column.
model_df.to_csv("flight_train_ready.csv", index=True)


#### Part 2: Test Data

(Data taken from the provided test file, which is used to test our model.)

In [35]:

# The training data was first loaded as "flight_delay", filtered, and only then
# narrowed down to "model_df". The test file needs none of that filtering, so it is
# read straight into "model_df".
# Note: this rebinds "model_df", so the training table built in Part 1 is no longer
# reachable under that name after this cell runs.
model_df = pd.read_csv("flight_test.csv")

In [36]:
# List the columns of the test file before any feature engineering.
# NOTE(review): 'Unnamed: 0' in the output looks like a saved row index, and nothing
# in this notebook drops it - confirm it is removed before modelling.
model_df.columns

Index(['Unnamed: 0', 'fl_date', 'mkt_unique_carrier', 'origin', 'dest',
       'crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance'],
      dtype='object')

In [37]:

# Replace every missing value, in any column, with 0
model_df = model_df.fillna(0)

# Integer-dividing the scheduled times by 100 keeps only the hour part
model_df['dep_hr'] = model_df['crs_dep_time'] // 100
model_df['arr_hr'] = model_df['crs_arr_time'] // 100
model_df = model_df.drop(columns=['crs_dep_time', 'crs_arr_time'])

# Bucket each hour into a part of the day:
#   05-11 -> 'morn', 12-16 -> 'aft', 17 onwards or before 05 -> 'eve'
for hr_col, part_col in (('dep_hr', 'dep_time_of_day'), ('arr_hr', 'arr_time_of_day')):
    hour = model_df[hr_col]
    # Start from the hour written as text; it only survives for a value that
    # matches none of the three buckets below (i.e. a missing hour).
    part = hour.astype('str')
    part = part.mask((hour >= 5) & (hour < 12), 'morn')
    part = part.mask((hour >= 12) & (hour < 17), 'aft')
    part = part.mask((hour >= 17) | (hour < 5), 'eve')
    model_df[part_col] = part


# Extract month (1-12) and day of week (0 = Monday) from the flight dates.
# The dates are parsed once and reused for both columns.
flight_dates = pd.DatetimeIndex(model_df['fl_date'])
model_df['month'] = flight_dates.month
model_df['weekday'] = flight_dates.weekday
model_df = model_df.drop(columns=['fl_date'])

# Aggregate into season, based on months.
# A lookup table is used instead of copying the integer month column and then
# overwriting it with strings through .loc: writing text into an integer column is
# an incompatible-dtype assignment that recent pandas versions warn about and
# newer ones reject.
SEASON_BY_MONTH = {
    12: 'win', 1: 'win', 2: 'win',
    3: 'spr', 4: 'spr', 5: 'spr',
    6: 'sum', 7: 'sum', 8: 'sum',
    9: 'aut', 10: 'aut', 11: 'aut',
}
model_df['season'] = model_df['month'].map(SEASON_BY_MONTH)

# and make ready for categorical: replace the numeric codes with short names
MONTH_NAMES = {1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
               7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"}
WEEKDAY_NAMES = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"}

model_df['month'] = model_df['month'].map(MONTH_NAMES)
model_df['weekday'] = model_df['weekday'].map(WEEKDAY_NAMES)


# Group airlines into "large" and "small" (membership chosen from the exploratory analysis).
# A carrier that appears in neither list keeps its own code as its 'carrier_size' value.
LARGE_CARRIERS = ['UA', 'AA', 'WN', 'DL', 'AS']
SMALL_CARRIERS = ['VX', 'B6', 'HA', 'F9', 'G4', 'NK']

carrier = model_df['mkt_unique_carrier']
model_df['carrier_size'] = (
    carrier
    .mask(carrier.isin(LARGE_CARRIERS), 'large')
    .mask(carrier.isin(SMALL_CARRIERS), 'small')
)


# Distance: take log10, then scale with the scaler fitted on the TRAINING distances.
#
# `scaler` is the MinMaxScaler fitted in the Part 1 feature-engineering cell, so that
# cell must have been run in this kernel first. Fitting a fresh scaler on the test
# set would rescale it by its own min/max: the same physical distance would then map
# to different feature values in train and test, and the training scaler would be
# overwritten. Test distances outside the training range simply land outside [0, 1].
# NOTE(review): assumes nothing between Part 1 and this cell rebinds `scaler` - confirm.
norm_dist = np.log10(model_df['distance']).to_numpy().reshape(-1, 1)  # 2-D column for the scaler
model_df['norm_dist'] = scaler.transform(norm_dist).ravel()

# The raw distance is replaced by its scaled version.
model_df = model_df.drop(columns='distance')

# Cast the hour columns to strings. The time-of-day buckets were already assigned
# earlier in this cell, so this is presumably to have the hours treated as
# categorical labels rather than numbers - TODO confirm.
model_df = model_df.astype({'dep_hr': 'str', 'arr_hr': 'str'})

#If needed, we can also create a 'medium' category: F9, B6, NK, AS
 
# # You know what, while we're here, let's categorize flight distance too 
# model_df['distance_cat'] = model_df['distance']
# model_df['distance'] = pd.to_numeric(model_df['distance'] ) # Needs to be an integer for our filtering below

# model_df.loc[(model_df['distance'] < 400) , 'distance_cat' ] = 'short'
# model_df.loc[(model_df['distance'] >= 400) & (model_df['distance'] < 800), 'distance_cat'] = 'med'
# model_df.loc[(model_df['distance'] >= 800) , 'distance_cat' ] = 'long'
# model_df.drop('distance', axis=1, inplace=True)


In [39]:

# Persist the prepared test table. The row index is written as well (the pandas
# default, spelled out here), so the file starts with an unnamed index column.
model_df.to_csv("flight_test_ready.csv", index=True)