# Regression Problem - Feature Engineering & Selection

In [1]:
# The essentials 
import json
import pandas as pd
import numpy as np

# Some custom functions
from functions import dummify

In [2]:
# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE, VarianceThreshold


In [3]:
import warnings
warnings.filterwarnings('ignore')





## The list of desired features:

* carrier_on_time_pct -  The percentage of time that flights from a given carrier arrive early or on time.
* carrier_mean_delay = The average arrival delta for the carrier operating the flight.
* plant_on_time_pct - The percentage of time that a given plane, based on tail number, arrives early or on time.
* origin_on_time_pct - The percentage of time that a plane departing a given airport arrives on time
#### * **dest_on_time_pct - The percentage of time that a plane arriving at a given airport arrives on time**  ----> Needs to be added
* origin_mean_delay - The average arrival delta for a plane leaving from a given airport
#### * **dest_mean_delay - The average arrival delta for a plane arriving at a given airport.** ----> Needs to be added
* fl_num_on_time_pct - The percentage of time that a given flight number arrives on time.
* route_on_time_pct - The percentage of time that a plane flying a given route arrives on time.
* carrier_fuel_consumption - The amount of fuel used per passenger, per mile travelled
* mean_taxi_out - The average taxi time for a plane leaving a given airport
* mean_taxi_in - The average taxi time for a plane arriving at a given airport

### Let's load the data with any of the changes made during our exploratory data analysis

In [4]:
data = pd.read_csv('data/post_explore.csv')

In [5]:
# Put our regression target variable aside for now.

target = data['arr_delay']

Let's start off by loading the dictionaries created during the exploratory data analysis phase. <br>
These will be used to engineer new features in the following cells.

In [6]:
# Carrier on-time percent
with open(f'data/dicts/carrier_pct.json') as my_file:
    temp = (my_file.read())
    carrier_pct = json.loads(temp)
    
# Plane(tail number) on-time percent
with open(f'data/dicts/plane_pct.json') as my_file:
    temp = (my_file.read())
    plane_pct = json.loads(temp)
    
# On-time percent for planes leaving origin airport
with open(f'data/dicts/origin_pct.json') as my_file:
    temp = (my_file.read())
    origin_pct = json.loads(temp)

# On-time percent for flight number
with open(f'data/dicts/fl_num_pct.json') as my_file:
    temp = (my_file.read())
    fl_num_pct = json.loads(temp)

# On-time percent for route
with open(f'data/dicts/route_pct.json') as my_file:
    temp = (my_file.read())
    route_pct = json.loads(temp)

# Fuel consumption per passenger per mile by carrier 
with open(f'data/dicts/fuel_dict.json') as my_file:
    temp = (my_file.read())
    fuel_dict = json.loads(temp)
    
# Mean taxi-out time for each airport
with open(f'data/dicts/mean_taxi_out.json') as my_file:
    temp = (my_file.read())
    mean_taxi_out = json.loads(temp)

# Mean taxi-in time for each airport
with open(f'data/dicts/mean_taxi_in.json') as my_file:
    temp = (my_file.read())
    mean_taxi_in = json.loads(temp)
    
# Mean arrival delay of each carrier
with open(f'data/dicts/carrier_mean_delay.json') as my_file:
    temp = (my_file.read())
    carrier_mean_delay = json.loads(temp)
    
# Mean arrival delay for each airport
with open(f'data/dicts/origin_mean_delay.json') as my_file:
    temp = (my_file.read())
    origin_mean_delay = json.loads(temp)



In [7]:
# Need to change the data type of mkt_carrier_fl_num to allow for proper mapping

data['mkt_carrier_fl_num'] = data['mkt_carrier_fl_num'].astype(str)

In [8]:
# Map the engineered features to the data

data['carrier_on_time_pct'] = data['mkt_unique_carrier'].map(carrier_pct) # How often is the carrier on time
data['carrier_mean_delay'] = data['mkt_unique_carrier'].map(carrier_mean_delay) # How far off schedule is this carrier
data['plane_on_time_pct'] = data['tail_num'].map(plane_pct) # How often is this plane on time
data['origin_on_time_pct'] = data['origin'].map(origin_pct) # How often are planes that leave this airport on time
data['origin_mean_delay'] = data['origin'].map(origin_mean_delay) # Origin mean delay
data['fl_num_on_time_pct'] = data['mkt_carrier_fl_num'].map(fl_num_pct) # How often is the flight on time
data['route_on_time_pct'] = data['route'].map(route_pct) # How often is the route on time
data['route_on_time_pct'] = data['route_on_time_pct'].replace({np.inf: 1}) 
data['carrier_fuel_consumption'] = data['mkt_unique_carrier'].map(fuel_dict) #Carrier fuel consumption

# Add mean taxi out to the dataset
data['mean_taxi_out'] = data['origin'].map(mean_taxi_out).rename({'mean_taxi_out': 'origin_mean_taxi_out'})
data['mean_taxi_in'] = data['origin'].map(mean_taxi_in).rename({'mean_taxi_in': 'dest_mean_taxi_in'})

In [9]:
data.head()

Unnamed: 0.1,Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,tail_num,origin,dest,crs_dep_time,crs_arr_time,arr_delay,cancelled,...,carrier_on_time_pct,carrier_mean_delay,plane_on_time_pct,origin_on_time_pct,origin_mean_delay,fl_num_on_time_pct,route_on_time_pct,carrier_fuel_consumption,mean_taxi_out,mean_taxi_in
0,0,NK,393,N506NK,ATL,MSY,1541,1618,-7.0,0.0,...,0.782703,-8.768192,0.71875,0.813196,-8.375575,0.784,0.768194,3.57894e-07,15.823849,6.135301
1,2,NK,443,N681NK,BWI,FLL,2006,2245,13.0,0.0,...,0.782703,-8.768192,0.785088,0.764693,-8.499853,0.786408,0.686154,3.57894e-07,14.308257,6.756612
2,3,NK,445,N504NK,CMH,FLL,600,839,-5.0,0.0,...,0.782703,-8.768192,0.773196,0.74951,-8.277317,0.769231,0.740458,3.57894e-07,16.684853,8.97061
3,4,NK,446,N621NK,LAS,ORD,835,1413,-6.0,0.0,...,0.782703,-8.768192,0.833333,0.743262,-6.651488,0.697318,0.784884,3.57894e-07,15.204628,7.547092
4,5,NK,446,N621NK,ORD,ATL,1503,1803,6.0,0.0,...,0.782703,-8.768192,0.833333,0.725778,-6.686725,0.697318,0.725821,3.57894e-07,22.281494,6.616842


In [10]:
# Let's choose the columns that we will be using for regression modeling.
modeling_cols = ['distance',
                 'origin_weather_condition',
                 'origin_weather_severity',
                 'dest_weather_condition',
                 'dest_weather_severity',
                 'week_day',
                 'crs_dep_hour',
                 'crs_arr_hour',
                 'carrier_on_time_pct',
                 'plane_on_time_pct',
                  'origin_mean_delay',
                  'carrier_mean_delay',
                 'origin_on_time_pct',
                 'fl_num_on_time_pct',
                 'route_on_time_pct',
                 'carrier_fuel_consumption',
                 'mean_taxi_out',
                 'mean_taxi_in']

data = data[modeling_cols]

In [11]:
# Get lists of numeric columns and object columns to facilitate making dummy variables and scaling

numeric_cols = ['distance',
               'crs_dep_hour',
               'crs_arr_hour',
               'carrier_on_time_pct',
               'plane_on_time_pct',
               'fl_num_on_time_pct',
               'origin_on_time_pct',
               'route_on_time_pct',
               'mean_taxi_out', 
                'mean_taxi_in',
               'carrier_fuel_consumption',
                'carrier_mean_delay',
                'origin_mean_delay'
               ]

obj_cols = data.drop(columns = numeric_cols).columns.to_list()
obj_cols

['origin_weather_condition',
 'origin_weather_severity',
 'dest_weather_condition',
 'dest_weather_severity',
 'week_day']

In [12]:
# Create Dummy variables for the object columns

for col in obj_cols:
    temp_df = dummify(data, col)
    data = data.merge(temp_df, 
                               left_index = True, 
                               right_index = True)
    
# Drop object columns after dummies have been created

data = data.drop(columns = obj_cols)
data.head()

Unnamed: 0,distance,crs_dep_hour,crs_arr_hour,carrier_on_time_pct,plane_on_time_pct,origin_mean_delay,carrier_mean_delay,origin_on_time_pct,fl_num_on_time_pct,route_on_time_pct,...,type_is_Light_y,type_is_Moderate_y,type_is_Severe_y,type_is_Friday,type_is_Monday,type_is_Saturday,type_is_Sunday,type_is_Thursday,type_is_Tuesday,type_is_Wednesday
0,425.0,15,16,0.782703,0.71875,-8.375575,-8.768192,0.813196,0.784,0.768194,...,1,0,0,0,0,1,0,0,0,0
1,925.0,20,22,0.782703,0.785088,-8.499853,-8.768192,0.764693,0.786408,0.686154,...,1,0,0,0,0,1,0,0,0,0
2,973.0,6,8,0.782703,0.773196,-8.277317,-8.768192,0.74951,0.769231,0.740458,...,1,0,0,0,0,1,0,0,0,0
3,1514.0,8,14,0.782703,0.833333,-6.651488,-8.768192,0.743262,0.697318,0.784884,...,1,0,0,0,0,1,0,0,0,0
4,606.0,15,18,0.782703,0.833333,-6.686725,-8.768192,0.725778,0.697318,0.725821,...,1,0,0,0,0,1,0,0,0,0


In [13]:
# Apply scaling to the numeric columns

scaler = StandardScaler()
numerics_scaled = scaler.fit_transform(data[numeric_cols])

# Rename columns from scaling back to their original names, merge with data

numeric_dict = {0: 'distance',
               1: 'crs_dep_hour',
               2: 'crs_arr_hour',
               3: 'carrier_on_time_pct',
               4: 'plane_on_time_pct',
               5: 'fl_num_on_time_pct',
               6: 'origin_on_time_pct',
               7: 'route_on_time_pct',
               8: 'origin_taxi_out',
               9: 'carrier_fuel_consumption',
               10: 'mean_taxi_in',
               11: 'carrier_mean_delay',
               12: 'origin_mean_delay'}

numerics_scaled = pd.DataFrame(numerics_scaled).rename(columns = numeric_dict)
data = data.drop(columns = numeric_cols)
data = data.merge(numerics_scaled, left_index=True, right_index=True)

In [14]:
# Add our target variable back to the dataframe, check it
data['arr_delay'] = target
data.head()

Unnamed: 0,type_is_Adverse_x,type_is_Clear_x,type_is_Light_x,type_is_Moderate_x,type_is_Severe_x,type_is_Adverse_y,type_is_Clear_y,type_is_Light_y,type_is_Moderate_y,type_is_Severe_y,...,plane_on_time_pct,fl_num_on_time_pct,origin_on_time_pct,route_on_time_pct,origin_taxi_out,carrier_fuel_consumption,mean_taxi_in,carrier_mean_delay,origin_mean_delay,arr_delay
0,0,1,1,0,0,0,1,1,0,0,...,-0.762141,0.367852,1.15577,0.042167,-0.475455,-1.159495,1.081105,-0.19294,-0.011615,-7.0
1,1,0,1,0,0,1,0,1,0,0,...,0.327712,0.414815,-0.010761,-1.090683,-0.932123,-0.710796,1.081105,-0.19294,-0.058432,13.0
2,0,1,1,0,0,0,1,1,0,0,...,0.132343,0.079782,-0.375917,-0.340825,-0.216023,0.888111,1.081105,-0.19294,0.025399,-5.0
3,0,1,1,0,0,0,1,1,0,0,...,1.120333,-1.322857,-0.526199,0.272626,-0.662034,-0.139927,1.081105,-0.19294,0.637861,-6.0
4,0,1,1,0,0,1,0,1,0,0,...,1.120333,-1.322857,-0.946699,-0.542943,1.47032,-0.811735,1.081105,-0.19294,0.624587,6.0


In [15]:
data.isna().sum()

type_is_Adverse_x           0
type_is_Clear_x             0
type_is_Light_x             0
type_is_Moderate_x          0
type_is_Severe_x            0
type_is_Adverse_y           0
type_is_Clear_y             0
type_is_Light_y             0
type_is_Moderate_y          0
type_is_Severe_y            0
type_is_Friday              0
type_is_Monday              0
type_is_Saturday            0
type_is_Sunday              0
type_is_Thursday            0
type_is_Tuesday             0
type_is_Wednesday           0
distance                    0
crs_dep_hour                0
crs_arr_hour                0
carrier_on_time_pct         0
plane_on_time_pct           0
fl_num_on_time_pct          0
origin_on_time_pct          0
route_on_time_pct           0
origin_taxi_out             0
carrier_fuel_consumption    0
mean_taxi_in                0
carrier_mean_delay          0
origin_mean_delay           0
arr_delay                   0
dtype: int64

In [16]:
# Save dataframe to csv
data.to_csv('data/regression_data.csv')

# Multi-Class Classification Problem - Feature Engineering & Selection

Will use this section to engineer the target variable that is needed for the stretch multiclass classification goal

In [17]:
# Open the csv again
class_data = pd.read_csv('data/post_explore.csv')

In [18]:
# Create a column of zeroes, select the delay columns we want to use as the target.
class_data['zeroes'] = 0
delay_types = class_data[['zeroes','carrier_delay','weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']]

delay_types.shape

(1079053, 6)

In [19]:
# Use the pandas idxmax function to select the column containing the greatest value across each row.
# This allows us to select the biggest contributor to a flight's delay

delay_types['delay_reason'] = delay_types.idxmax(axis = 'columns')
delay_types['delay_reason'] = delay_types['delay_reason'].replace({'zeroes': 'no_delay/undefined'})

In [20]:
# Drop the columns that we no longer need.  Check that the values obtained make sense
delay_types.drop(columns=['zeroes','carrier_delay','weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'], inplace=True)
delay_types.value_counts()

delay_reason       
no_delay/undefined     1012949
nas_delay                49312
carrier_delay             9586
late_aircraft_delay       6265
weather_delay              803
security_delay             138
dtype: int64

In [21]:
# Create our multiclass dataframe, check it
mc_data = data.merge(delay_types, left_index = True, right_index = True)
mc_data.drop('arr_delay',axis=1, inplace=True)
mc_data.head()

Unnamed: 0,type_is_Adverse_x,type_is_Clear_x,type_is_Light_x,type_is_Moderate_x,type_is_Severe_x,type_is_Adverse_y,type_is_Clear_y,type_is_Light_y,type_is_Moderate_y,type_is_Severe_y,...,plane_on_time_pct,fl_num_on_time_pct,origin_on_time_pct,route_on_time_pct,origin_taxi_out,carrier_fuel_consumption,mean_taxi_in,carrier_mean_delay,origin_mean_delay,delay_reason
0,0,1,1,0,0,0,1,1,0,0,...,-0.762141,0.367852,1.15577,0.042167,-0.475455,-1.159495,1.081105,-0.19294,-0.011615,no_delay/undefined
1,1,0,1,0,0,1,0,1,0,0,...,0.327712,0.414815,-0.010761,-1.090683,-0.932123,-0.710796,1.081105,-0.19294,-0.058432,no_delay/undefined
2,0,1,1,0,0,0,1,1,0,0,...,0.132343,0.079782,-0.375917,-0.340825,-0.216023,0.888111,1.081105,-0.19294,0.025399,no_delay/undefined
3,0,1,1,0,0,0,1,1,0,0,...,1.120333,-1.322857,-0.526199,0.272626,-0.662034,-0.139927,1.081105,-0.19294,0.637861,no_delay/undefined
4,0,1,1,0,0,1,0,1,0,0,...,1.120333,-1.322857,-0.946699,-0.542943,1.47032,-0.811735,1.081105,-0.19294,0.624587,no_delay/undefined


In [22]:
# Save multiclass df to CSV
mc_data.to_csv('data/multiclass_data.csv')