## Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
from src.modules.feature_engineering import scale_encoder, num_null_replacement
from src.modules.load_source_data import load_data_set, load_agg_data

In [2]:
# Fetch the feature frame X and the target y from the project loader
X, y = load_data_set()

# Write the target to ./data/target.csv, leaving the index out of the file
y.to_csv('./data/target.csv', index=False)

# Report the dimensions of the features first, then the target
for loaded in (X, y):
    print(loaded.shape)
print(y.shape)

(9625, 27)
(9625,)


In [3]:
# Lift the column display cap so every column of the wide frame is rendered
pd.options.display.max_columns = None

# Preview the first five rows of the raw features
X.head(5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,year,month,day,day_of_week,week_of_year,crs_dep_hour,crs_arr_hour
0,2018-10-30,UA,UA_CODESHARE,UA,3997,EV,N13903,3997,12266,IAH,"Houston, TX",13256,MFE,"Mission/McAllen/Edinburg, TX",1455,1617,N,82,1,316,2018,10,30,2,44,14,16
1,2018-10-30,F9,F9,F9,529,F9,N301FR,529,10599,BHM,"Birmingham, AL",11292,DEN,"Denver, CO",1315,1518,N,183,1,1083,2018,10,30,2,44,13,15
2,2018-10-30,AA,AA,AA,2318,AA,N167AN,2318,11298,DFW,"Dallas/Fort Worth, TX",12892,LAX,"Los Angeles, CA",1740,1853,N,193,1,1235,2018,10,30,2,44,17,18
3,2018-10-30,AA,AA,AA,2375,AA,N962NN,2375,11298,DFW,"Dallas/Fort Worth, TX",14869,SLC,"Salt Lake City, UT",1855,2048,N,173,1,989,2018,10,30,2,44,18,20
4,2018-10-31,UA,UA,UA,746,UA,N68836,746,12173,HNL,"Honolulu, HI",14771,SFO,"San Francisco, CA",1433,2236,N,303,1,2398,2018,10,31,3,44,14,22


In [4]:
# Columns removed from the raw feature frame
to_drop = [
    'mkt_carrier',
    'tail_num',
    'origin_city_name',
    'dest_city_name',
    'dup',
    'fl_date',
    'branded_code_share',
    'op_unique_carrier',
    'origin',
    'dest',
    'flights',
]

# Work on a copy so X itself is left untouched, then drop the listed columns
train = X.copy().drop(columns=to_drop)

In [5]:
# Sanity check: dropping the 11 listed columns should take X's 27 columns down to 16
train.shape

(9625, 16)

In [6]:
# Add the aggregated features (average delays, passenger/seat counts, fuel
# consumption) supplied by the project helper load_agg_data.
# NOTE(review): `train` is rebound from itself, so re-running this cell passes the
# already-augmented frame back into load_agg_data — confirm that is safe, or
# re-run the drop cell first.
train = load_agg_data(train)

In [7]:
# Check the frame's dimensions again after load_agg_data has run
train.shape

(9625, 52)

In [8]:
# Preview the first rows of the frame returned by load_agg_data
train.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_fl_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,year,month,day,day_of_week,week_of_year,crs_dep_hour,crs_arr_hour,month_avg_arr_delay,month_avg_air_time,month_avg_carrier_delay,month_avg_weather_delay,month_avg_nas_delay,month_avg_security_delay,month_avg_late_aircraft_delay,day_of_week_avg_arr_delay,day_of_week_avg_air_time,day_of_week_avg_carrier_delay,day_of_week_avg_weather_delay,day_of_week_avg_nas_delay,day_of_week_avg_security_delay,day_of_week_avg_late_aircraft_delay,arr_hour_avg_arr_delay,arr_hour_avg_air_time,arr_hour_avg_carrier_delay,arr_hour_avg_weather_delay,arr_hour_avg_nas_delay,arr_hour_avg_security_delay,arr_hour_avg_late_aircraft_delay,origin_total_flights,dest_total_flights,month_flight_seats,month_flight_passengers,month_flight_avg_seats,month_flight_avg_passengers,carrier_month_seats,carrier_month_passengers,carrier_month_avg_seats,carrier_month_avg_passengers,month_avg_fuel_comsumption,orig_airport_month_flight_seats,orig_airport_month_passengers,dest_airport_month_flight_seats,dest_airport_month_passengers
0,UA,3997,3997,12266,13256,1455,1617,82,316,2018,10,30,2,44,14,16,9.74934,53.23219,36.952381,4.238095,13.892857,0.0,10.690476,8.732231,53.522314,18.540323,12.298387,10.580645,0.0,28.370968,6.329897,53.782032,23.574468,7.212766,8.751773,0.0,23.503546,60613,1764,46933.0,27677.0,150.426277,82.035018,55003329,45704398,169.542505,134.985112,137838100.0,21173178,17176426,405577,288439
1,F9,529,529,10599,11292,1315,1518,183,1083,2018,10,30,2,44,13,15,9.230769,154.653846,9.2,0.0,14.5,0.0,13.6,6.806452,152.129032,8.3,1.4,18.1,0.0,13.2,10.578947,153.894737,12.625,0.0,10.0,0.0,15.625,7183,97485,4728.0,4203.0,176.6,159.452381,9175856,7825867,182.916216,150.583777,18780240.0,1624146,1304863,30526301,26412747
2,AA,2318,2318,11298,12892,1740,1853,193,1235,2018,10,30,2,44,17,18,12.06078,167.094037,21.235808,11.449782,8.200873,0.0,23.078603,2.587422,167.238424,22.679245,4.456604,9.173585,0.0,23.649057,13.709353,168.21295,25.615,5.91,8.015,0.0,22.035,100056,83560,414528.0,376739.0,215.590837,192.751409,74614837,62505008,168.496511,136.733012,181929700.0,33551297,27909683,41526408,34347751
3,AA,2375,2375,11298,14869,1855,2048,173,989,2018,10,30,2,44,18,20,15.659942,138.567723,14.092784,8.505155,7.14433,0.0,34.474227,9.633648,139.055031,25.697368,4.361842,5.651316,0.0,20.118421,22.419118,138.664216,16.912281,6.333333,6.853801,0.0,25.140351,100056,40380,128478.0,107398.0,157.619886,128.523313,74614837,62505008,168.496511,136.733012,181929700.0,33551297,27909683,11730371,9839898
4,UA,746,746,12173,14771,1433,2236,303,2398,2018,10,31,3,44,14,22,0.732704,274.103774,13.916667,0.416667,11.416667,0.0,26.527778,3.350202,269.595142,22.764706,0.0,7.558824,0.0,40.5,3.611321,274.69434,9.09434,0.0,10.377358,0.0,28.132075,18549,59900,195967.0,160718.0,241.446,194.466981,55003329,45704398,169.542505,134.985112,137838100.0,9959578,8131626,28213693,22922751


In [9]:
# Write the target y to ./data/target.csv (same path and arguments as the save
# performed right after loading the data).
# NOTE(review): the previous comment said "Save X and y", but only y is written
# here — confirm whether the feature frame was meant to be saved as well.

y.to_csv('./data/target.csv', index=False)

In [10]:
# Alphabetical listing of every column currently in train
sorted(train.columns)

['arr_hour_avg_air_time',
 'arr_hour_avg_arr_delay',
 'arr_hour_avg_carrier_delay',
 'arr_hour_avg_late_aircraft_delay',
 'arr_hour_avg_nas_delay',
 'arr_hour_avg_security_delay',
 'arr_hour_avg_weather_delay',
 'carrier_month_avg_passengers',
 'carrier_month_avg_seats',
 'carrier_month_passengers',
 'carrier_month_seats',
 'crs_arr_hour',
 'crs_arr_time',
 'crs_dep_hour',
 'crs_dep_time',
 'crs_elapsed_time',
 'day',
 'day_of_week',
 'day_of_week_avg_air_time',
 'day_of_week_avg_arr_delay',
 'day_of_week_avg_carrier_delay',
 'day_of_week_avg_late_aircraft_delay',
 'day_of_week_avg_nas_delay',
 'day_of_week_avg_security_delay',
 'day_of_week_avg_weather_delay',
 'dest_airport_id',
 'dest_airport_month_flight_seats',
 'dest_airport_month_passengers',
 'dest_total_flights',
 'distance',
 'mkt_carrier_fl_num',
 'mkt_unique_carrier',
 'month',
 'month_avg_air_time',
 'month_avg_arr_delay',
 'month_avg_carrier_delay',
 'month_avg_fuel_comsumption',
 'month_avg_late_aircraft_delay',
 'month_

In [11]:
# Run the project helper num_null_replacement on train and keep its result as
# train_n_null; the call prints a "Missing data" report of null counts per column.
# NOTE(review): presumably it fills missing numeric values — confirm the
# replacement strategy in src.modules.feature_engineering.
train_n_null = num_null_replacement(train)

Missing data:
-------------
                                     Total   Percent
month_flight_avg_passengers           1915  0.195249
month_flight_seats                    1915  0.195249
month_flight_passengers               1915  0.195249
month_flight_avg_seats                1915  0.195249
arr_hour_avg_weather_delay              29  0.002957
arr_hour_avg_security_delay             29  0.002957
arr_hour_avg_late_aircraft_delay        29  0.002957
arr_hour_avg_nas_delay                  29  0.002957
arr_hour_avg_carrier_delay              29  0.002957
month_avg_late_aircraft_delay           22  0.002243
month_avg_security_delay                22  0.002243
month_avg_nas_delay                     22  0.002243
month_avg_weather_delay                 22  0.002243
month_avg_carrier_delay                 22  0.002243
day_of_week_avg_nas_delay                5  0.000510
day_of_week_avg_weather_delay            5  0.000510
day_of_week_avg_security_delay           5  0.000510
day_of_week_avg_la

In [12]:
# List the columns whose dtype is still `object`
train_n_null.dtypes.loc[lambda kinds: kinds == 'object']

mkt_unique_carrier    object
dtype: object

In [13]:
# Pass train_n_null through the project helper scale_encoder (imported in the
# first cell from src.modules.feature_engineering) and keep the result.
# NOTE(review): presumably this scales the numeric columns and encodes the
# remaining object column — confirm against the helper.
# NOTE(review): the recorded output for this cell is a NameError for
# scale_encoder even though it is imported above; restart the kernel and run
# all cells to confirm the notebook executes top to bottom.
train_cleaned = scale_encoder(train_n_null)

NameError: name 'scale_encoder' is not defined

In [None]:
# Write the processed frame to ./data/train_cleaned.csv.
# NOTE(review): unlike the target save, index=False is not passed, so the row
# index is written as the first CSV column — confirm downstream readers expect it.
train_cleaned.to_csv('./data/train_cleaned.csv')