#### This notebook is where we apply the features that we have engineered to the test set so that the model can be run on it

In [1]:
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import StandardScaler

from functions import statebreaker, get_hour, get_min, delay_binarizer, haul_type

In [2]:
# Read the weather-enriched test flights and show (rows, columns)
test_csv_path = 'data/flights_test_weather.csv'
data = pd.read_csv(test_csv_path)
data.shape

(150623, 32)

In [3]:
data.head(20)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,...,dep_hour,arr_hour,dep_period,arr_period,origin_match_code,dest_match_code,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity
0,0,0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,...,18,19,Evening,Evening,ONT2020-01-01Evening,SFO2020-01-01Evening,Clear,Light,Clear,Light
1,1,1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,...,11,13,Morning,Afternoon,ONT2020-01-01Morning,SFO2020-01-01Afternoon,Clear,Light,Clear,Light
2,2,2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,...,20,21,Evening,Evening,ONT2020-01-01Evening,SJC2020-01-01Evening,Clear,Light,Clear,Light
3,3,3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,...,13,14,Afternoon,Afternoon,ONT2020-01-01Afternoon,SJC2020-01-01Afternoon,Clear,Light,Clear,Light
4,4,4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,...,9,10,Morning,Morning,ONT2020-01-01Morning,SJC2020-01-01Morning,Clear,Light,Clear,Light
5,5,5,2020-01-01,WN,WN,WN,5684,WN,N7856A,5684,...,6,7,Morning,Morning,ONT2020-01-01Morning,SJC2020-01-01Morning,Clear,Light,Clear,Light
6,6,6,2020-01-01,WN,WN,WN,6152,WN,N7735A,6152,...,16,17,Afternoon,Afternoon,ONT2020-01-01Afternoon,SJC2020-01-01Afternoon,Clear,Light,Clear,Light
7,7,7,2020-01-01,WN,WN,WN,1679,WN,N405WN,1679,...,15,16,Afternoon,Afternoon,ONT2020-01-01Afternoon,SMF2020-01-01Afternoon,Clear,Light,Fog,Moderate
8,8,8,2020-01-01,WN,WN,WN,3479,WN,N489WN,3479,...,12,13,Afternoon,Afternoon,ONT2020-01-01Afternoon,SMF2020-01-01Afternoon,Clear,Light,Fog,Moderate
9,9,9,2020-01-01,WN,WN,WN,4069,WN,N7708E,4069,...,7,9,Morning,Morning,ONT2020-01-01Morning,SMF2020-01-01Morning,Clear,Light,Clear,Light


# Make the predictions csv file

In [4]:
# Collect the columns that identify each flight; they form the skeleton of
# the predictions csv file.

pred_cols = ['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num', 'origin', 'dest']
pred_frame = data[pred_cols]
pred_frame

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest
0,2020-01-01,WN,5888,ONT,SFO
1,2020-01-01,WN,6276,ONT,SFO
2,2020-01-01,WN,4598,ONT,SJC
3,2020-01-01,WN,4761,ONT,SJC
4,2020-01-01,WN,5162,ONT,SJC
...,...,...,...,...,...
150618,2020-01-07,DL,4813,DTW,JFK
150619,2020-01-07,DL,4814,GSP,LGA
150620,2020-01-07,DL,4815,ATL,XNA
150621,2020-01-07,DL,4815,XNA,ATL


In [5]:
# Save the identifying columns (row index included) for the predictions file
pre_predictions_path = 'pre_predictions.csv'
pred_frame.to_csv(pre_predictions_path)

# Process the data

In [6]:
# Split fl_date into its year, month and day components.
# The dates are parsed once and reused for every derived column.

flight_dates = pd.DatetimeIndex(data['fl_date'])
for date_part in ('year', 'month', 'day'):
    data[date_part] = getattr(flight_dates, date_part)

# Name the day of the week (dayofweek counts Monday as 0 through Sunday as 6)
day_names = dict(enumerate(['Monday',
                            'Tuesday',
                            'Wednesday',
                            'Thursday',
                            'Friday',
                            'Saturday',
                            'Sunday']))
data['week_day'] = flight_dates.dayofweek
data['week_day'] = data['week_day'].replace(day_names)



In [7]:
# Split the scheduled departure and arrival times (crs_dep_time, crs_arr_time)
# into separate hour and minutes columns using the get_hour / get_min helpers.

for time_prefix in ('crs_dep', 'crs_arr'):
    scheduled_time = data[f'{time_prefix}_time']
    data[f'{time_prefix}_hour'] = scheduled_time.apply(get_hour)
    data[f'{time_prefix}_minutes'] = scheduled_time.apply(get_min)

In [8]:
# Use statebreaker to pull the state out of each city-name column.

state_sources = (('dep_state', 'origin_city_name'),
                 ('dest_state', 'dest_city_name'))
for state_col, city_col in state_sources:
    data[state_col] = data[city_col].apply(statebreaker)

In [9]:
# Classify each flight's haul type from its scheduled elapsed time
scheduled_elapsed = data['crs_elapsed_time']
data['haul_type'] = scheduled_elapsed.apply(haul_type)


In [10]:
# Build a route key by joining origin and destination with an underscore, e.g. 'ONT_SFO'

origin_codes = data['origin'].astype(str)
dest_codes = data['dest'].astype(str)
data['route'] = origin_codes + '_' + dest_codes

In [11]:
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,...,day,week_day,crs_dep_hour,crs_dep_minutes,crs_arr_hour,crs_arr_minutes,dep_state,dest_state,haul_type,route
0,0,0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,...,1,Wednesday,18,10,19,45,CA,CA,Short,ONT_SFO
1,1,1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,...,1,Wednesday,11,50,13,20,CA,CA,Short,ONT_SFO
2,2,2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,...,1,Wednesday,20,20,21,30,CA,CA,Short,ONT_SJC
3,3,3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,...,1,Wednesday,13,40,14,55,CA,CA,Short,ONT_SJC
4,4,4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,...,1,Wednesday,9,15,10,35,CA,CA,Short,ONT_SJC


### Load the feature engineering dictionaries

In [12]:
def _load_feature_dict(name):
    """Return the lookup dictionary stored in ``data/dicts/<name>.json``.

    Parameters
    ----------
    name : str
        File stem of the JSON dictionary (without the ``.json`` extension).

    Returns
    -------
    dict
        The parsed JSON object.

    Raises
    ------
    FileNotFoundError
        If the dictionary file does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    with open(f'data/dicts/{name}.json') as dict_file:
        return json.load(dict_file)


# Carrier on-time percent
carrier_pct = _load_feature_dict('carrier_pct')

# Plane(tail number) on-time percent
plane_pct = _load_feature_dict('plane_pct')

# On-time percent for planes leaving origin airport
origin_pct = _load_feature_dict('origin_pct')

# On-time percent for flight number
fl_num_pct = _load_feature_dict('fl_num_pct')

# On-time percent for route
route_pct = _load_feature_dict('route_pct')

# Fuel consumption per passenger per mile by carrier
fuel_dict = _load_feature_dict('fuel_dict')

# Mean taxi-out time for each airport
mean_taxi_out = _load_feature_dict('mean_taxi_out')

# Mean taxi-in time for each airport
mean_taxi_in = _load_feature_dict('mean_taxi_in')

# Mean arrival delay of each carrier
carrier_mean_delay = _load_feature_dict('carrier_mean_delay')

# Mean departure delay for each origin
origin_mean_dep_delay = _load_feature_dict('origin_mean_dep_delay')

# Mean arrival delay for each destination
dest_mean_arr_delay = _load_feature_dict('dest_mean_arr_delay')



In [13]:
# Flight numbers are cast to str so they can be looked up in fl_num_pct,
# which was parsed from JSON and therefore has string keys
fl_num_col = 'mkt_carrier_fl_num'
data[fl_num_col] = data[fl_num_col].astype(str)

In [14]:
# Fold the 'Heavy', 'Other' and 'UNK' severities into 'Severe', based on their
# high propensity for causing delays. Origin and destination share one mapping.
severity_map = {'Heavy': 'Severe',
                'Other': 'Severe',
                'UNK': 'Severe'}
for severity_col in ('origin_weather_severity', 'dest_weather_severity'):
    data[severity_col] = data[severity_col].replace(severity_map)

In [15]:
# Reduce the weather conditions to four groups: Adverse, Precipitation, Storm
# or Clear. Origin and destination share one mapping.
condition_map = {'Rain': 'Precipitation',
                 'Snow': 'Precipitation',
                 'Hail': 'Adverse',
                 'Storm': 'Storm',
                 'Clear': 'Clear',
                 'Fog': 'Adverse',
                 'Cold': 'Adverse',
                 'Precipitation': 'Precipitation'}

for condition_col in ('origin_weather_condition', 'dest_weather_condition'):
    data[condition_col] = data[condition_col].replace(condition_map)

### Mapping the engineered features to their proper values in the test data 

In [16]:
# Map the engineered features to the data.
# Each lookup dictionary is keyed by the values of the column it is mapped from;
# keys missing from a dictionary come back as NaN.

data['carrier_on_time_pct'] = data['mkt_unique_carrier'].map(carrier_pct) # How often is the carrier on time
data['carrier_mean_delay'] = data['mkt_unique_carrier'].map(carrier_mean_delay) # How far off schedule is this carrier
data['plane_on_time_pct'] = data['tail_num'].map(plane_pct) # How often is this plane on time
data['origin_on_time_pct'] = data['origin'].map(origin_pct) # How often are planes that leave this airport on time
data['origin_mean_dep_delay'] = data['origin'].map(origin_mean_dep_delay) # Origin mean departure delay
data['dest_mean_arr_delay'] = data['dest'].map(dest_mean_arr_delay) # Destination mean arrival delay
data['fl_num_on_time_pct'] = data['mkt_carrier_fl_num'].map(fl_num_pct) # How often is the flight on time
data['route_on_time_pct'] = data['route'].map(route_pct) # How often is the route on time
data['route_on_time_pct'] = data['route_on_time_pct'].replace({np.inf: 1}) # Cap infinite values at 1
data['carrier_fuel_consumption'] = data['mkt_unique_carrier'].map(fuel_dict) #Carrier fuel consumption
data['mean_taxi_out'] = data['origin'].map(mean_taxi_out) # Mean taxi-out time at the origin airport
# NOTE(review): taxi-in is looked up by the ORIGIN airport here. A taxi-in
# feature would normally be keyed on data['dest']; confirm against the training
# notebook before changing it, since the test features must be built the same
# way as the features the model was trained on.
data['mean_taxi_in'] = data['origin'].map(mean_taxi_in)

In [17]:
data.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,...,carrier_mean_delay,plane_on_time_pct,origin_on_time_pct,origin_mean_dep_delay,dest_mean_arr_delay,fl_num_on_time_pct,route_on_time_pct,carrier_fuel_consumption,mean_taxi_out,mean_taxi_in
count,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,...,150623.0,150404.0,150623.0,150623.0,150623.0,150618.0,150604.0,150623.0,150623.0,150623.0
mean,75311.0,75311.0,2825.490582,12700.684344,12700.653333,1334.061153,1485.52577,143.286185,1.0,789.441739,...,-8.297296,0.762416,0.764244,-1.782487,-8.300437,0.764412,0.7629,1.42955e-07,17.409145,7.756661
std,43481.259135,43481.259135,1862.574715,1523.101482,1522.773801,493.774648,524.008413,73.111577,0.0,593.936415,...,2.141833,0.062541,0.041541,1.376098,1.64695,0.051064,0.074748,1.985567e-07,3.339719,1.400305
min,0.0,0.0,1.0,10135.0,10135.0,2.0,1.0,22.0,1.0,31.0,...,-11.928509,0.0,0.4375,-11.215686,-25.295918,0.0,0.0,6.758333e-08,3.094595,3.809524
25%,37655.5,37655.5,1196.0,11292.0,11292.0,919.0,1102.0,90.0,1.0,354.0,...,-9.15794,0.723881,0.727217,-2.383998,-9.09823,0.734513,0.716981,6.758333e-08,15.062034,6.636431
50%,75311.0,75311.0,2437.0,12889.0,12889.0,1325.0,1515.0,125.0,1.0,629.0,...,-6.95693,0.76087,0.764816,-1.620071,-8.286288,0.76699,0.765534,7.51439e-08,17.120346,7.535373
75%,112966.5,112966.5,4490.0,14057.0,14057.0,1740.0,1920.0,174.0,1.0,1028.0,...,-6.95693,0.801242,0.796961,-1.065628,-7.237896,0.797688,0.811282,8.775066e-08,19.8,8.307292
max,150622.0,150622.0,6799.0,16869.0,16869.0,2359.0,2400.0,700.0,1.0,5095.0,...,-2.143562,1.0,1.0,4.4375,0.903846,1.0,1.0,1.503534e-06,28.482143,18.392157


Based on the descriptive table above, we can see that there are some values missing for plane_on_time_pct, fl_num_on_time_pct and route_on_time_pct.<br>
These gaps come from planes, flight numbers and routes in the test set that have no entry in the lookup dictionaries (presumably because they never appeared in the data the dictionaries were built from).  Let's fill the values with the means of their respective columns.

In [18]:
# Replace lookup misses (NaN) with the mean of the values that did map
for pct_col in ('plane_on_time_pct', 'fl_num_on_time_pct', 'route_on_time_pct'):
    column_mean = data[pct_col].mean()
    data[pct_col] = data[pct_col].fillna(column_mean)

In [19]:
data.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,...,carrier_mean_delay,plane_on_time_pct,origin_on_time_pct,origin_mean_dep_delay,dest_mean_arr_delay,fl_num_on_time_pct,route_on_time_pct,carrier_fuel_consumption,mean_taxi_out,mean_taxi_in
count,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,...,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0,150623.0
mean,75311.0,75311.0,2825.490582,12700.684344,12700.653333,1334.061153,1485.52577,143.286185,1.0,789.441739,...,-8.297296,0.762416,0.764244,-1.782487,-8.300437,0.764412,0.7629,1.42955e-07,17.409145,7.756661
std,43481.259135,43481.259135,1862.574715,1523.101482,1522.773801,493.774648,524.008413,73.111577,0.0,593.936415,...,2.141833,0.062496,0.041541,1.376098,1.64695,0.051063,0.074743,1.985567e-07,3.339719,1.400305
min,0.0,0.0,1.0,10135.0,10135.0,2.0,1.0,22.0,1.0,31.0,...,-11.928509,0.0,0.4375,-11.215686,-25.295918,0.0,0.0,6.758333e-08,3.094595,3.809524
25%,37655.5,37655.5,1196.0,11292.0,11292.0,919.0,1102.0,90.0,1.0,354.0,...,-9.15794,0.723881,0.727217,-2.383998,-9.09823,0.734513,0.716981,6.758333e-08,15.062034,6.636431
50%,75311.0,75311.0,2437.0,12889.0,12889.0,1325.0,1515.0,125.0,1.0,629.0,...,-6.95693,0.760976,0.764816,-1.620071,-8.286288,0.76699,0.765534,7.51439e-08,17.120346,7.535373
75%,112966.5,112966.5,4490.0,14057.0,14057.0,1740.0,1920.0,174.0,1.0,1028.0,...,-6.95693,0.80117,0.796961,-1.065628,-7.237896,0.797688,0.811282,8.775066e-08,19.8,8.307292
max,150622.0,150622.0,6799.0,16869.0,16869.0,2359.0,2400.0,700.0,1.0,5095.0,...,-2.143562,1.0,1.0,4.4375,0.903846,1.0,1.0,1.503534e-06,28.482143,18.392157


In [20]:
data.shape

(150623, 55)

### Columns used in final model training

distance                    <br>
origin_weather_condition     <br>
origin_weather_severity      <br>
dest_weather_condition       <br>
dest_weather_severity        <br>
week_day                     <br>
crs_dep_hour                  <br>
crs_arr_hour                  <br>
carrier_on_time_pct         <br>
plane_on_time_pct           <br>
origin_on_time_pct          <br>
fl_num_on_time_pct          <br>
route_on_time_pct           <br>
carrier_fuel_consumption    <br>
mean_taxi_out               <br>
mean_taxi_in     <br>

In [21]:
# Columns carried forward into dummy encoding and scaling. The order is kept
# as-is because later cells derive the object-column list from it.
model_input_cols = [
    'distance',
    'origin_weather_condition',
    'origin_weather_severity',
    'dest_weather_condition',
    'dest_weather_severity',
    'week_day',
    'crs_dep_hour',
    'crs_arr_hour',
    'carrier_on_time_pct',
    'plane_on_time_pct',
    'origin_mean_dep_delay',
    'dest_mean_arr_delay',
    'carrier_mean_delay',
    'origin_on_time_pct',
    'fl_num_on_time_pct',
    'route_on_time_pct',
    'carrier_fuel_consumption',
    'mean_taxi_out',
    'mean_taxi_in',
]
test_data = data[model_input_cols]

In [22]:
test_data

Unnamed: 0,distance,origin_weather_condition,origin_weather_severity,dest_weather_condition,dest_weather_severity,week_day,crs_dep_hour,crs_arr_hour,carrier_on_time_pct,plane_on_time_pct,origin_mean_dep_delay,dest_mean_arr_delay,carrier_mean_delay,origin_on_time_pct,fl_num_on_time_pct,route_on_time_pct,carrier_fuel_consumption,mean_taxi_out,mean_taxi_in
0,363,Clear,Light,Clear,Light,Wednesday,18,19,0.746799,0.765306,-0.807644,-9.703227,-6.382972,0.736529,0.655629,0.806452,7.156227e-08,11.848058,7.106203
1,363,Clear,Light,Clear,Light,Wednesday,11,13,0.746799,0.709343,-0.807644,-9.703227,-6.382972,0.736529,0.744444,0.806452,7.156227e-08,11.848058,7.106203
2,333,Clear,Light,Clear,Light,Wednesday,20,21,0.746799,0.762931,-0.807644,-7.476616,-6.382972,0.736529,0.773913,0.728889,7.156227e-08,11.848058,7.106203
3,333,Clear,Light,Clear,Light,Wednesday,13,14,0.746799,0.701923,-0.807644,-7.476616,-6.382972,0.736529,0.796992,0.728889,7.156227e-08,11.848058,7.106203
4,333,Clear,Light,Clear,Light,Wednesday,9,10,0.746799,0.794643,-0.807644,-7.476616,-6.382972,0.736529,0.787500,0.728889,7.156227e-08,11.848058,7.106203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150618,509,Clear,Light,Precipitation,Light,Tuesday,17,19,0.830845,0.870588,-1.938073,-9.028588,-11.928509,0.845013,0.765823,0.839196,7.514390e-08,20.853673,7.171503
150619,610,Precipitation,Light,Clear,Light,Tuesday,6,7,0.830845,0.869281,-2.856481,-9.344592,-11.928509,0.792929,0.761364,0.800000,7.514390e-08,15.980219,10.513047
150620,589,Clear,Light,Clear,Light,Tuesday,17,18,0.830845,0.886010,-0.435523,-8.395664,-11.928509,0.813196,0.781421,0.872000,7.514390e-08,15.823849,6.135301
150621,589,Clear,Light,Clear,Light,Tuesday,18,21,0.830845,0.886010,-4.019344,-9.616498,-11.928509,0.733477,0.781421,0.778210,7.514390e-08,16.597528,12.096722


# Making dummy variables and scaling numeric columns

In [23]:
def dummify(df, column, prefix='type_is'):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    column : str
        Name of the column to turn into dummy variables.
    prefix : str, default 'type_is'
        Prefix for the generated columns, which are named ``<prefix>_<value>``.
        The default keeps the historical naming; pass a per-feature prefix to
        avoid name collisions when several encoded columns share values.

    Returns
    -------
    pandas.DataFrame
        One indicator column per distinct value. The result carries a fresh
        0..n-1 index (the index of ``df`` is not preserved), in the row order
        of ``df``.
    """
    # Going through a plain list discards df's index, so the dummies are
    # always labelled 0..n-1.
    values = list(df[column])
    single_col_df = pd.DataFrame(values, columns=[column])
    return pd.get_dummies(single_col_df, columns=[column], prefix=[prefix])

In [24]:
# Split the selected features into numeric columns (to be scaled) and object
# columns (to be dummy-encoded). The order of numeric_cols is the column order
# handed to the scaler, so it must not be rearranged.

numeric_cols = [
    'distance', 'crs_dep_hour', 'crs_arr_hour',
    'carrier_on_time_pct', 'plane_on_time_pct', 'fl_num_on_time_pct',
    'origin_on_time_pct', 'route_on_time_pct',
    'mean_taxi_out', 'mean_taxi_in', 'carrier_fuel_consumption',
    'carrier_mean_delay', 'origin_mean_dep_delay', 'dest_mean_arr_delay',
]
# Whatever is left after removing the numeric columns is treated as an object column
obj_cols = test_data.drop(numeric_cols, axis=1).columns.to_list()

In [25]:
obj_cols

['origin_weather_condition',
 'origin_weather_severity',
 'dest_weather_condition',
 'dest_weather_severity',
 'week_day']

In [26]:
# Dummy-encode each object column and attach the indicators to test_data,
# matching rows by index. dummify uses the same 'type_is' prefix for every
# column, so values shared by two features (e.g. origin and destination
# weather) produce identical names; merge then disambiguates them with its
# default '_x' / '_y' suffixes, and those suffixed names are selected later on.

for obj_col in obj_cols:
    dummies = dummify(data, obj_col)
    test_data = pd.merge(test_data, dummies, left_index=True, right_index=True)

In [27]:
# The original object columns are now represented by their dummies; remove them

test_data = test_data.drop(obj_cols, axis='columns')

In [28]:
# Standardise the numeric columns; the result is a plain array whose columns
# follow the order of numeric_cols.
# NOTE(review): the scaler is fitted on the test set itself. If the model was
# trained on features scaled with a scaler fitted on the training data, that
# fitted scaler should be loaded and only .transform() applied here - confirm.
scaler = StandardScaler()
scaler.fit(test_data[numeric_cols])
numerics_scaled = scaler.transform(test_data[numeric_cols])

In [29]:
# Rename columns from scaling back to their original names, merge with data.
#
# The scaler returns an unlabeled array whose columns follow numeric_cols, so
# the position -> name mapping is derived from numeric_cols itself. A
# hand-numbered mapping can drift from that order: positions 9 and 10 hold
# mean_taxi_in and carrier_fuel_consumption respectively and must be labelled
# in that order. 'mean_taxi_out' is exported as 'origin_taxi_out', the name the
# final column selection expects.
# NOTE(review): confirm the training features use the same labelling before
# scoring with an already-trained model.

scaled_names = ['origin_taxi_out' if name == 'mean_taxi_out' else name
                for name in numeric_cols]
numeric_dict = dict(enumerate(scaled_names))

numerics_scaled = pd.DataFrame(numerics_scaled).rename(columns = numeric_dict)
# Swap the raw numeric columns for their scaled versions, matching rows by index
test_data = test_data.drop(columns = numeric_cols)
test_data = test_data.merge(numerics_scaled, left_index=True, right_index=True)

In [30]:
# Keep only the final feature set, in a fixed column order.
# The '_x' / '_y' dummy names come from the merge suffixes added when two
# encoded columns produced the same dummy name.
# Note: carrier_mean_delay was scaled above but is not selected here.
dummy_features = [
    'type_is_Adverse_x',
    'type_is_Clear_x',
    'type_is_Storm_x',
    'type_is_Moderate_x',
    'type_is_Severe_x',
    'type_is_Adverse_y',
    'type_is_Clear_y',
    'type_is_Storm_y',
    'type_is_Severe_y',
    'type_is_Friday',
    'type_is_Monday',
    'type_is_Saturday',
    'type_is_Sunday',
    'type_is_Thursday',
    'type_is_Tuesday',
    'type_is_Wednesday',
]
scaled_features = [
    'distance',
    'crs_dep_hour',
    'crs_arr_hour',
    'carrier_on_time_pct',
    'plane_on_time_pct',
    'fl_num_on_time_pct',
    'origin_on_time_pct',
    'route_on_time_pct',
    'origin_taxi_out',
    'carrier_fuel_consumption',
    'mean_taxi_in',
    'origin_mean_dep_delay',
    'dest_mean_arr_delay',
]
test_data = test_data[dummy_features + scaled_features]

In [31]:
test_data.dtypes

type_is_Adverse_x             uint8
type_is_Clear_x               uint8
type_is_Storm_x               uint8
type_is_Moderate_x            uint8
type_is_Severe_x              uint8
type_is_Adverse_y             uint8
type_is_Clear_y               uint8
type_is_Storm_y               uint8
type_is_Severe_y              uint8
type_is_Friday                uint8
type_is_Monday                uint8
type_is_Saturday              uint8
type_is_Sunday                uint8
type_is_Thursday              uint8
type_is_Tuesday               uint8
type_is_Wednesday             uint8
distance                    float64
crs_dep_hour                float64
crs_arr_hour                float64
carrier_on_time_pct         float64
plane_on_time_pct           float64
fl_num_on_time_pct          float64
origin_on_time_pct          float64
route_on_time_pct           float64
origin_taxi_out             float64
carrier_fuel_consumption    float64
mean_taxi_in                float64
origin_mean_dep_delay       

In [32]:
# Persist the prepared test features (row index included) for the model to score
prepped_path = 'data/test_data_prepped.csv'
test_data.to_csv(prepped_path)