In [4]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling
import pandas as pd


# Training data is collected from all samples in the first and last week of January.
- ### First week of January provides variance from historic examples from a similar time of year.
- ### Last week of January provides proximity to our testing target.

In [5]:
# Load the training flights through the project's preprocessing helper, then
# show (rows, columns) as a quick sanity check on what was loaded.
train_flights = preprocessing.get_train_flights()
train_flights.shape


(285359, 11)

In [6]:
# Distinct values of the fl_date column in the training sample.
# NOTE(review): the recorded output lists 2019-12-25..2019-12-31 and
# 2019-01-01..2019-01-07, while the heading above describes the first and
# last week of January -- confirm which window is intended.
flight_dates = train_flights['fl_date']
flight_dates.unique()

array(['2019-12-25', '2019-12-26', '2019-12-27', '2019-12-28',
       '2019-12-29', '2019-12-30', '2019-12-31', '2019-01-01',
       '2019-01-02', '2019-01-03', '2019-01-04', '2019-01-05',
       '2019-01-06', '2019-01-07'], dtype=object)

# Features:
- ### arrival times and departure times are converted into sine and cosine features to represent day cycles
- ### date has been converted to day of week
- ### flight number, carrier, tail number, destination, and origin have been replaced with sets of averages calculated from the entirety of the available data (minus outliers)
- ### weather events have been collected for destination and origin airports, and given a value according to maximum severity of that weather on that day

In [7]:
# Run the project's feature-engineering step and print the frame's shape
# before and after, so its effect on rows and columns is visible.
# The former mid-cell `train_flights.head(1)` was removed: only the last
# expression of a cell is displayed, so it was evaluated and thrown away.
# NOTE(review): this cell rebinds train_flights to its own transformed
# value, so re-running it without re-running the load cell feeds the
# already-engineered frame back into build_all_features.
print(f'before: {train_flights.shape}')
train_flights = preparation.build_all_features(train_flights)
print(f'after: {train_flights.shape}')

before: (285359, 11)
after: (285358, 47)


In [8]:
# Separate the regression target (arr_delay) from the feature columns.
# `columns=` replaces the positional axis argument (`drop('arr_delay', 1)`),
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 on.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# Partition into training and validation sets with the project helper.
# NOTE(review): the recorded output shows roughly a 70/30 split; the ratio
# and any random seed live inside get_train_test_split -- confirm there.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)

# Report shapes in the original order: X_train, y_train, X_val, y_val.
for part in (X_train, y_train, X_val, y_val):
    print(part.shape)


(199750, 46)
(199750,)
(85608, 46)
(85608,)


In [9]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Pass both partitions and a StandardScaler instance to the project's
# standardize_data helper, then rebind the names to what it returns.
# NOTE(review): the recorded output shows the column count going from 46 to
# 42 here, so the helper appears to drop columns as well -- confirm in
# custom_scripts.preparation.
scaler = StandardScaler()
X_train, X_val = preparation.standardize_data(data_arr=[X_train, X_val], scaler=scaler)

# Print the shape of each returned partition, training first.
for split in (X_train, X_val):
    print(split.shape)

# Show one row of the training frame as the cell's displayed result.
X_train.head(1)

(199750, 42)
(85608, 42)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_taxi_out,tail_num_avg_arr_delay,tail_num_avg_dep_delay,...,dest_rain,dest_snow,dest_storm,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
22961,-0.909384,-0.415957,-0.709034,-0.705174,-1.003725,-1.001028,-0.723383,-1.87006,-2.217024,-1.234676,...,-0.690764,-0.278847,-0.08056,0,0,0,0,1,0,0


In [11]:
# Final features: column labels of the training frame returned by
# standardize_data, i.e. the inputs handed to the model test below.
X_train.columns

Index(['arr_time_sin', 'arr_time_cos', 'dep_time_sin', 'dep_time_cos',
       'fl_num_avg_arr_delay', 'fl_num_avg_dep_delay',
       'fl_num_avg_late_aircraft_delay', 'fl_num_avg_taxi_out',
       'tail_num_avg_arr_delay', 'tail_num_avg_dep_delay',
       'tail_num_avg_taxi_out', 'tail_num_avg_late_aircraft_delay',
       'carrier_avg_arr_delay', 'carrier_avg_dep_delay',
       'carrier_avg_carrier_delay', 'dest_avg_arr_delay', 'dest_avg_dep_delay',
       'dest_avg_taxi_in', 'origin_avg_arr_delay', 'origin_avg_dep_delay',
       'origin_avg_taxi_out', 'distance', 'crs_elapsed_time', 'origin_cold',
       'origin_fog', 'origin_hail', 'origin_rain', 'origin_snow',
       'origin_storm', 'dest_cold', 'dest_fog', 'dest_hail', 'dest_rain',
       'dest_snow', 'dest_storm', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6'],
      dtype='object')

# Baseline Linear Regression test

In [12]:
from sklearn.linear_model import LinearRegression

# Baseline run: a LinearRegression with default settings and an empty
# free-text note attached to the run.
notes = ""
model = LinearRegression()

# Hand the train/validation partitions, the estimator, and the note to the
# project's test runner.
# NOTE(review): the recorded output shows a metrics dict printed and the
# estimator echoed as the cell result; fitting and scoring presumably happen
# inside modeling.run_test -- verify there.
modeling.run_test(X_train, X_val, y_train, y_val, model, notes)

{'r2_score': 0.0925578340876233, 'mean_squared_error': 700.8800243808258, 'mean_absolute_error': 18.02390904052159, 'explained_variance_score': 0.09256540473595054}


LinearRegression()

In [13]:
# Show the last row of the records table returned by the project's modeling
# module.
# NOTE(review): presumably this is the entry written by the preceding
# run_test call -- verify how modeling persists and orders its records.
run_records = modeling.get_records()
run_records.tail(1)

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score,f1_score,recall_score,precision_score,accuracy_score
108.pickle,LinearRegression,,0.415421,0.092558,700.880024,18.023909,0.092565,,,,
