In [1]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling
# from custom_scripts import weather
import pandas as pd
import numpy as np

In [32]:
def create_average_table(feature, prefix, dates=None):
    """Query per-group averages of the timing and delay columns of ``flights``.

    Rows of the ``flights`` table whose ``fl_date`` is one of ``dates`` are
    grouped by ``feature``; every timing/delay column is aggregated with SQL
    ``AVG`` and aliased as ``<prefix>_avg_<column>``.

    Args:
        feature: Column expression to select and group by
            (e.g. ``'op_carrier_fl_num'``).
        prefix: Text prepended to every generated ``*_avg_*`` column alias.
        dates: Date strings (same ``YYYY-MM-DD`` form as the defaults) that
            ``fl_date`` must match. A single string is treated as one date.
            Defaults to the week 2018-12-25 .. 2018-12-31 that was
            previously hard-coded in the query.

    Returns:
        Whatever ``database.query`` returns for the statement.

    Note:
        ``feature``, ``prefix`` and ``dates`` are interpolated into the SQL
        text without escaping, so only pass trusted, hand-written values.
    """
    if dates is None:
        dates = (
            '2018-12-31', '2018-12-30', '2018-12-29', '2018-12-28',
            '2018-12-27', '2018-12-26', '2018-12-25',
        )
    elif isinstance(dates, str):
        # A bare string would otherwise be iterated character by character.
        dates = (dates,)

    # Array literal compared with `= ANY(...)`, e.g. {2018-12-31, 2018-12-30}
    date_array = "{" + ", ".join(dates) + "}"

    return database.query(f"""
                            SELECT  {feature},
                                    AVG(dep_delay) AS "{prefix}_avg_dep_delay",
                                    AVG(taxi_out) AS "{prefix}_avg_taxi_out",
                                    AVG(wheels_off) AS "{prefix}_avg_wheels_off",
                                    AVG(wheels_on) AS "{prefix}_avg_wheels_on",
                                    AVG(taxi_in) AS "{prefix}_avg_taxi_in",
                                    AVG(arr_delay) AS "{prefix}_avg_arr_delay",
                                    AVG(crs_elapsed_time) AS "{prefix}_avg_crs_elapsed_time",
                                    AVG(actual_elapsed_time) AS "{prefix}_avg_actual_elapsed_time",
                                    AVG(air_time) AS "{prefix}_avg_air_time",
                                    AVG(carrier_delay) AS "{prefix}_avg_carrier_delay",
                                    AVG(weather_delay) AS "{prefix}_avg_weather_delay",
                                    AVG(nas_delay) AS "{prefix}_avg_nas_delay",
                                    AVG(security_delay) AS "{prefix}_avg_security_delay",
                                    AVG(late_aircraft_delay) AS "{prefix}_avg_late_aircraft_delay",
                                    AVG(total_add_gtime) AS "{prefix}_avg_total_add_gtime",
                                    AVG(longest_add_gtime) AS "{prefix}_avg_longest_add_gtime"
                                FROM flights
                                WHERE fl_date = ANY('{date_array}')
                                GROUP BY {feature};
                            """)
# Compute the per-flight-number averages and save them as a CSV file.
avg_table = create_average_table(feature='op_carrier_fl_num', prefix='fl_num')
avg_table.to_csv(
    '../data/preprocessing/avg_fl_num_features_2018-12-25_to_2018-12-31.csv',
    index=False,
)


In [19]:
# Default column list selected by the flight queries below; `arr_delay` is
# the column the notebook later splits off as the prediction target.
PRIMARY_TEST_FEATURES = """ fl_date, 
                        op_unique_carrier, 
                        op_carrier_fl_num, 
                        origin, 
                        dest, 
                        crs_dep_time,
                        crs_arr_time, 
                        crs_elapsed_time, 
                        distance, 
                        arr_delay
                    """


def _select_flights_on_dates(features, dates):
    """Run ``SELECT <features> FROM flights`` restricted to the given ``fl_date`` values."""
    # Array literal compared with `= ANY(...)`, e.g. {2019-01-01, 2019-01-02}
    date_array = "{" + ", ".join(dates) + "}"
    return database.query(f""" 
                            SELECT {features}
                                FROM flights
                                WHERE fl_date = ANY('{date_array}')
                                ;
                            """)


def get_jan_2019_validation_flights(features: str = PRIMARY_TEST_FEATURES) -> pd.DataFrame:
    """Return the flights dated 2019-01-01 through 2019-01-07 (validation week).

    Args:
        features: Comma-separated column list placed verbatim in the SELECT
            clause; it is not escaped, so pass trusted text only.

    Returns:
        The result of ``database.query`` for the generated statement.
    """
    validation_dates = (
        '2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
        '2019-01-05', '2019-01-06', '2019-01-07',
    )
    return _select_flights_on_dates(features, validation_dates)


def get_dec_2018_training_flights(features: str = PRIMARY_TEST_FEATURES) -> pd.DataFrame:
    """Return the flights dated 2018-12-25 through 2018-12-31 (training week).

    Args:
        features: Comma-separated column list placed verbatim in the SELECT
            clause; it is not escaped, so pass trusted text only.

    Returns:
        The result of ``database.query`` for the generated statement.
    """
    training_dates = (
        '2018-12-31', '2018-12-30', '2018-12-29', '2018-12-28',
        '2018-12-27', '2018-12-26', '2018-12-25',
    )
    return _select_flights_on_dates(features, training_dates)

In [20]:
# Load the cached validation flights; rebuild the cache from the database
# only when the file is missing, so the cell also works on a fresh checkout.
validation_cache_path = '../data/local/jan_2019_validation_flights'
try:
    validation_data = pd.read_csv(validation_cache_path)
except FileNotFoundError:
    validation_fresh = get_jan_2019_validation_flights()
    # Rows without a recorded arrival delay cannot be scored, so drop them.
    validation_fresh = validation_fresh[validation_fresh['arr_delay'].notnull()]
    validation_fresh.to_csv(validation_cache_path, index=False)
    # Re-read so a rebuilt frame has the same dtypes/index as a cached one.
    validation_data = pd.read_csv(validation_cache_path)
print(validation_data.shape)
validation_data.head(1)

(145053, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2019-01-01,OO,3368,SJC,AUS,1805,2337,212.0,1476.0,9.0


In [21]:
# Load the cached training flights; rebuild the cache from the database
# only when the file is missing, so the cell also works on a fresh checkout.
training_cache_path = '../data/local/dec_2018_training_flights'
try:
    training_data = pd.read_csv(training_cache_path)
except FileNotFoundError:
    training_fresh = get_dec_2018_training_flights()
    # Rows without a recorded arrival delay have no target value.
    training_fresh = training_fresh[training_fresh['arr_delay'].notnull()]
    # Keep only flight numbers that also occur in the validation week.
    training_fresh = training_fresh[
        training_fresh['op_carrier_fl_num'].isin(validation_data['op_carrier_fl_num'].values)
    ]
    training_fresh.to_csv(training_cache_path, index=False)
    # Re-read so a rebuilt frame has the same dtypes/index as a cached one.
    training_data = pd.read_csv(training_cache_path)
print(training_data.shape)
training_data.head(1)

(143158, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2018-12-25,WN,566,DEN,LGA,1745,2320,215.0,1620.0,-17.0


In [22]:
def build_average_features(
    flight_data: pd.DataFrame,
    averages_path: str = '../data/preprocessing/avg_fl_num_features_2018-12-25_to_2018-12-31.csv',
) -> pd.DataFrame:
    """Return ``flight_data`` with the per-flight-number average columns appended.

    Args:
        flight_data: Flights to enrich; must contain ``op_carrier_fl_num``.
            The input frame is not modified.
        averages_path: CSV file holding one row of averages per
            ``op_carrier_fl_num``. Defaults to the file this notebook writes.

    Returns:
        A new DataFrame with one row per input row, in the input order.
        Flights whose number has no entry in the averages file keep their
        row and get NaN in the added columns.
    """
    average_delays = pd.read_csv(averages_path)
    # A left join keeps every flight; the default inner join silently dropped
    # unmatched rows, which breaks row alignment with a separately held target.
    return flight_data.merge(average_delays, on='op_carrier_fl_num', how='left')

In [23]:
# Count the missing values in each column of the saved averages table.
avg_features_csv = '../data/preprocessing/avg_fl_num_features_2018-12-25_to_2018-12-31.csv'
avgs = pd.read_csv(avg_features_csv)
avgs.isnull().sum()

op_carrier_fl_num                    0
fl_num_avg_dep_delay                 1
fl_num_avg_taxi_out                  1
fl_num_avg_wheels_off                1
fl_num_avg_wheels_on                 1
fl_num_avg_taxi_in                   1
fl_num_avg_arr_delay                 1
fl_num_avg_crs_elapsed_time          0
fl_num_avg_actual_elapsed_time       1
fl_num_avg_air_time                  1
fl_num_avg_carrier_delay           513
fl_num_avg_weather_delay           513
fl_num_avg_nas_delay               513
fl_num_avg_security_delay          513
fl_num_avg_late_aircraft_delay     513
fl_num_avg_total_add_gtime        6056
fl_num_avg_longest_add_gtime      6056
dtype: int64

In [24]:
# Split features from the target. `columns=` is used because passing the axis
# positionally (`drop('arr_delay', 1)`) is rejected by pandas >= 2.0.
X_train = training_data.drop(columns='arr_delay')
X_val = validation_data.drop(columns='arr_delay')
y_train = training_data['arr_delay']
y_val = validation_data['arr_delay']
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(143158, 9)
(143158,)
(145053, 9)
(145053,)


In [25]:
# Shapes before feature engineering.
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

# Feature builders, applied in this order to the training frame first and
# then to the validation frame.
feature_builders = (
    preparation.build_historic_average_features,
    preparation.build_time_features,
    preparation.build_day_features,
    preparation.build_weather_features,
)

for build_step in feature_builders:
    X_train = build_step(X_train)

for build_step in feature_builders:
    X_val = build_step(X_val)

# Shapes after feature engineering (the recorded run shows 9 -> 39 columns
# with the row counts unchanged).
print('\n')
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

X_train.head(1)

(143158, 9)
(143158,)
(145053, 9)
(145053,)


(143158, 39)
(143158,)
(145053, 39)
(145053,)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_dep_delay,...,day_of_year,day_of_week,Severity,Rain,Fog,Cold,Storm,Precipitation,Snow,Hail
0,2018-12-25,WN,566,DEN,LGA,1745,2320,215.0,1620.0,13.5238,...,359,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# How many columns share each missing-value count.
nulls_per_column = X_train.isna().sum()
nulls_per_column.value_counts()

0       37
1941     2
dtype: int64

In [27]:
from sklearn.preprocessing import StandardScaler

# Replace remaining NaNs with 0, then pass both frames through the project's
# standardization helper with a fresh StandardScaler.
zero_filled_frames = [X_train.fillna(0), X_val.fillna(0)]
X_train, X_val = preparation.standardize_data(
    data_arr=zero_filled_frames,
    scaler=StandardScaler(),
)

X_train.head(1)

Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,Cold,Fog,Hail,Precipitation,Rain,Snow,...,day_of_year_363,day_of_year_364,day_of_year_365,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,-0.169466,0.985536,-0.998065,-0.06218,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [28]:
# The day-of-year indicator columns differ between the two weeks (359-365 in
# training, 1-7 in validation), so drop them from each frame.
# `columns=` replaces the positional axis argument, which pandas >= 2.0
# rejects, and reassignment replaces `inplace=True` mutation.
train_only_day_columns = [f'day_of_year_{day}' for day in range(359, 366)]
val_only_day_columns = [f'day_of_year_{day}' for day in range(1, 8)]
X_train = X_train.drop(columns=train_only_day_columns)
X_val = X_val.drop(columns=val_only_day_columns)

In [11]:
from sklearn.linear_model import LinearRegression

In [29]:
# Fit and score a plain linear regression through the project's test runner.
# In the recorded run this setup scored a negative r2 (about -0.023) on the
# validation week.
baseline_model = LinearRegression()
run_notes = 'week before. historic average'
modeling.run_test(X_train, X_val, y_train, y_val, baseline_model, run_notes)

{'r2_score': -0.022938833323258745, 'mean_squared_error': 2353.8753094862, 'mean_absolute_error': 26.238905225850512, 'explained_variance_score': -0.004109291292295092}


LinearRegression()

In [30]:
# Show the table of recorded model runs for comparison.
experiment_records = modeling.get_records()
experiment_records

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score
1.pickle,LinearRegression,This is an example,0.335254,0.023981,2639.683433,23.963857,0.023981
2.pickle,LinearRegression,RFE to 10 features,0.298478,0.02274,2705.838105,24.067241,0.022748
3.pickle,BayesianRidge,all features,0.478924,0.022792,2705.694913,24.058807,0.0228
4.pickle,LinearRegression,added distance and crs_elapsed_time,0.318419,0.02521,2698.999142,23.990768,0.025218
5.pickle,LinearRegression,This is an example,0.176739,0.025575,2211.004225,22.450006,0.025595
6.pickle,LinearRegression,week before. historic average,0.272816,-0.022939,2353.875309,26.238905,-0.004109
7.pickle,LinearRegression,week before. historic average,0.22774,-0.022939,2353.875309,26.238905,-0.004109
