In [3]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling
import pandas as pd

In [4]:
PRIMARY_TEST_FEATURES = """ fl_date, 
                        op_unique_carrier, 
                        op_carrier_fl_num, 
                        origin, 
                        dest, 
                        crs_dep_time,
                        crs_arr_time, 
                        crs_elapsed_time, 
                        distance
                    """
PRIMARY_FEATURES =  PRIMARY_TEST_FEATURES+',arr_delay'

def get_train_flights(features:str=PRIMARY_FEATURES) -> pd.DataFrame:
    """ 
    Returns DataFrame of all flights from first week of January 2019
    
    Accepts an optional argument for specific features to query in string format
    
        Example: get_train_flights("fl_date,tail_num,distance") 
    """
    flights = database.query(f"""SELECT {features}
                             FROM flights
                                WHERE fl_date = ANY('{{2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01-05, 2019-01-06, 2019-01-07
                                                       2018-01-01, 2018-01-02, 2018-01-03, 2018-01-04, 2018-01-05, 2018-01-06, 2018-01-07     }}')
                             """)
    flight_numbers = pd.read_csv('../data/preprocessing/test_flight_numbers.csv')  
    #apply filters
    flights = flights[flights['op_carrier_fl_num'].isin(flight_numbers['op_carrier_fl_num'].values)]
    flights = flights[flights['arr_delay'].notnull()]
    return  flights

In [5]:
train_flights = get_train_flights()
print(train_flights.shape)
train_flights.head(1)

(233582, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2018-01-02,9E,3622,TYS,MSP,728,914,166.0,792.0,-16.0


In [6]:
print(f'before: {train_flights.shape}')
train_flights = preparation.build_all_features(train_flights)
train_flights.head(1)
print(f'after: {train_flights.shape}')

before: (233582, 10)
after: (233582, 40)


In [7]:
X = train_flights.drop('arr_delay',1)
y = train_flights['arr_delay']
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(163507, 39)
(70075, 39)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_dep_delay,...,day_of_year,day_of_week,Severity,Rain,Fog,Cold,Storm,Precipitation,Snow,Hail
146315,2018-01-03,DL,2760,MSY,ATL,700,930,90.0,425.0,10.648708,...,3,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.preprocessing import StandardScaler
X_train, X_val = preparation.standardize_data(data_arr=[X_train, X_val], scaler=StandardScaler())
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(163507, 43)
(70075, 43)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,Cold,Fog,Hail,Precipitation,Rain,Snow,...,day_of_year_4,day_of_year_5,day_of_year_6,day_of_year_7,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
146315,0.607389,-0.794404,0.965595,-0.260049,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
notes = "2018+2019 data"
modeling.run_test(X_train.fillna(0),X_val.fillna(0),y_train,y_val, model, notes)

{'r2_score': 0.023586426943261407, 'mean_squared_error': 2757.5813926722662, 'mean_absolute_error': 25.053316987179393, 'explained_variance_score': 0.0235983852369388}


LinearRegression()