In [38]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling
import pandas as pd
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [39]:
# Column names passed to preparation.standardize_data as `numeric_features`.
# The order of the active entries is exactly as originally written.
NUMERIC_FEATURES = [
    # -- averages keyed by flight number --
    'fl_num_avg_arr_delay',
    'fl_num_avg_dep_delay',
    # Disabled flight-number averages (kept for reference):
    #   fl_num_avg_carrier_delay, fl_num_avg_weather_delay,
    #   fl_num_avg_nas_delay, fl_num_avg_security_delay,
    #   fl_num_avg_taxi_out, fl_num_avg_wheels_off, fl_num_avg_wheels_on,
    #   fl_num_avg_taxi_in, fl_num_avg_crs_elapsed_time,
    #   fl_num_avg_actual_elapsed_time, fl_num_avg_air_time
    'fl_num_avg_late_aircraft_delay',
    'fl_num_avg_total_add_gtime',
    # Disabled: fl_num_avg_longest_add_gtime

    # -- flight-level values --
    'Severity',
    'distance',
    'crs_elapsed_time',

    # -- origin_* weather columns --
    'origin_cold',
    'origin_fog',
    'origin_hail',
    'origin_precipitation',
    'origin_rain',
    'origin_snow',
    'origin_storm',

    # -- dest_* weather columns --
    'dest_cold',
    'dest_fog',
    'dest_hail',
    'dest_precipitation',
    'dest_rain',
    'dest_snow',
    'dest_storm',

    # -- averages keyed by tail number / carrier / destination / origin --
    'tail_num_avg_dep_delay',
    'tail_num_avg_arr_delay',
    'carrier_avg_dep_delay',
    'carrier_avg_arr_delay',
    'dest_avg_dep_delay',
    'dest_avg_arr_delay',
    'origin_avg_dep_delay',
    'origin_avg_arr_delay',
    'carrier_avg_carrier_delay',
    'tail_num_avg_taxi_out',
    'tail_num_avg_taxi_in',
    'dest_avg_taxi_out',
    'origin_avg_taxi_in',
]

In [40]:
# Comma-separated column list that is interpolated verbatim into the SELECT
# clause built by get_train_flights. This variant omits the target column.
PRIMARY_TEST_FEATURES = """ fl_date, 
                        op_unique_carrier, 
                        op_carrier_fl_num, 
                        origin, 
                        dest, 
                        crs_dep_time,
                        crs_arr_time, 
                        crs_elapsed_time,
                        distance,  
                        tail_num
                    """
# Same column list plus the regression target `arr_delay`; this is the default
# `features` argument of get_train_flights.
PRIMARY_FEATURES =  PRIMARY_TEST_FEATURES+',arr_delay'

def get_train_flights(features:str=PRIMARY_FEATURES) -> pd.DataFrame:
    """
    Return the training flights for the first week of January 2019.

    Queries the ``flights`` table for 2019-01-01 through 2019-01-07 and then
    keeps only the rows that
      * have an ``op_carrier_fl_num`` listed in
        ``../data/preprocessing/test_flight_numbers.csv``, and
      * have a non-null ``arr_delay``.

    Parameters
    ----------
    features : str
        Column list interpolated verbatim into the SELECT clause. It must
        include ``op_carrier_fl_num`` and ``arr_delay`` because both filters
        above need them. The text is not escaped, so only pass trusted,
        hard-coded column lists.

        Example: get_train_flights("fl_date,op_carrier_fl_num,arr_delay")

    Returns
    -------
    pd.DataFrame
        The filtered query result (original index labels are kept).

    Raises
    ------
    KeyError
        If the query result lacks a column required by the filters.
    """
    flights = database.query(f"""SELECT {features}
                             FROM flights
                                WHERE fl_date = ANY('{{2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01-05, 2019-01-06, 2019-01-07}}')
                             """)

    # Fail with an explicit message instead of an opaque KeyError from the
    # indexing below when the caller's column list omits a filter column.
    required_columns = ('op_carrier_fl_num', 'arr_delay')
    missing_columns = [name for name in required_columns if name not in flights.columns]
    if missing_columns:
        raise KeyError(
            f"`features` must include {list(required_columns)}; "
            f"missing from query result: {missing_columns}"
        )

    flight_numbers = pd.read_csv('../data/preprocessing/test_flight_numbers.csv')

    # Apply both row filters in one pass: known flight number AND observed target.
    has_known_flight_number = flights['op_carrier_fl_num'].isin(flight_numbers['op_carrier_fl_num'])
    has_target = flights['arr_delay'].notnull()
    return flights[has_known_flight_number & has_target]

In [41]:
# Load the training flights with the default column list (PRIMARY_FEATURES)
# and display the resulting (rows, columns) shape.
train_flights = get_train_flights()
train_flights.shape

(142951, 11)

In [42]:
# Load the four lookup tables from ../data/preprocessing. The next cell merges
# selected columns from each into train_flights, keyed by tail_num,
# op_unique_carrier, dest and origin respectively.
tail = pd.read_csv('../data/preprocessing/averages_by_tail_num.csv')
carrier = pd.read_csv('../data/preprocessing/averages_by_carrier.csv')
dest = pd.read_csv('../data/preprocessing/averages_by_dest.csv')
origin = pd.read_csv('../data/preprocessing/averages_by_origin.csv')

In [43]:
# Attach the lookup-table averages to every flight, one key at a time.
# `merge` is called with its default join type (inner), so a flight whose key
# is absent from a lookup table would be dropped.
# This cell reassigns train_flights, so it is meant to run once per kernel.
# NOTE(review): NUMERIC_FEATURES lists 'origin_avg_dep_delay', but that column
# is not selected from `origin` here -- confirm whether that is intentional.
train_flights = (
    train_flights
    .merge(
        tail[[
            'tail_num',
            'tail_num_avg_dep_delay',
            'tail_num_avg_arr_delay',
            'tail_num_avg_taxi_out',
            'tail_num_avg_taxi_in',
        ]],
        on='tail_num',
    )
    .merge(
        carrier[[
            'op_unique_carrier',
            'carrier_avg_dep_delay',
            'carrier_avg_arr_delay',
            'carrier_avg_carrier_delay',
        ]],
        on='op_unique_carrier',
    )
    .merge(
        dest[[
            'dest',
            'dest_avg_taxi_out',
            'dest_avg_dep_delay',
            'dest_avg_arr_delay',
        ]],
        on='dest',
    )
    .merge(
        origin[[
            'origin',
            'origin_avg_taxi_in',
            'origin_avg_arr_delay',
        ]],
        on='origin',
    )
)

In [47]:
# Count missing values per column and show the five largest counts
# (ascending sort, so the columns with the most NaNs come last).
# NOTE(review): this cell is recorded as In [47], i.e. it was executed after
# the feature-building cell below (In [45]); the saved output lists
# `dest_hail`, which the merges above do not add. Confirm the intended cell
# order before relying on a top-to-bottom run.
train_flights.isna().sum().sort_values().tail(5)

dest_avg_taxi_out         0
dest_avg_dep_delay        0
dest_avg_arr_delay        0
tail_num_avg_dep_delay    0
dest_hail                 0
dtype: int64

In [45]:
# Add the engineered feature columns and report the frame shape before/after.
# The recorded output shows the column count growing from 23 to 47 with the
# row count unchanged.
# NOTE(review): the meaning of the positional `True` flag is defined in
# preparation.build_all_features -- confirm and pass it by keyword if possible.
# This cell reassigns train_flights, so it is meant to run once per kernel.
print(f'before: {train_flights.shape}')
train_flights = preparation.build_all_features(train_flights, True)
print(f'after: {train_flights.shape}')

before: (142951, 23)
after: (142951, 47)


In [48]:
# Separate the regression target from the feature columns.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop was deprecated in pandas 1.3 and is rejected
# by pandas 2.x.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# Hold out a validation set; the recorded output shows 100065 / 42886 rows.
# NOTE(review): the split ratio and any random seed live inside
# preparation.get_train_test_split -- confirm the split is reproducible.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(100065, 46)
(42886, 46)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,tail_num,...,origin_cold,dest_cold,origin_storm,dest_storm,origin_precipitation,dest_precipitation,origin_snow,dest_snow,origin_hail,dest_hail
85279,2019-01-05,YX,3462,RDU,EWR,1400,1545,105.0,416.0,N747YX,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Run both splits through preparation.standardize_data with a StandardScaler
# and the NUMERIC_FEATURES column list. The recorded output shows the frames
# going from 46 to 42 columns.
# NOTE(review): whether the scaler is fitted on the training split only is
# decided inside preparation.standardize_data -- confirm.
X_train, X_val = preparation.standardize_data(
    data_arr=[X_train, X_val],
    numeric_features=NUMERIC_FEATURES,
    scaler=StandardScaler(),
)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(100065, 42)
(42886, 42)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_late_aircraft_delay,distance,crs_elapsed_time,origin_cold,...,tail_num_avg_taxi_in,dest_avg_taxi_out,origin_avg_taxi_in,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
85279,-0.833058,-0.553185,-0.502204,-0.864749,-0.033492,-0.347297,-1.252353,-0.636233,-0.514635,-0.069789,...,0.857941,1.584422,0.386518,0,0,0,0,0,1,0


In [50]:
import plotly.express as px

# Columns to leave out of the correlation check. No earlier cell defines this
# name (the only assignment is in a commented-out cell further down), so a
# fresh "Restart & Run All" used to stop here with a NameError. Default to
# excluding nothing; add column names to this list to exclude them.
features_to_drop = []

# Absolute Pearson correlation of every remaining feature with the target.
# Keyword `columns=` / `axis=1` replace the positional axis arguments, which
# pandas 2.x no longer accepts. The target's correlation with itself
# (always 1.0) is dropped so it does not dominate the chart.
correlations = (
    pd.concat([X_train.drop(columns=features_to_drop), y_train], axis=1)
    .corr()['arr_delay']
    .drop('arr_delay')
    .abs()
)

fig = px.bar(correlations, title='Absolute correlation of each feature with arr_delay')
fig.show()

In [51]:
from sklearn.linear_model import LinearRegression

In [52]:
# features_to_drop = ['origin_precipitation','dest_precipitation', 'origin_avg_taxi_in', 'day_of_week_2', 'day_of_week_3','day_of_week_5','dest_cold','dest_fog','dest_hail','origin_cold','origin_fog','distance','carrier_avg_carrier_delay']

# model = LinearRegression()
# notes = "droping featuresX2, outlier cap -60 / 120"
# modeling.run_test(X_train.drop(features_to_drop,1),X_val.drop(features_to_drop,1),y_train,y_val, model, notes)

In [53]:

# Baseline: ordinary least-squares regression on all standardized features.
# modeling.run_test receives the train/validation splits, the estimator and a
# free-text note; the recorded output below shows the metrics it reported.
# NOTE(review): the note mentions an outlier cap of -60 / 120, but no capping
# step is visible in this notebook -- presumably it happens inside a
# custom_scripts helper; confirm the note matches what was actually run.
model = LinearRegression()
notes = "outlier cap -60 / 120"
modeling.run_test(X_train,X_val,y_train,y_val, model, notes)

{'r2_score': 0.05269996017417977, 'mean_squared_error': 2093.530405696511, 'mean_absolute_error': 21.889875827095903, 'explained_variance_score': 0.05273824873950694}


LinearRegression()