In [8]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling
import pandas as pd
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

In [9]:
# Numeric columns passed as `numeric_features` to preparation.standardize_data.
#
# Per-flight-number averages that are currently disabled (not part of the list):
#   fl_num_avg_carrier_delay, fl_num_avg_weather_delay, fl_num_avg_nas_delay,
#   fl_num_avg_security_delay, fl_num_avg_taxi_out, fl_num_avg_wheels_off,
#   fl_num_avg_wheels_on, fl_num_avg_taxi_in, fl_num_avg_crs_elapsed_time,
#   fl_num_avg_actual_elapsed_time, fl_num_avg_air_time,
#   fl_num_avg_longest_add_gtime
NUMERIC_FEATURES = [
    # Averages keyed by flight number
    "fl_num_avg_arr_delay",
    "fl_num_avg_dep_delay",
    "fl_num_avg_late_aircraft_delay",
    "fl_num_avg_total_add_gtime",
    # Other per-flight columns
    "Severity",
    "distance",
    "crs_elapsed_time",
    # Weather flags: every origin_* entry first, then every dest_* entry
    *(
        f"{airport}_{condition}"
        for airport in ("origin", "dest")
        for condition in ("cold", "fog", "hail", "precipitation", "rain", "snow", "storm")
    ),
    # Averages keyed by tail number, carrier, destination and origin
    "tail_num_avg_dep_delay",
    "tail_num_avg_arr_delay",
    "carrier_avg_dep_delay",
    "carrier_avg_arr_delay",
    "dest_avg_dep_delay",
    "dest_avg_arr_delay",
    "origin_avg_dep_delay",
    "origin_avg_arr_delay",
    "carrier_avg_carrier_delay",
    "tail_num_avg_taxi_out",
    "tail_num_avg_taxi_in",
    "dest_avg_taxi_out",
    "origin_avg_taxi_in",
]

In [30]:
# Comma-separated column list that is interpolated into the SELECT clause of the
# flights query; it does not include the regression target (arr_delay).
PRIMARY_TEST_FEATURES = """ fl_date, 
                        op_unique_carrier, 
                        op_carrier_fl_num, 
                        origin, 
                        dest, 
                        crs_dep_time,
                        crs_arr_time, 
                        crs_elapsed_time,
                        distance,  
                        tail_num
                    """
# Same columns plus the target column; used as the default of get_train_flights.
PRIMARY_FEATURES =  PRIMARY_TEST_FEATURES+',arr_delay'

def get_train_flights(features:str=PRIMARY_FEATURES) -> pd.DataFrame:
    """
    Return the training sample of flights from the first week of January 2019.

    Only rows with -30 <= arr_delay <= 120 are queried, and only flights whose
    ``op_carrier_fl_num`` appears in
    ``../data/preprocessing/test_flight_numbers.csv`` are kept.

    Parameters
    ----------
    features : str
        Comma-separated column list placed verbatim in the SELECT clause.
        It is interpolated into the SQL text, so it must come from trusted
        code, never from user input.

    Returns
    -------
    pd.DataFrame
        The filtered flights, containing the requested columns.

        Example: get_train_flights("fl_date,tail_num,distance")
    """
    requested = [name.strip() for name in features.split(",") if name.strip()]

    # The flight-number filter below needs op_carrier_fl_num. Query it even when
    # the caller did not ask for it (otherwise the documented example raises
    # KeyError) and remove it again before returning.
    helper_columns = []
    if "*" not in requested and "op_carrier_fl_num" not in requested:
        helper_columns.append("op_carrier_fl_num")
    select_clause = ", ".join([features, *helper_columns])

    flights = database.query(f"""SELECT {select_clause}
                             FROM flights
                                WHERE fl_date = ANY('{{2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01-05, 2019-01-06, 2019-01-07}}')
                                AND arr_delay >= -30
                                AND arr_delay <= 120
                             """)
    flight_numbers = pd.read_csv('../data/preprocessing/test_flight_numbers.csv')

    # Keep only the flight numbers listed in the CSV.
    flights = flights[flights['op_carrier_fl_num'].isin(flight_numbers['op_carrier_fl_num'].values)]

    # The WHERE clause already rejects NULL arr_delay (comparisons with NULL are
    # never true); the explicit check is kept for when the column was selected.
    if 'arr_delay' in flights.columns:
        flights = flights[flights['arr_delay'].notnull()]

    if helper_columns:
        flights = flights.drop(columns=helper_columns)
    return flights

In [31]:
# Query the training sample with the default column list (PRIMARY_FEATURES)
# and display its (rows, columns) shape.
train_flights = get_train_flights()
train_flights.shape

(133530, 11)

In [32]:
# Read the four averages_by_<key>.csv lookup tables, in the same order as the
# names on the left-hand side.
tail, carrier, dest, origin = (
    pd.read_csv(f'../data/preprocessing/averages_by_{grouping}.csv')
    for grouping in ('tail_num', 'carrier', 'dest', 'origin')
)

In [33]:
# Attach the historical-average columns of each lookup table to every flight.
# Each entry is (lookup frame, join key, columns to bring in); the merges run in
# this order, so the resulting column order matches the one-merge-per-statement form.
#
# NOTE(review): NUMERIC_FEATURES lists 'origin_avg_dep_delay', but that column is
# not merged from `origin` here - confirm whether it should be added.
_average_lookups = [
    (tail, 'tail_num', ['tail_num_avg_dep_delay',
                        'tail_num_avg_arr_delay',
                        'tail_num_avg_taxi_out',
                        'tail_num_avg_taxi_in']),
    (carrier, 'op_unique_carrier', ['carrier_avg_dep_delay',
                                    'carrier_avg_arr_delay',
                                    'carrier_avg_carrier_delay']),
    (dest, 'dest', ['dest_avg_taxi_out',
                    'dest_avg_dep_delay',
                    'dest_avg_arr_delay']),
    (origin, 'origin', ['origin_avg_taxi_in',
                        'origin_avg_arr_delay']),
]

for _lookup, _key, _columns in _average_lookups:
    train_flights = pd.merge(
        train_flights,
        _lookup[[_key, *_columns]],
        on=_key,
        how='inner',              # same join type as the pd.merge default, now explicit
        validate='many_to_one',   # fail loudly if a lookup key is duplicated instead of multiplying rows
    )

In [34]:
# Count nulls per column, sort ascending and show the last five entries,
# i.e. the five columns with the most missing values.
train_flights.isna().sum().sort_values().tail(5)

origin                  0
op_carrier_fl_num       0
op_unique_carrier       0
arr_delay               0
origin_avg_arr_delay    0
dtype: int64

In [35]:
# Run feature engineering and report the frame shape before and after.
# The former mid-cell `train_flights.head(1)` was removed: only the last
# expression of a cell is displayed, so it had no effect.
# NOTE(review): the meaning of the positional `True` is defined by
# preparation.build_all_features - confirm and pass it by keyword if possible.
print(f'before: {train_flights.shape}')
train_flights = preparation.build_all_features(train_flights, True)
print(f'after: {train_flights.shape}')

before: (133530, 23)
after: (133530, 47)


In [36]:
# Separate the target (arr_delay) from the predictors and split into
# training and validation sets.
# `columns=` replaces the positional axis argument `drop('arr_delay', 1)`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(93471, 46)
(40059, 46)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,tail_num,...,origin_cold,dest_cold,origin_storm,dest_storm,origin_precipitation,dest_precipitation,origin_snow,dest_snow,origin_hail,dest_hail
98469,2019-01-06,AA,985,DCA,ORD,1240,1357,137.0,612.0,N861NN,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Hand both splits to preparation.standardize_data together with the
# NUMERIC_FEATURES list and a StandardScaler, then show the resulting shapes.
X_train, X_val = preparation.standardize_data(
    data_arr=[X_train, X_val],
    numeric_features=NUMERIC_FEATURES,
    scaler=StandardScaler(),
)
print(X_train.shape, X_val.shape, sep='\n')
X_train.head(1)

(93471, 42)
(40059, 42)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_late_aircraft_delay,distance,crs_elapsed_time,origin_cold,...,tail_num_avg_taxi_in,dest_avg_taxi_out,origin_avg_taxi_in,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
98469,-0.490834,-0.871253,-0.175917,-0.984405,-1.322549,-1.997529,-2.123662,-0.286988,-0.045148,-0.0667,...,1.532598,0.212241,-0.129082,0,0,0,0,0,0,1


In [38]:
import plotly.express as px

# Absolute correlation (pandas default method) of every feature with the target.
# The matrix is computed once; the target's own entry (always 1.0) is dropped by
# label so it does not dominate the bar chart.
# `axis=1` is passed by keyword: the positional form `pd.concat(objs, 1)` is no
# longer accepted by pandas 2.0.
correlations = (
    pd.concat([X_train, y_train], axis=1)
    .corr()['arr_delay']
    .drop('arr_delay')
    .abs()
)

fig = px.bar(correlations, title='Absolute correlation with arr_delay')
fig.show()

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
# features_to_drop = ['origin_precipitation','dest_precipitation', 'origin_avg_taxi_in', 'day_of_week_2', 'day_of_week_3','day_of_week_5','dest_cold','dest_fog','dest_hail','origin_cold','origin_fog','distance','carrier_avg_carrier_delay']

# model = LinearRegression()
# notes = "droping featuresX2, outlier cap -60 / 120"
# modeling.run_test(X_train.drop(features_to_drop,1),X_val.drop(features_to_drop,1),y_train,y_val, model, notes)

In [42]:

# Baseline run: linear regression fitted on X_train / y_train and scored on the
# validation split through modeling.run_test. `notes` is stored with the run's
# record; "sample cap -30 / 120" refers to the arr_delay range used in
# get_train_flights.
model = LinearRegression()
notes = "sample cap -30 / 120"
modeling.run_test(X_train,X_val,y_train,y_val, model, notes)

{'r2_score': 0.08827590468555258, 'mean_squared_error': 518.8688630038184, 'mean_absolute_error': 15.927184922597364, 'explained_variance_score': 0.08827872224219802}


LinearRegression()

In [43]:
# Show the last row of the experiment records (the run logged just above).
modeling.get_records().tail(1)

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score,f1_score,recall_score,precision_score,accuracy_score
60.pickle,LinearRegression,sample cap -30 / 120,0.181362,0.088276,518.868863,15.927185,0.088279,,,,


In [45]:
from sklearn.ensemble import RandomForestRegressor

# Random forest run. A fixed seed makes the recorded metrics reproducible on a
# re-run; n_jobs=-1 builds the 1000 trees on all cores (an earlier 1000-tree run
# in the records took ~689 s of training time) without changing the fitted model.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=15,
    random_state=42,
    n_jobs=-1,
)
notes = "n_estimators=1000, max_depth=15, sample cap -30 / 120"
modeling.run_test(X_train,X_val,y_train,y_val, model, notes)

In [29]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting run. A fixed seed makes the recorded metrics reproducible on
# a re-run. The note now records the arr_delay sample cap applied in
# get_train_flights, consistent with the other model runs in this notebook.
model = GradientBoostingRegressor(max_depth=10, random_state=42)
notes = "max_depth=10, sample cap -30 / 120"
modeling.run_test(X_train,X_val,y_train,y_val, model, notes)

{'r2_score': 0.06447552817986302, 'mean_squared_error': 2143.2437650227025, 'mean_absolute_error': 20.792071333079168, 'explained_variance_score': 0.06462819794936847}


GradientBoostingRegressor(max_depth=10)

In [27]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost run. The estimator resamples the training data internally, so a fixed
# seed is needed for the recorded metrics to be reproducible. The note now
# records the arr_delay sample cap applied in get_train_flights, consistent with
# the other model runs in this notebook.
model = AdaBoostRegressor(n_estimators=2, random_state=42)
notes = "n_estimators=2, sample cap -30 / 120"
modeling.run_test(X_train,X_val,y_train,y_val, model, notes)

{'r2_score': 0.02464477967496448, 'mean_squared_error': 2234.494187604618, 'mean_absolute_error': 22.316532147653252, 'explained_variance_score': 0.02465666355309004}


AdaBoostRegressor(n_estimators=2)

In [25]:
# Show the last five rows of the experiment records for comparison across runs.
modeling.get_records().tail()

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score,f1_score,recall_score,precision_score,accuracy_score
51.pickle,LinearRegression,"droping featuresX2, outlier cap -60 / 120",0.099338,0.045554,2216.953649,22.072634,0.045577,,,,
52.pickle,LinearRegression,outlier cap -60 / 120,0.192385,0.0527,2093.530406,21.889876,0.052738,,,,
53.pickle,RandomForestRegressor,"n_estimators=1000, max_depth=10",689.167541,0.072056,2125.878173,21.145757,0.07206,,,,
54.pickle,AdaBoostRegressor,,7.627081,-3.477627,10258.038884,88.445036,-0.526817,,,,
55.pickle,AdaBoostRegressor,n_estimators=100,9.211415,-7.299339,19013.40614,123.171368,-1.148521,,,,
