In [1]:
import sys
sys.path.append("..")
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling

In [2]:
### Get training data
# Load the training flights through the project's preprocessing helper and
# display the frame's (rows, columns) shape as the cell output.
train_flights = preprocessing.get_train_flights()
train_flights.shape


(287151, 11)

In [3]:
### Dates of samples
# List the distinct 'fl_date' values present in the training data to see
# which calendar days the samples cover.
train_flights['fl_date'].unique()

array(['2019-01-06', '2019-01-07', '2019-12-25', '2019-12-26',
       '2019-12-27', '2019-12-28', '2019-12-29', '2019-12-30',
       '2019-12-31', '2019-01-01', '2019-01-02', '2019-01-03',
       '2019-01-04', '2019-01-05'], dtype=object)

In [4]:
### Get testing data
# Keep the frame returned by the loader untouched under `test_flights`.
test_flights = preprocessing.get_test_flights()
# Derive the working copy in one step: astype() returns a new frame, and
# casting 'fl_date' to str puts it in line with the format of the training data.
X_test = test_flights.astype({'fl_date': str})
X_test.shape


(150623, 10)

In [5]:
### Build features
# Run the same project feature-building step on the training frame and on the
# test frame. The recorded output shows 47 vs. 46 columns; the extra training
# column is presumably the 'arr_delay' target — confirm.
# NOTE(review): both names are rebound to the helper's output, so re-running
# this cell on its own passes already-featurised frames back into
# build_all_features. Re-run from the loading cells instead.
train_flights = preparation.build_all_features(train_flights)
X_test = preparation.build_all_features(X_test)
print(f'training data: {train_flights.shape}')
print(f'testing data: {X_test.shape}')

training data: (287151, 47)
testing data: (150623, 46)


In [6]:
### Split data for validation set
# Separate the features from the 'arr_delay' target. The `columns=` keyword is
# used because passing the axis positionally (`drop('arr_delay', 1)`) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']
# Hold out part of the training data for validation via the project helper.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
# Sanity check: features and target should have matching row counts per split.
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)


(201005, 46)
(201005,)
(86146, 46)
(86146,)


In [7]:
### Standardize data
from sklearn.preprocessing import RobustScaler
# Pass the train, validation and test feature frames, together with a single
# RobustScaler instance, to the project's standardisation helper. The three
# names are rebound to whatever the helper returns.
# NOTE(review): the recorded shapes go from 46 to 42 columns, so
# standardize_data appears to drop columns as well as scale — confirm in
# custom_scripts/preparation. Also confirm the scaler is fitted on X_train only.
X_train, X_val, X_test = preparation.standardize_data(data_arr=[X_train, X_val, X_test],scaler=RobustScaler())
# Show the resulting shapes and the first training row as a sanity check.
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(201005, 42)
(86146, 42)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_taxi_out,tail_num_avg_arr_delay,tail_num_avg_dep_delay,...,dest_rain,dest_snow,dest_storm,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
168275,-0.954633,-0.297784,-0.77616,-0.630536,0.090711,0.098012,0.16508,-0.063377,0.023026,0.015113,...,0.0,3.0,0.0,0,1,0,0,0,0,0


# Baseline Linear Regression test

In [8]:
from sklearn.linear_model import LinearRegression
# Baseline model: a LinearRegression with default settings.
l_reg = LinearRegression()
# Free-text label for this run; it appears in the 'notes' column of the
# records table displayed by the random-forest cell.
notes = "baseline"
# Hand both splits and the model to the project's test runner. Presumably it
# fits on the training split and scores on the validation split — confirm in
# custom_scripts/modeling. The recorded output shows it printing a metrics
# dict and returning the estimator.
modeling.run_test(X_train,X_val,y_train,y_val, l_reg, notes)

{'r2_score': 0.09315781081874674, 'mean_squared_error': 705.8619975484231, 'mean_absolute_error': 18.030044347694464, 'explained_variance_score': 0.09315836670168276}


LinearRegression()

In [9]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest candidate with fixed hyperparameters; random_state is pinned
# so repeated fits use the same seed.
# NOTE(review): the recorded run log shows a training_time of ~447 s for this
# model, so expect this cell to dominate a full re-run of the notebook.
r_forest = RandomForestRegressor(n_estimators=180, max_depth=18, min_samples_leaf=8, random_state=42)
# Label stored with this run in the records table.
notes = "validation"
modeling.run_test(X_train,X_val,y_train,y_val, r_forest, notes)
# Display the two most recent rows of the run log so this model can be
# compared against the baseline.
modeling.get_records().tail(2)

{'r2_score': 0.1847767133055207, 'mean_squared_error': 634.5482648019391, 'mean_absolute_error': 16.87382699241848, 'explained_variance_score': 0.18479245176426684}


Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score,f1_score,recall_score,precision_score,accuracy_score
146.pickle,LinearRegression,baseline,0.365907,0.093158,705.861998,18.030044,0.093158,,,,
147.pickle,RandomForestRegressor,validation,446.767734,0.184777,634.548265,16.873827,0.184792,,,,


In [10]:
# Predict arrival delay for the prepared test features.
# NOTE(review): this relies on run_test having fitted r_forest in place in the
# previous cell — confirm in custom_scripts/modeling.
predictions = r_forest.predict(X_test)

# Attach the predictions to the original test frame as 'arr_delay' and write
# the result to disk.
# NOTE(review): the assignment adds a column to test_flights in place, and
# (assuming a pandas DataFrame) to_csv's default also writes the index as the
# first CSV column — confirm that is the intended output layout.
test_flights['arr_delay'] = predictions
test_flights.to_csv('../final_predictions.csv')