In [1]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling

## Querying the database

In [2]:
# Minimal demonstration of the database helper: the SQL text below is handed
# to database.query, and only the first row of the result is displayed.
example_query =  """
                SELECT * FROM flights
                    LIMIT 10;
                """

example = database.query(example_query)
example.head(n=1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-08-10,UA,UA_CODESHARE,UA,5814,OO,N127SY,5814,13930,ORD,...,1437.0,,,,,,,,,


## Fetching predetermined sets of data

In [3]:
# Load the predefined training set, report its (rows, columns) shape,
# and preview the first record.
train_flights = preprocessing.get_train_flights()
print(train_flights.shape)
train_flights.head(n=1)

(142951, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
2,2019-01-01,OH,5621,CLT,MLB,1140,1328,108.0,490.0,0.0


In [4]:
# Load the predefined test set, report its (rows, columns) shape,
# and preview the first record.
# NOTE(review): the recorded output shows one column fewer than the training
# set (no `arr_delay`), i.e. the target is presumably withheld — confirm.
test_flights = preprocessing.get_test_flights()
print(test_flights.shape)
test_flights.head(n=1)

(150623, 9)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,2020-01-01,WN,5888,ONT,SFO,1810,1945,95,363


## Adding features to table with preparation script

In [5]:
# Augment the training frame with the columns added by
# preparation.build_all_features, then preview the first record.
# NOTE(review): the result is assigned back to `train_flights`, so re-running
# this cell passes the already-augmented frame in again — confirm the helper
# tolerates that, or restart the kernel before re-running.
train_flights = preparation.build_all_features(train_flights)
train_flights.head(n=1)

Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,...,fl_num_avg_weather_delay,fl_num_avg_nas_delay,fl_num_avg_security_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_total_add_gtime,fl_num_avg_longest_add_gtime,arr_time_sin,dep_time_sin,arr_time_cos,dep_time_cos
0,2019-01-01,OH,5621,CLT,MLB,1140,1328,108.0,490.0,0.0,...,2.469388,10.267857,0.0,32.887755,51.411765,51.411765,-0.376877,0.085041,-0.926263,-0.996377


## Splitting the data (the split size and random seed are defined in the preparation script)

In [10]:
# Separate the predictors from the target column `arr_delay`.
# `columns=` is used instead of a positional axis argument: pandas deprecated
# positional `axis` for DataFrame.drop in 1.3 and rejects it from 2.0 onwards.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# Split into training and validation parts using the project helper.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)

# Report both shapes and preview the first training record.
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(100065, 29)
(42886, 29)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_dep_delay,...,fl_num_avg_weather_delay,fl_num_avg_nas_delay,fl_num_avg_security_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_total_add_gtime,fl_num_avg_longest_add_gtime,arr_time_sin,dep_time_sin,arr_time_cos,dep_time_cos
85279,2019-01-04,AA,35,SAT,DFW,1444,1555,71.0,247.0,11.070847,...,2.489331,11.72973,0.176387,21.529161,21.15625,21.15625,-0.856411,-0.65808,-0.516295,-0.752948


## Standardizing the data with preparation script

In [11]:
from sklearn.preprocessing import StandardScaler

# Pass both splits to the project helper together with a fresh StandardScaler.
# NOTE(review): the recorded output shows the column count dropping from 29 to
# 20, so the helper apparently also discards some columns — confirm which ones
# and whether the scaler is fitted on the training split only.
X_train, X_val = preparation.standardize_data(
    data_arr=[X_train, X_val],
    scaler=StandardScaler(),
)
print(X_train.shape)
print(X_val.shape)
X_train.head(n=1)

(100065, 20)
(42886, 20)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_carrier_delay,fl_num_avg_weather_delay,fl_num_avg_nas_delay,fl_num_avg_security_delay,fl_num_avg_taxi_out,fl_num_avg_wheels_off,fl_num_avg_wheels_on,fl_num_avg_taxi_in,fl_num_avg_crs_elapsed_time,fl_num_avg_actual_elapsed_time,fl_num_avg_air_time,fl_num_avg_late_aircraft_delay,fl_num_avg_total_add_gtime,fl_num_avg_longest_add_gtime
85279,-0.856411,-0.516295,-0.65808,-0.752948,0.231877,0.159406,1.076915,-0.528789,-0.897555,0.504979,-0.659053,0.116104,0.680872,0.715255,2.514723,2.573077,2.542862,-0.87582,-1.58649,-1.573904


## Using the modeling script to run a test, automatically saving the scores and the model to the local drive

In [11]:
# `from ... import ...` is the valid form; the previous
# `import sklearn.linear_model import LinearRegression` was a SyntaxError.
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares with an intercept term.
model = LinearRegression(fit_intercept=True)

# Free-text note passed to the modeling helper alongside the model.
notes = "This is an example"

# Run the test on the train/validation splits via the project helper.
modeling.run_test(X_train, X_val, y_train, y_val, model, notes)

{'r2_score': -2.1065611590653788e-06, 'mean_squared_error': 2523.296610862557, 'mean_absolute_error': 24.625005634594437, 'explained_variance_score': 6.810394955536481e-06}


LinearRegression()

## Retrieving the records of past modeling tests

In [12]:
# Display the table of previously recorded modeling runs
# (model, notes, training time and scores, per the recorded output).
modeling.get_records()

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score
1.pickle,LinearRegression,This is an example,0.043895,-2e-06,2523.296611,24.625006,7e-06


## Retrieving a saved model

In [13]:
# Fetch a previously saved model by its number.
# NOTE(review): presumably 1 refers to the `1.pickle` entry in the records
# table — confirm. If this unpickles a file, only load files you trust.
saved_model = modeling.get_pickle(1)