In [1]:
import sys

# Make the project's helper package importable from this notebook: ".." must
# resolve to the root directory of the project relative to this file.
sys.path.append("..")

# Project helper modules used throughout this notebook.
from custom_scripts import database, preprocessing, preparation, modeling

## Querying the database

In [2]:
# Ad-hoc SQL is sent through the project's database helper; this sample query
# asks for 10 rows from the `flights` table.
example_query =  """
                SELECT * FROM flights
                    LIMIT 10;
                """

# The result exposes `.head`, so it is presumably a pandas DataFrame — confirm in database.query.
example = database.query(example_query)
example.head(1)  # display a single row to inspect the available columns

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2019-01-20,DL,DL,DL,959,DL,N992DL,959,10397,ATL,...,526.0,,,,,,,,,


## Fetching predetermined sets of data

In [3]:
# Load the predetermined training set through the preprocessing script.
train_flights = preprocessing.get_train_flights()
print(train_flights.shape)  # (rows, columns)
train_flights.head(1)  # preview one row; the displayed columns include `arr_delay`

(142951, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2019-01-01,OH,5621,CLT,MLB,1140,1328,108.0,490.0,0.0


In [4]:
# Load the predetermined test set through the preprocessing script.
test_flights = preprocessing.get_test_flights()
print(test_flights.shape)  # (rows, columns)
test_flights.head(1)  # preview one row; the displayed columns do not include `arr_delay`

(150623, 9)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,2020-01-01,WN,5888,ONT,SFO,1810,1945,95,363


## Adding features to table with preparation script

In [5]:
# Print the shape before and after feature building to show how many columns are added.
print(f'before: {train_flights.shape}')
# NOTE(review): `train_flights` is rebound to the enriched frame, so re-running this
# cell passes an already-enriched frame back into build_all_features. Reload the data
# first when re-executing. The effect of `sparse=True` is defined in the preparation
# script — not visible from this notebook.
train_flights = preparation.build_all_features(train_flights, sparse=True)
print(f'after: {train_flights.shape}')
train_flights.head(1)  # preview one row of the enriched frame

before: (142951, 10)
after: (142951, 34)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,...,origin_cold,dest_cold,origin_storm,dest_storm,origin_precipitation,dest_precipitation,origin_snow,dest_snow,origin_hail,dest_hail
0,2019-01-01,OH,5621,CLT,MLB,1140,1328,108.0,490.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Splitting the data with size and random seed held in script

In [6]:
# Separate the features from the regression target (`arr_delay`).
# `columns=` is used instead of the positional axis argument (`drop('arr_delay', 1)`):
# passing `axis` positionally was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onwards.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# The split size and random seed are held inside the preparation script.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)  # preview one row of the training split

(100065, 33)
(42886, 33)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_arr_delay,...,origin_cold,dest_cold,origin_storm,dest_storm,origin_precipitation,dest_precipitation,origin_snow,dest_snow,origin_hail,dest_hail
85279,2019-01-07,G4,35,RNO,LAS,2038,2152,74.0,345.0,6.318471,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


## Standardizing the data with preparation script

In [7]:
from sklearn.preprocessing import StandardScaler

# Hand both splits to the preparation script together with a fresh scaler instance.
# NOTE(review): X_train / X_val are rebound to the returned frames, so re-running this
# cell feeds already-processed frames back in. How the scaler is fitted and applied is
# decided inside preparation.standardize_data — confirm there.
X_train, X_val = preparation.standardize_data(
    data_arr=[X_train, X_val],
    scaler=StandardScaler(),
)
print(X_train.shape, X_val.shape, sep="\n")  # one shape per line
X_train.head(1)  # preview one processed row

(100065, 32)
(42886, 32)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_carrier_delay,fl_num_avg_taxi_out,fl_num_avg_late_aircraft_delay,distance,...,dest_rain,dest_snow,dest_storm,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
85279,-0.526541,0.85015,-0.769231,0.63897,0.232279,0.158947,1.076211,-0.659896,-0.880293,-0.752995,...,-0.694954,-0.279221,-0.083691,1,0,0,0,0,0,0


## Using the modeling script to run a test, automatically saving the scores and model to the local drive

In [8]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
notes = "This is an example (sparse features)"

# Missing values are replaced with 0 in every split — features and targets alike —
# before the four splits, the model and the free-text notes go to the modeling script.
modeling.run_test(
    *(split.fillna(0) for split in (X_train, X_val, y_train, y_val)),
    model,
    notes,
)

{'r2_score': 0.04023781317651709, 'mean_squared_error': 1969.3717820181662, 'mean_absolute_error': 22.161662384568267, 'explained_variance_score': 0.040238381212149066}


LinearRegression()

## Retrieving the records of past modeling tests

In [9]:
# Fetch the records of past modeling tests and display only the last row.
modeling.get_records().tail(1)

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score
22.pickle,LinearRegression,This is an example (sparse features),0.116148,0.040238,1969.371782,22.161662,0.040238


## Retrieving saved model

In [13]:
# Load a previously saved model through the modeling script.
# NOTE(review): presumably the argument selects the saved file by number (the records
# table lists entries such as `22.pickle`), so `1` would not be the run saved above —
# confirm against modeling.get_pickle.
# NOTE(review): if this unpickles a file from disk, only load pickles from a trusted source.
saved_model = modeling.get_pickle(1)