In [1]:
import sys
sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling

## Querying the database

In [2]:
# Example of running raw SQL through the project's database.query helper.
# LIMIT 10 keeps the fetched sample small.
example_query =  """
                SELECT * FROM flights
                    LIMIT 10;
                """

# The result supports .head(), so presumably database.query returns a
# pandas DataFrame -- confirm in custom_scripts.database.
example = database.query(example_query)
example.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2019-01-01,WN,WN,WN,20,WN,N766SW,20,11292,DEN,...,977.0,12.0,0.0,0.0,0.0,26.0,,,,


## Fetching predetermined sets of data

In [3]:
# Fetch the training flights through the preprocessing helper, then show
# the frame's dimensions and its first row as a quick sanity check.
train_flights = preprocessing.get_train_flights()
print(train_flights.shape)
train_flights.head(1)

(142951, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
0,2019-01-01,WN,20,DEN,IND,1605,2015,130.0,977.0,38.0


In [4]:
# Fetch the test flights through the preprocessing helper and preview them.
# In the recorded output this frame has one column fewer than the training
# frame: arr_delay (the column later used as the target) is absent.
test_flights = preprocessing.get_test_flights()
print(test_flights.shape)
test_flights.head(1)

(150623, 9)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,2020-01-01,WN,5888,ONT,SFO,1810,1945,95,363


## Adding features to table with preparation script

In [5]:
# Run the training frame through preparation.build_all_features and preview
# the first row of the result.
# NOTE(review): this rebinds train_flights to the helper's output, so
# re-running this cell without first re-running the load cell passes the
# already-processed frame back into build_all_features -- confirm whether
# the helper tolerates that, or bind the result to a new name.
train_flights = preparation.build_all_features(train_flights)
train_flights.head(1)

Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,...,day_of_year,day_of_week,Severity,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,Type_Storm
0,2019-01-01,WN,20,DEN,IND,1605,2015,130.0,977.0,38.0,...,1,1,,,,,,0.0,,


## Splitting the data (split size and random seed are defined in the preparation script)

In [6]:
# Separate the target column (arr_delay) from the feature columns.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (drop('arr_delay', 1)) was deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 onwards.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# Split into training and validation parts via the project helper; the
# results are unpacked as (X_train, X_val, y_train, y_val).
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)

# Show the dimensions of both feature frames and preview one training row.
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(125822, 39)
(53924, 39)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_dep_delay,...,day_of_year,day_of_week,Severity,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,Type_Storm
10969,2019-01-03,WN,2173,MEM,BWI,1135,1435,120.0,787.0,14.386753,...,3,3,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Standardizing the data with preparation script

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sklearn.preprocessing import StandardScaler
# Pass both feature frames (as a list) and a StandardScaler instance to the
# project helper; the two returned frames replace X_train and X_val.
# NOTE(review): in the recorded output the column count changes from 39 to 42
# and columns such as day_of_week_0..6 appear, so the helper presumably also
# encodes/drops columns in addition to scaling -- confirm in
# custom_scripts.preparation.
X_train, X_val = preparation.standardize_data(data_arr=[X_train, X_val], scaler=StandardScaler())
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(125822, 42)
(53924, 42)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,Type_Cold,Type_Fog,Type_Hail,Type_Precipitation,Type_Rain,Type_Snow,...,day_of_year_5,day_of_year_6,day_of_year_7,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
10969,-0.62799,-0.778221,0.106772,-0.994284,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,0


## Using the modeling script to run a test, automatically saving the scores and model to local drive

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline; the intercept is fitted explicitly.
model = LinearRegression(fit_intercept=True)
# Free-text note passed to run_test alongside the model.
notes = "This is an example"
# Missing values are replaced with 0 in the features AND in the targets
# before the run.
# NOTE(review): filling missing arr_delay targets with 0 treats those rows as
# having zero delay, which can bias both the fit and the reported metrics --
# confirm this is intended, or drop rows with a missing target instead.
modeling.run_test(X_train.fillna(0),X_val.fillna(0),y_train.fillna(0),y_val.fillna(0), model, notes)

{'r2_score': 0.02433307548401864, 'mean_squared_error': 2639.672323677474, 'mean_absolute_error': 24.140363547166913, 'explained_variance_score': 0.024381386419689566}


LinearRegression()

## Retrieving the records of past modeling tests

In [9]:
# Display the stored records of earlier modeling runs (model name, notes,
# training time and metric columns in the recorded output).
modeling.get_records()

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score
1.pickle,LinearRegression,This is an example,0.043895,-2e-06,2523.296611,24.625006,7e-06
2.pickle,LinearRegression,This is an example,0.167801,0.016548,2096.992181,22.510753,0.016632
3.pickle,LinearRegression,This is an example,0.260602,0.017014,2721.693239,24.14721,0.017021
4.pickle,LinearRegression,This is an example,0.291312,0.025778,2763.042701,24.006897,0.025856


## Retrieving saved model

In [13]:
# Load a previously saved model via the modeling helper. The argument 1
# presumably selects the record listed as "1.pickle" -- confirm in
# custom_scripts.modeling.
# NOTE(review): this cell's recorded execution count is out of sequence with
# the cells above; re-run with Restart Kernel -> Run All before sharing.
saved_model = modeling.get_pickle(1)