In [1]:
import sys

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

sys.path.append("..")  # <-  This should point to the root directory of the project relative to this file

from custom_scripts import database
from custom_scripts import preprocessing
from custom_scripts import preparation
from custom_scripts import modeling

## Querying the database

In [2]:
# Ad-hoc SQL is sent through the project's database.query helper. The result is
# displayed with .head(), so it is presumably a pandas DataFrame — TODO confirm.
example_query =  """
                SELECT * FROM flights
                    LIMIT 10;
                """

example = database.query(example_query)
# Display a single row only; the recorded output for this table is wide and elided ("...").
example.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-08-10,UA,UA_CODESHARE,UA,5814,OO,N127SY,5814,13930,ORD,...,1437.0,,,,,,,,,


## Fetching predetermined sets of data

In [3]:
# Fetch the predefined training set from the project's preprocessing module.
# The recorded output includes an arr_delay column, which a later cell uses as the target.
train_flights = preprocessing.get_train_flights()
print(train_flights.shape)
train_flights.head(1)

(142951, 10)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay
2,2019-01-01,OO,3368,SJC,AUS,1805,2337,212.0,1476.0,9.0


In [4]:
# Fetch the predefined test set from the project's preprocessing module.
# Its recorded output has one column fewer than the training set: there is no arr_delay.
test_flights = preprocessing.get_test_flights()
print(test_flights.shape)
test_flights.head(1)

(150623, 9)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,2020-01-01,WN,5888,ONT,SFO,1810,1945,95,363


## Adding features to table with preparation script

In [5]:
# Add the engineered features from the preparation module. In the recorded output
# these include fl_num_avg_* averages, sin/cos encodings of the arrival and departure
# times, and day_of_year / day_of_week.
# NOTE(review): the result is assigned back to train_flights, so re-running this cell
# feeds the already-augmented frame into build_all_features again — confirm the helper
# tolerates that, or re-run from the fetch cell first.
train_flights = preparation.build_all_features(train_flights)
train_flights.head(1)

Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,...,fl_num_avg_security_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_total_add_gtime,fl_num_avg_longest_add_gtime,arr_time_sin,dep_time_sin,arr_time_cos,dep_time_cos,day_of_year,day_of_week
0,2019-01-01,OO,3368,SJC,AUS,1805,2337,212.0,1476.0,9.0,...,0.0,27.428571,49.35,48.6,-0.095912,-0.999685,0.99539,0.025104,1,1


## Splitting the data with size and random seed held in script

In [6]:
# Separate the target (arr_delay) from the feature columns.
# `columns=` is used instead of the positional axis argument (`drop('arr_delay', 1)`):
# passing axis positionally was deprecated in pandas 1.3 and raises TypeError in pandas 2.0.
X = train_flights.drop(columns='arr_delay')
y = train_flights['arr_delay']

# The split size and random seed are fixed inside the preparation script, so the
# partition is expected to be the same on every run.
X_train, X_val, y_train, y_val = preparation.get_train_test_split(X, y)
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(100065, 31)
(42886, 31)


Unnamed: 0,fl_date,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,fl_num_avg_dep_delay,...,fl_num_avg_security_delay,fl_num_avg_late_aircraft_delay,fl_num_avg_total_add_gtime,fl_num_avg_longest_add_gtime,arr_time_sin,dep_time_sin,arr_time_cos,dep_time_cos,day_of_year,day_of_week
85279,2019-01-05,AS,49,ANC,ADQ,1512,1615,63.0,253.0,8.893752,...,0.057301,21.51756,35.235294,27.764706,-0.898176,-0.744992,-0.439635,-0.667073,5,5


## Standardizing the data with preparation script

In [7]:
# StandardScaler is imported in the notebook's first cell, together with the other
# imports, so the notebook's dependencies are visible in one place.
#
# Standardize the train and validation features with the project's helper.
# NOTE(review): the recorded output goes from 31 to 34 columns and shows
# day_of_year_* / day_of_week_* indicator columns, so the helper apparently also
# one-hot encodes and selects columns — confirm in preparation.standardize_data.
# NOTE(review): X_train / X_val are overwritten here, so re-running this cell passes
# already-transformed frames back into the helper; re-run the split cell first.
X_train, X_val = preparation.standardize_data(data_arr=[X_train, X_val], scaler=StandardScaler())
print(X_train.shape)
print(X_val.shape)
X_train.head(1)

(100065, 34)
(42886, 34)


Unnamed: 0,arr_time_sin,arr_time_cos,dep_time_sin,dep_time_cos,fl_num_avg_arr_delay,fl_num_avg_dep_delay,fl_num_avg_carrier_delay,fl_num_avg_weather_delay,fl_num_avg_nas_delay,fl_num_avg_security_delay,...,day_of_year_5,day_of_year_6,day_of_year_7,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
85279,-0.898176,-0.439635,-0.744992,-0.667073,-0.409822,-0.527798,0.031745,-0.402404,-1.711546,-0.191165,...,1,0,0,0,0,0,0,0,1,0


## Using the modeling script to run a test, automatically saving the scores and model to the local drive

In [10]:
# LinearRegression is imported in the notebook's first cell, together with the other imports.
model = LinearRegression(fit_intercept=True)
notes = "This is an example"

# Missing values are replaced with 0 in both features and targets before the run.
# NOTE(review): filling missing arr_delay targets with 0 treats those flights as
# exactly on time; confirm this is intended rather than dropping those rows.
# Per the recorded output, run_test prints a dict of scores and the cell displays
# the model it returns.
modeling.run_test(X_train.fillna(0),X_val.fillna(0),y_train.fillna(0),y_val.fillna(0), model, notes)

{'r2_score': 0.01654802494439611, 'mean_squared_error': 2096.992181188898, 'mean_absolute_error': 22.510753451114816, 'explained_variance_score': 0.016631645903316006}


LinearRegression()

## Retrieving the records of past modeling tests

In [12]:
# Show the stored results of past modeling runs; the recorded output has one row per saved pickle.
# NOTE(review): the scores in the recorded row differ from the run_test output shown
# earlier, and the execution counts skip numbers — the saved outputs appear to come
# from different runs. Restart the kernel and run all cells to refresh them.
modeling.get_records()

Unnamed: 0,model,notes,training_time,r2_score,mean_squared_error,mean_absolute_error,explained_variance_score
1.pickle,LinearRegression,This is an example,0.043895,-2e-06,2523.296611,24.625006,7e-06


## Retrieving saved model

In [13]:
# Reload a previously saved model by its record number (listed as "1.pickle" in the records table).
# NOTE(review): presumably this unpickles a file from the local drive — only load
# pickles that come from a trusted source.
saved_model = modeling.get_pickle(1)