# 4. Experimentation with FLAML

Additionally, [FLAML](https://github.com/microsoft/FLAML), a lightweight tool for automatically finding model parameters, was also explored. However, in practice the parameters it returned were somewhat unreliable, given the forecast period and the corresponding RMSE results. Some code blocks are commented out to keep the notebook readable. More exploration was done across other tests, but manually tweaked parameters still performed better than the parameters provided by FLAML. Although FLAML may work well for other tasks, the time restrictions of this exploration made it difficult to explore further.

In [1]:
!pip install gdown numpy pandas sklearn matplotlib lightgbm reverse_geocoder folium selenium flaml pystan==2.19.1.1

Collecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 5.0 MB/s 
Collecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 43.8 MB/s 
[?25hCollecting flaml
  Downloading FLAML-0.6.3-py3-none-any.whl (155 kB)
[K     |████████████████████████████████| 155 kB 58.5 MB/s 
Collecting catboost>=0.23
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 25 kB/s 
Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.3 MB/s 
[?25hCollecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 18.0 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Building wh

In [2]:
# Prophet is installed in its own step, after the pinned pystan from the previous install cell.
!pip install prophet

Collecting prophet
  Downloading prophet-1.0.1.tar.gz (65 kB)
[K     |████████████████████████████████| 65 kB 2.2 MB/s 
Collecting cmdstanpy==0.9.68
  Downloading cmdstanpy-0.9.68-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 4.6 MB/s 
Collecting ujson
  Downloading ujson-4.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (178 kB)
[K     |████████████████████████████████| 178 kB 10.8 MB/s 
Building wheels for collected packages: prophet
  Building wheel for prophet (setup.py) ... [?25l[?25hdone
  Created wheel for prophet: filename=prophet-1.0.1-py3-none-any.whl size=6640907 sha256=3c2bc8258f4de60e7581b06d7235896bcd80a6108142637d220566a1de27213d
  Stored in directory: /root/.cache/pip/wheels/4e/a0/1a/02c9ec9e3e9de6bdbb3d769d11992a6926889d71567d6b9b67
Successfully built prophet
Installing collected packages: ujson, cmdstanpy, prophet
  Attempting uninstall: cmdstanpy
    Found existing installation: cmdstanpy 0.9.5
    Uninstalling cmdstanpy-0.9.

In [3]:
import datetime as dt
import array

import gdown
import reverse_geocoder
from tqdm.notebook import tqdm
import os
import pandas as pd
import numpy as np
from IPython.display import display
import ipywidgets as widgets
import pickle
import math

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Local file name used for each dataset, keyed by dataset name.
filenames = {
    "TRAFFIC_DATA" : "dot_traffic_2015.txt.gz",
    "TRAFFIC_STATIONS" : "dot_traffic_stations_2015.txt.gz",
    "FIPS_CODE" : "fips_code.csv"
}

# NOTE(review): this flag is not read anywhere in the visible notebook;
# presumably it was meant to switch the Google Drive download on/off — confirm.
IS_FROM_GDRIVE = True

# Google Drive share links, using the same keys as `filenames`.
# gdrive_download() expects the file id to be the path segment after "/d/".
urls = {
    "TRAFFIC_DATA" : "https://drive.google.com/file/d/18I43wccnq-e0bo238oarO4a-8C6xHGvT/view?usp=sharing",
    "TRAFFIC_STATIONS" : "https://drive.google.com/file/d/1xGOS1qL-K7mmqgTGEoQPRI2j7IpSRYX_/view?usp=sharing",
    "FIPS_CODE" : "https://drive.google.com/file/d/1se0opJXSO90W8nesCVCDqJ7k6i9f_WEu/view?usp=sharing"
}

In [6]:
def create_folder(DIR):
    """
        Makes sure the directory DIR exists, creating it (including any
        missing parent folders) when needed, and prints which case applied.
    """
    if os.path.isdir(DIR):
        print(f"Folder '{DIR}' exists.")
        return

    os.makedirs(DIR)
    print(f"Created folder '{DIR}' .")

def gdrive_download(url, filename):
    """
        Downloads a Google Drive file to `filename` with gdown.

        `url` is a share link of the form
        https://drive.google.com/file/d/<id>/view?...; the file id is taken
        from the sixth '/'-separated segment of the link.
    """
    file_id = url.split("/")[5]
    gdown.download(f"https://drive.google.com/uc?id={file_id}", filename, quiet=False)

def load_txtgz(DATA_LOCATION, FILE):
    """
        Reads the comma-separated file FILE located in DATA_LOCATION into a
        DataFrame, using the first row as header and '"' as quote character.
    """
    source_path = os.path.join(DATA_LOCATION, FILE)
    return pd.read_csv(source_path, header=0, sep=',', quotechar='"')

def load_traffic_datasets(DATA_LOCATION, TRAFFIC_DATA_FILE,
                          TRAFFIC_STATIONS_FILE):
    """
        Reads the traffic-volume file and the traffic-stations file from
        DATA_LOCATION, printing progress along the way.

        Returns the pair (traffic_data, traffic_stations) as DataFrames.
    """
    print(f"Loading traffic data from '{TRAFFIC_DATA_FILE}' ...")
    volumes = load_txtgz(DATA_LOCATION, TRAFFIC_DATA_FILE)

    print(f"Loading traffic stations from '{TRAFFIC_STATIONS_FILE}' ...")
    stations = load_txtgz(DATA_LOCATION, TRAFFIC_STATIONS_FILE)

    print("Finished loading data.")
    return volumes, stations

def create_fips_ref(fips_df):
    """
        Builds a lookup dict from FIPS code to lowercase state name out of
        the "fips_code" and "state_name" columns of fips_df, so later code
        can resolve a state by key instead of querying the DataFrame.
        fips_df itself is left unchanged.
    """
    return {
        code: state.lower()
        for code, state in zip(fips_df["fips_code"], fips_df["state_name"])
    }

In [7]:
# Columns present in both tables; used as the merge keys between them.
common_cols = [
    "direction_of_travel",
    "fips_state_code",
    "functional_classification",
    "lane_of_travel",
    "station_id",
]
# Date-related columns of the traffic-volume table.
temporal_cols = [
    "date",
    "day_of_data",
    "day_of_week",
    "month_of_data",
    "year_of_data",
]
# Location columns taken from the stations table.
spatial_cols = ["fips_county_code", "latitude", "longitude"]
# Short names "0".."23" given to the hourly 'traffic_volume_counted' columns.
new_traffic_vol_cols = list(map(str, range(24)))
subdf_cols = [*temporal_cols, *new_traffic_vol_cols, *common_cols]
historical_vol_cols = ["date", *new_traffic_vol_cols]


def get_new_vol_cols(traffic_data):
    """
        Renames, in place, every column whose name contains
        'traffic_volume_counted' to the short names in new_traffic_vol_cols,
        pairing them in column order, and returns the same DataFrame.

        NOTE(review): presumably the source columns are already ordered by
        hour of day — confirm against the raw file header.
    """
    hourly_cols = [col for col in traffic_data.columns if 'traffic_volume_counted' in col]
    rename_map = dict(zip(hourly_cols, new_traffic_vol_cols))
    traffic_data.rename(columns=rename_map, inplace=True)

    return traffic_data


def modify_temporal_cols(traffic_data):
    """
        Prepares the temporal columns of the traffic-volume table:

        * parses "date" ('%Y-%m-%d') into datetimes,
        * renames the hourly volume columns via get_new_vol_cols,
        * remaps "day_of_week" so that 1 becomes 6 and 2..7 become 0..5
          (NOTE(review): presumably 1 = Sunday in the source data, which
          would make Monday = 0 — confirm).

        The remapping is not idempotent: calling this twice on the same
        DataFrame shifts "day_of_week" again.
    """
    traffic_data["date"] = pd.to_datetime(traffic_data["date"], format='%Y-%m-%d')
    traffic_data = get_new_vol_cols(traffic_data)

    weekday = traffic_data["day_of_week"]
    # Move 1 to the end of the week (8), then shift everything down by two
    traffic_data["day_of_week"] = weekday.where(weekday != 1, 8) - 2

    return traffic_data


def get_filtered_df(traffic_data, traffic_stations, fips_state_code, save_dir, overwrite=False):
    """
        Builds (or loads from cache) the merged traffic/station table of one
        state.

        traffic_data     : traffic-volume DataFrame with renamed hourly columns
        traffic_stations : stations DataFrame
        fips_state_code  : state to keep; also used as the cache file name
        save_dir         : folder holding the cache file '<fips_state_code>.pkl'
        overwrite        : when True, recompute even if the cache file exists

        Returns the state's traffic rows joined with the stations' spatial
        columns on common_cols, plus a "day_vol" column holding the sum of
        the 24 hourly volumes. The result is pickled to the cache file.
    """
    # The cache lives in the folder the caller passed in
    file_path = f"{os.path.join(save_dir, str(fips_state_code))}.pkl"

    if not overwrite and os.path.isfile(file_path):
        print(f"File already exists. Retrieving DataFrame from '{file_path}'.")
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    subdf_cols = temporal_cols + new_traffic_vol_cols + common_cols
    state_traffic = traffic_data.loc[
        traffic_data["fips_state_code"] == fips_state_code, subdf_cols]
    state_stations = traffic_stations.loc[
        traffic_stations["fips_state_code"] == fips_state_code, common_cols + spatial_cols]

    df = pd.merge(state_traffic, state_stations, on=common_cols)
    df["day_vol"] = df[new_traffic_vol_cols].sum(axis=1).values

    print(f"Saving DataFrame to '{file_path}'.")
    with open(file_path, 'wb') as f:
        pickle.dump(df, f)

    return df

In [25]:
def save_df_feather(df, dir, filename, verbose=True):
    """
        Writes df in feather format to '<dir>/<filename>.fea' and, when
        verbose is set, prints the path that was written.
    """
    destination = os.path.join(dir, f"{filename}.fea")
    df.to_feather(destination)
    if not verbose:
        return
    print(f"Saved file {destination}")


def read_df_feather(dir, filename):
    """
        Loads and returns the DataFrame stored in feather format at
        '<dir>/<filename>.fea' (multi-threaded read enabled).
    """
    return pd.read_feather(os.path.join(dir, f"{filename}.fea"), use_threads=True)

In [26]:
def get_transformed_vol_df(subdf, station_id, dir=None, verbose=True, overwrite=False):
    """
        Builds the hourly traffic-volume series of one station.

        All rows of `subdf` belonging to `station_id` are summed per date
        (hourly columns listed in new_traffic_vol_cols), then reshaped into
        one row per hour with the columns "date" (date + hour offset) and
        "traffic_volume" (float), sorted by timestamp.

        subdf      : DataFrame with "station_id", "date" (datetime) and the
                     hourly volume columns
        station_id : station to extract
        dir        : optional cache folder; when given,
                     '<dir>/<station_id>.fea' is read if it exists and is
                     written after computing. With None nothing is cached.
        verbose    : print progress messages
        overwrite  : recompute even if a cached file exists

        Returns the processed DataFrame.
    """
    col_timestamp = "date"
    col_trafficvol = "traffic_volume"

    # The cache is optional: only touch the file system when a folder is given
    if dir is not None:
        file_path = f"{os.path.join(dir, str(station_id))}.fea"
        if not overwrite and os.path.isfile(file_path):
            if verbose:
                print(f"File already exists. Retrieving DataFrame from '{file_path}'.")
            return read_df_feather(dir, filename=station_id)

    # Work on the frame passed in as `subdf`, restricted to this station
    station_df = subdf.loc[subdf["station_id"] == station_id, historical_vol_cols]

    if verbose:
        print(f"Calculating total hourly volume collected per timestamp from station {station_id}.")
    # One row per date with the hourly volumes summed over all matching rows;
    # groupby sorts the dates, so the result is already chronological
    daily_totals = station_df.groupby(col_timestamp)[new_traffic_vol_cols].sum()

    if verbose:
        print(f"Transforming DataFrame with hourly volume rows.")
    # Broadcast (days, 1) + (1, hours) to get every hourly timestamp, then
    # flatten row by row so timestamps and volumes stay aligned
    hour_offsets = np.arange(len(new_traffic_vol_cols)).astype("timedelta64[h]")
    day_starts = pd.DatetimeIndex(daily_totals.index).values
    timestamps = (day_starts[:, None] + hour_offsets[None, :]).ravel()
    all_volumes = daily_totals.to_numpy(dtype=float).ravel()

    processed_df = pd.DataFrame({col_timestamp: timestamps,
                                 col_trafficvol: all_volumes})
    processed_df = processed_df.sort_values(by=[col_timestamp]).reset_index(drop=True)

    if dir is not None:
        save_df_feather(df=processed_df,
                        dir=dir,
                        filename=station_id,
                        verbose=verbose)

    return processed_df


In [34]:
def get_dataset_splits(df, test_count=61, datetime_unit="D", ratio_split=False, temporal_split=True,
                       val_ratio=.15, test_ratio=.15):
    """
    Splits df into train / validation / test parts.

    temporal_split (checked first): the last `test_count` `datetime_unit`s
    of the "date" column form the test set, everything up to that limit the
    train set, and the validation set is None.
    ratio_split: rows are split by position into train / val / test using
    `val_ratio` and `test_ratio` (train gets the remainder).

    test_count : 61 days for November (30 days) and December (31 days)
    datetime_unit : "D" to indicate days
    val_ratio, test_ratio : fractions used by the ratio split

    Returns (train_df, val_df, test_df).
    Raises ValueError when neither split mode is enabled.
    """

    col_timestamp = "date"

    if temporal_split:
        # Only the temporal split needs the "date" column
        temporal_limit = df[col_timestamp].max() - np.timedelta64(test_count, datetime_unit)

        train_df = df[df[col_timestamp] <= temporal_limit]
        val_df = None
        test_df = df[df[col_timestamp] > temporal_limit]
    elif ratio_split:
        train_ratio = 1 - (test_ratio + val_ratio)
        train_range = int(df.shape[0]*train_ratio)
        val_range = int(df.shape[0]*val_ratio)

        train_df = df.iloc[0:train_range]
        val_df = df.iloc[train_range:train_range+val_range]
        test_df = df.iloc[train_range+val_range:]
    else:
        raise ValueError("Either temporal_split or ratio_split must be True.")

    return train_df, val_df, test_df

def get_sliding_windows(array, max_time, sub_window_size, stride_size):
    """
        Cuts `array` into overlapping windows along its first axis.

        Windows start at 0, stride_size, 2*stride_size, ... up to max_time
        and each covers sub_window_size consecutive steps. Inputs (X) are all
        windows but the last; targets (y) are, for each following window, its
        last stride_size steps of column 0.

        `array` must have at least two dimensions (steps, columns).
        Returns (X_values, y_values).
    """
    window_starts = np.arange(0, max_time + 1, stride_size).reshape(-1, 1)
    step_offsets = np.arange(sub_window_size).reshape(1, -1)
    # Broadcasting gives one row of step indices per window
    window_index = step_offsets + window_starts

    windows = array[window_index]

    # Assumes first column is for traffic volumes
    return windows[:-1, :], windows[1:, -stride_size:, 0]

In [21]:
# Folder the dataset files are read from. The download cell saves them under
# bare file names, i.e. into the current working directory.
DATA_LOCATION = os.getcwd()

In [12]:
# Fetch every dataset listed in `urls` and store it under its local file name.
for name, share_url in urls.items():
    gdrive_download(share_url, filenames[name])

Downloading...
From: https://drive.google.com/uc?id=18I43wccnq-e0bo238oarO4a-8C6xHGvT
To: /content/dot_traffic_2015.txt.gz
465MB [00:03, 118MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xGOS1qL-K7mmqgTGEoQPRI2j7IpSRYX_
To: /content/dot_traffic_stations_2015.txt.gz
2.26MB [00:00, 117MB/s]
Downloading...
From: https://drive.google.com/uc?id=1se0opJXSO90W8nesCVCDqJ7k6i9f_WEu
To: /content/fips_code.csv
100%|██████████| 918/918 [00:00<00:00, 976kB/s]


In [13]:
# Read the raw traffic volumes and the station metadata into DataFrames.
traffic_data, traffic_stations = load_traffic_datasets(
    DATA_LOCATION,
    TRAFFIC_DATA_FILE=filenames["TRAFFIC_DATA"],
    TRAFFIC_STATIONS_FILE=filenames["TRAFFIC_STATIONS"],
)

Loading traffic data from 'dot_traffic_2015.txt.gz' ...
Loading traffic stations from 'dot_traffic_stations_2015.txt.gz' ...
Finished loading data.


In [14]:
# modify_temporal_cols() already renames the hourly volume columns through
# get_new_vol_cols(), so a separate get_new_vol_cols() call is not needed.
# NOTE: run this cell only once per loaded dataset; a second run would shift
# "day_of_week" again.
traffic_data = modify_temporal_cols(traffic_data)

In [15]:
# Root folder for cached intermediate results, created inside the current
# working directory; later cells build the per-state folder beneath it.
processed_dir = os.path.join(os.getcwd(), "processed")
create_folder(processed_dir)

Created folder '/content/processed' .


In [16]:
# FIPS code of the state to analyse; also used to name the cache folder and
# the log file. NOTE(review): 6 is presumably California — confirm against
# fips_code.csv.
fips_state_code = 6

In [28]:
# verbose = False

# sub_df = get_filtered_df(traffic_data, 
#                         traffic_stations, 
#                         fips_state_code = fips_state_code,
#                         save_dir = processed_dir)
# model_input_dir = os.path.join(processed_dir, str(fips_state_code))
# create_folder(model_input_dir)

# for station_id in tqdm(sub_df["station_id"].unique()):

#     processed_df = get_transformed_vol_df(sub_df,
#                                           station_id,
#                                           dir=model_input_dir,
#                                           verbose=verbose)


Since the runtime in Colab is relatively slow for this task, we can instead upload a preprocessed tar file such as the one created in the cell below. It can be dragged and dropped onto the side panel where the other files of the Colab notebook are listed.

In [29]:
# Pack the preprocessed per-station feather files into 6.tar.gz
# (the path is hard-coded to the Colab /content layout and to state folder 6).
!tar cvzf 6.tar.gz /content/processed/6

tar: Removing leading `/' from member names
/content/processed/6/
/content/processed/6/075450.fea
/content/processed/6/072130.fea
/content/processed/6/011190.fea
/content/processed/6/011060.fea
/content/processed/6/049040.fea
/content/processed/6/049000.fea
/content/processed/6/018030.fea
/content/processed/6/022140.fea
/content/processed/6/034510.fea
/content/processed/6/035470.fea
/content/processed/6/070270.fea
/content/processed/6/086050.fea
/content/processed/6/075120.fea
/content/processed/6/118240.fea
/content/processed/6/086090.fea
/content/processed/6/035010.fea
/content/processed/6/051480.fea
/content/processed/6/119320.fea
/content/processed/6/126590.fea
/content/processed/6/066090.fea
/content/processed/6/035620.fea
/content/processed/6/116770.fea
/content/processed/6/032320.fea
/content/processed/6/036100.fea
/content/processed/6/034580.fea
/content/processed/6/088730.fea
/content/processed/6/035530.fea
/content/processed/6/011100.fea
/content/processed/6/011420.fea
/conte

In [30]:
# Get train and test splits for a single station's hourly series.

model_input_dir = os.path.join(processed_dir, str(fips_state_code))
create_folder(model_input_dir)

# The preprocessing loop is commented out, so `processed_df` and `station_id`
# are not defined on a fresh kernel. Load a cached station file explicitly so
# this cell survives Restart & Run All.
cached_station_ids = sorted(os.path.splitext(file_name)[0]
                            for file_name in os.listdir(model_input_dir)
                            if file_name.endswith(".fea"))
if not cached_station_ids:
    raise FileNotFoundError(
        f"No preprocessed '.fea' files found in '{model_input_dir}'. "
        "Run the preprocessing cell or extract the uploaded archive first.")

# NOTE(review): defaults to the first cached station; set this to the station
# the experiment is meant to use.
station_id = cached_station_ids[0]
processed_df = read_df_feather(model_input_dir, station_id)

train_df, val_df, test_df = get_dataset_splits(processed_df)

Folder '/content/processed/6' exists.


In [32]:
train_df

Unnamed: 0,date,traffic_volume
0,2015-01-01 00:00:00,4868.0
1,2015-01-01 01:00:00,3502.0
2,2015-01-01 02:00:00,2526.0
3,2015-01-01 03:00:00,1513.0
4,2015-01-01 04:00:00,1363.0
...,...,...
7267,2015-10-31 19:00:00,6790.0
7268,2015-10-31 20:00:00,6558.0
7269,2015-10-31 21:00:00,6660.0
7270,2015-10-31 22:00:00,5771.0


In [33]:
test_df

Unnamed: 0,date,traffic_volume
7272,2015-11-01 00:00:00,3022.0
7273,2015-11-01 01:00:00,4519.0
7274,2015-11-01 02:00:00,1300.0
7275,2015-11-01 03:00:00,1146.0
7276,2015-11-01 04:00:00,1536.0
...,...,...
8731,2015-12-31 19:00:00,8013.0
8732,2015-12-31 20:00:00,6970.0
8733,2015-12-31 21:00:00,6818.0
8734,2015-12-31 22:00:00,5461.0


In [38]:
# Fit a FLAML AutoML forecaster (Prophet only) on the training series and
# predict the timestamps of the test split.
# NOTE(review): numpy is already imported in the import cell at the top.
import numpy as np
from flaml import AutoML

# NOTE(review): absolute path, tied to the Colab /content layout.
LOG_DIR = "/content/logs"
create_folder(LOG_DIR)

# The timestamps are the only input; the hourly volume is the target.
X_train = train_df["date"].to_numpy()
y_train = train_df["traffic_volume"].to_numpy()
automl = AutoML()

# `station_id` (used in the log file name) must be defined by an earlier cell
# before this runs.
automl.fit(X_train=X_train,
           y_train=y_train,
           metric= 'rmse',
           period=24,  # time to forecast
           task='forecast', 
           time_budget=60*5,  # time budget in seconds
           log_file_name=f"{LOG_DIR}/forecast-{fips_state_code}-{station_id}.log",
           estimator_list=["fbprophet"]
          )

print("Predicting test_set")
# NOTE(review): y_pred is not compared with test_df["traffic_volume"] in this
# cell, so no test RMSE is reported here.
y_pred = automl.predict(test_df["date"].to_numpy())

[flaml.automl: 09-13 15:37:57] {1427} INFO - Evaluation method: cv
INFO:flaml.automl:Evaluation method: cv
[flaml.automl: 09-13 15:37:57] {1473} INFO - Minimizing error metric: rmse
INFO:flaml.automl:Minimizing error metric: rmse
[flaml.automl: 09-13 15:37:57] {1505} INFO - List of ML learners in AutoML Run: ['fbprophet']
INFO:flaml.automl:List of ML learners in AutoML Run: ['fbprophet']
[flaml.automl: 09-13 15:37:57] {1735} INFO - iteration 0, current learner fbprophet
INFO:flaml.automl:iteration 0, current learner fbprophet


Folder '/content/logs' exists.


INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
[flaml.automl: 09-13 15:38:15] {1920} INFO -  at 18.9s,	best fbprophet's error=1410.6114,	best fbprophet's error=1410.6114
INFO:flaml.automl: at 18.9s,	best fbprophet's error=1410.6114,	best fbprophet's error=1410.6114
[flaml.automl: 09-13 15:38:15] {1735} INFO - iteration 1, current learner fbprophet
INFO:flaml.automl:iteration 1, current learner fbprophet
INFO:prophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:prophet:Disabling yearl

Predicting test_set


The following code blocks contain commented-out exploration involving the AutoML functions.

In [39]:
# input_window_size = 24*7
# output_window_size = 24

# X_train, y_train = get_sliding_windows(train_df[traffic_volume].to_numpy(),
#                                        max_time, sub_window_size, stride_size)

# # Initialize an AutoML instance
# automl = AutoML()
# # Specify automl goal and constraint
# automl_settings = {
#     "time_budget": 60*60,  # in seconds
#     "metric": 'rmse',
#     "task": 'regression',
#     "log_file_name": f"regression-automl-{fips_state_code}-{station_id}.log",
# }

# # Train with labeled input data
# automl.fit(X_train=X_train, y_train=y_train,
#            **automl_settings)


In [40]:
# config = {"n_estimators": 46, "num_leaves": 6, "min_child_samples": 2, 
#           "learning_rate": 0.2149808616613788, "log_max_bin": 9, 
#           "colsample_bytree": 0.7151028186528372, 
#           "reg_alpha": 0.004577823970660193, 
#           "reg_lambda": 0.014498060191184265}, 

# model = XGBRegressor(**config)
# model.fit(X_train, y_train)

In [41]:
# from prophet import Prophet

# m = Prophet(changepoint_prior_scale=0.010000000000000002,
#             seasonality_prior_scale=1.0, 
#             holidays_prior_scale=1.0, 
#             seasonality_mode="multiplicative"           
#             )
# timestamp_col = "date"
# data_col = "traffic_volume"
# p_df = train_df.rename(columns={timestamp_col : "ds",
#                                data_col : "y"})
# m.fit(p_df)

In [42]:
# with open(f"prohet-{fips_state_code}-{station_id}", 'wb') as f:
#     pickle.dump(m, f)

# with open(f"prohet-{fips_state_code}-{station_id}", 'rb') as f:
#     m = pickle.load(f)

In [43]:
# p_test_df = test_df.rename(columns={timestamp_col : "ds"})
# forecast = m.predict(p_test_df)
# forecast