# Data Preparation

**Load Station Metadata**

In [1]:
import pickle
import pandas as pd

In [7]:
with open('../data/loaded/meta.dat', 'rb') as f:
    meta = pickle.load(f)
meta

Unnamed: 0,ID,Fwy,Dir,Latitude,Longitude,Length,Type,Lanes,Abs_PM
0,715898,5,S,33.880183,-118.021787,0.430,ML,3,117.280
1,715900,5,S,33.882892,-118.026822,,OR,1,117.630
2,715901,5,N,33.883400,-118.027451,,OR,1,117.743
3,715903,5,N,33.886992,-118.034125,,OR,1,118.193
4,715904,5,S,33.892489,-118.044573,,OR,1,118.840
...,...,...,...,...,...,...,...,...,...
4878,777066,405,S,34.161152,-118.469631,0.455,ML,3,63.272
4879,777067,405,N,34.161136,-118.469426,0.790,HV,1,63.272
4880,777068,405,S,34.161152,-118.469631,0.790,HV,1,63.272
4881,777168,10,E,34.068993,-118.149756,0.450,HV,1,20.897


**Downsample Stations Based on RDP**

In [3]:
from simplification.cutil import simplify_coords

ds_coords = pd.DataFrame(simplify_coords(meta[["Latitude", 'Longitude']].values, 0.05), columns=['Latitude', 'Longitude'])
ds_coords

Unnamed: 0,Latitude,Longitude
0,33.880183,-118.021787
1,34.111314,-118.264526
2,34.120764,-117.888430
3,34.118291,-118.270166
4,34.358847,-118.554053
...,...,...
687,34.055122,-118.212293
688,33.873704,-118.219282
689,34.024458,-118.239564
690,34.161152,-118.469631


**Compute Pairwise Distances Based on Haversine Distance**

In [26]:
from haversine import haversine
from tqdm.notebook import tqdm
import time
import numpy as np

In [32]:
#initializing progress bar objects
ds_stations = meta.merge(ds_coords, on=['Latitude', 'Longitude'])[['ID', 'Latitude', 'Longitude']].drop_duplicates()
outer_loop=tqdm(ds_stations)
inner_loop=tqdm(ds_stations)
id1 = []
id2 = []
distances = []
for ind1, row1 in ds_stations.iterrows():
    inner_loop.refresh()  #force print final state
    inner_loop.reset()  #reuse bar
    outer_loop.update() #update outer tqdm
    for ind2, row2 in ds_stations.iterrows():
        id1.append(row1['ID'])
        id2.append(row2['ID'])
        dist = haversine((row1['Latitude'], row1['Longitude']), (row2['Latitude'], row2['Longitude']))
        distances.append(dist)
        inner_loop.update() #update inner tqdm


HBox(children=(FloatProgress(value=0.0, max=1428.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1428.0), HTML(value='')))

In [33]:
pairwise_dist = pd.DataFrame({'Station 1': id1, 'Station 2': id2, 'Distance': distances})
pairwise_dist['Station 1'] = pairwise_dist['Station 1'].astype(int)
pairwise_dist['Station 2'] = pairwise_dist['Station 2'].astype(int)
pairwise_dist

Unnamed: 0,Station 1,Station 2,Distance
0,715898,715898,0.000000
1,715898,715910,6.220256
2,715898,759518,6.220256
3,715898,715928,12.980418
4,715898,716895,12.980418
...,...,...,...
2039179,777169,776845,24.049210
2039180,777169,776981,9.641063
2039181,777169,777066,31.149929
2039182,777169,777068,31.149929


In [34]:
# ensure that stations equal to each other have a zero distance
pairwise_dist.loc[pairwise_dist['Station 1'] == pairwise_dist['Station 2'], 'Distance'].value_counts()

0.0    1428
Name: Distance, dtype: int64

**Construct Adjacency Matrix**

In [36]:
## mapper for station to adj_mat indices
station_ind_mapper = {station: ind for ind, station in enumerate(pairwise_dist['Station 1'].sort_values().unique())}
len(station_ind_mapper)

1428

In [37]:
import numpy as np
from tqdm import tqdm

In [38]:
adj_mat = np.zeros((pairwise_dist['Station 1'].nunique(), pairwise_dist['Station 1'].nunique()))
progress = tqdm(range(adj_mat.shape[0]**2*2))
for ind, row in pairwise_dist.iterrows():
    adj_mat[station_ind_mapper[row['Station 1']], station_ind_mapper[row['Station 2']]] = row['Distance']
    progress.update()
    
for ind, row in pairwise_dist.iterrows():
    adj_mat[station_ind_mapper[row['Station 2']], station_ind_mapper[row['Station 1']]] = row['Distance']
    progress.update()

100%|███████████████████████████████▉| 4077132/4078368 [04:04<00:00, 16478.95it/s]

**Load Station Speed Data**

In [51]:
from glob import glob
files = glob('../data/raw_pems/*.gz')
len(files)

365

In [52]:
data_dict = pd.read_csv('../data/raw_pems/station_dict.csv', names=['Feature', 'Description', 'Unit']).iloc[1:].reset_index(drop=True)

**Construct Pivot Tables for Each Day For All DS Stations**

In [53]:
import gzip
dfs = []
for file in tqdm(files):
    with gzip.open(file) as f:
        df = pd.read_csv(f, header=None, usecols=[0, 1, 11])
        df = df[df[1].isin(ds_stations['ID'])]
        pivoted_df = df.pivot_table(values=11, index=0, columns=1)
        pivoted_df.index.name = None
        pivoted_df.columns.name = None
        pivoted_df.index = pd.to_datetime(pivoted_df.index)
        dfs.append(pivoted_df)


  0%|                                                     | 0/365 [00:00<?, ?it/s][A
  0%|                                             | 1/365 [00:02<12:33,  2.07s/it][A
  1%|▏                                            | 2/365 [00:04<12:18,  2.04s/it][A
  1%|▎                                            | 3/365 [00:05<12:06,  2.01s/it][A
  1%|▍                                            | 4/365 [00:07<11:45,  1.95s/it][A
  1%|▌                                            | 5/365 [00:09<11:35,  1.93s/it][A
  2%|▋                                            | 6/365 [00:11<11:39,  1.95s/it][A
  2%|▊                                            | 7/365 [00:13<11:56,  2.00s/it][A
  2%|▉                                            | 8/365 [00:15<11:51,  1.99s/it][A
  2%|█                                            | 9/365 [00:17<12:10,  2.05s/it][A
  3%|█▏                                          | 10/365 [00:19<11:56,  2.02s/it][A
  3%|█▎                                          | 11

 52%|██████████████████████▍                    | 190/365 [06:08<05:27,  1.87s/it][A
 52%|██████████████████████▌                    | 191/365 [06:10<05:25,  1.87s/it][A
 53%|██████████████████████▌                    | 192/365 [06:11<05:23,  1.87s/it][A
 53%|██████████████████████▋                    | 193/365 [06:13<05:19,  1.86s/it][A
 53%|██████████████████████▊                    | 194/365 [06:15<05:13,  1.83s/it][A
 53%|██████████████████████▉                    | 195/365 [06:17<05:20,  1.89s/it][A
 54%|███████████████████████                    | 196/365 [06:19<05:16,  1.88s/it][A
 54%|███████████████████████▏                   | 197/365 [06:21<05:17,  1.89s/it][A
 54%|███████████████████████▎                   | 198/365 [06:23<05:20,  1.92s/it][A
 55%|███████████████████████▍                   | 199/365 [06:25<05:09,  1.87s/it][A
 55%|███████████████████████▌                   | 200/365 [06:27<05:11,  1.89s/it][A
 55%|███████████████████████▋                   | 201/

**Merge Station Speed Pivot Table for All Days**

In [54]:
full_speed_df = pd.concat(dfs).sort_index()
full_speed_df

Unnamed: 0,715898,716016,716023,716026,716028,716067,716069,716078,716081,716142,...,776773,776825,776826,776843,776845,776945,776981,777066,777068,777169
2021-01-01 00:00:00,70.2,65.1,68.1,65.0,68.9,67.4,68.9,71.0,68.1,69.2,...,69.3,69.1,65.7,65.3,65.0,65.0,69.8,68.6,69.1,66.0
2021-01-01 00:05:00,70.0,64.9,69.6,65.0,55.5,67.7,70.8,70.0,67.3,70.4,...,69.1,68.9,65.8,66.6,64.9,65.0,69.6,68.4,68.9,65.8
2021-01-01 00:10:00,69.9,65.2,71.2,64.9,51.0,68.3,70.9,70.3,68.3,67.8,...,69.2,69.0,65.8,66.3,65.0,65.1,69.7,68.9,69.0,65.9
2021-01-01 00:15:00,69.8,65.2,68.9,64.8,46.8,67.1,69.0,70.0,68.7,69.5,...,68.8,68.7,65.9,67.1,65.0,65.1,69.4,68.8,68.7,65.6
2021-01-01 00:20:00,69.8,65.1,67.4,64.8,63.6,66.8,68.7,70.5,68.3,67.2,...,68.8,68.6,65.6,67.8,65.0,64.9,69.3,68.9,68.6,65.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31 23:35:00,69.8,65.2,65.2,65.1,65.1,58.7,59.3,66.4,69.7,70.5,...,68.7,68.5,68.6,64.6,68.5,68.5,69.2,68.3,68.5,65.4
2021-12-31 23:40:00,69.9,65.2,65.2,65.1,65.1,72.7,46.5,67.0,70.1,69.3,...,69.0,68.8,68.9,65.7,68.8,68.8,69.5,69.3,68.8,65.7
2021-12-31 23:45:00,70.0,65.0,65.0,65.1,65.2,70.4,45.9,67.6,69.7,69.8,...,69.0,68.8,68.9,66.5,68.8,68.8,69.5,68.8,68.8,65.7
2021-12-31 23:50:00,70.0,65.2,65.2,65.1,65.1,73.4,50.6,67.3,70.1,67.6,...,69.1,68.9,69.0,65.9,68.9,68.9,69.6,68.4,68.9,65.8


**Remove Stations from Adj. Matrix that Don't Appear in Full Speed Data**

In [55]:
station_ind_mapper_subs = {station: station_ind_mapper[station] for station in station_ind_mapper if station in full_speed_df.columns}
ind_to_delete = [i for i in range(adj_mat.shape[0]) if i not in station_ind_mapper_subs.values()]
len(ind_to_delete)

668

In [56]:
adj_mat_subs = np.delete(np.delete(adj_mat, ind_to_delete, 0), ind_to_delete, 1)
adj_mat_subs.shape

(760, 760)

In [57]:
station_ind_mapper_subs = {station: i for i, station in enumerate(station_ind_mapper_subs)}

**Verify Removal Was Successful**

In [58]:
station_test, ind_test = list(station_ind_mapper_subs.items())[3] 
all(np.delete(adj_mat[station_ind_mapper[station_test], ], ind_to_delete, 0) == adj_mat_subs[ind_test, ])

True

**Ensure Speed Data Has a Consistent Frequency**

In [61]:
full_speed_df.index.value_counts()

2021-09-15 01:55:00    2
2021-02-11 00:00:00    2
2021-09-15 01:35:00    2
2021-02-11 02:55:00    2
2021-09-15 04:10:00    2
                      ..
2021-02-23 05:30:00    1
2021-05-06 17:10:00    1
2021-12-23 19:35:00    1
2021-01-29 04:25:00    1
2021-11-12 18:30:00    1
Length: 105083, dtype: int64

We see that there is data duplicated for some dates. To fix this, we will group by date and compute the mean.

In [62]:
full_speed_df = full_speed_df.groupby(full_speed_df.index).mean()
full_speed_df.index.value_counts()

2021-09-19 09:45:00    1
2021-01-18 10:40:00    1
2021-02-28 13:50:00    1
2021-10-31 07:40:00    1
2021-08-24 07:30:00    1
                      ..
2021-11-26 02:55:00    1
2021-01-19 14:40:00    1
2021-01-19 01:25:00    1
2021-08-25 01:40:00    1
2021-11-12 18:30:00    1
Length: 105083, dtype: int64

Now, there are no duplicate dates. However, we can see that some 5-min intervals are missing in the year.

In [63]:
set(pd.date_range('2021-01-01', '2022-01-01', freq='5T')) - set(full_speed_df.index)

{Timestamp('2021-03-14 02:00:00', freq='5T'),
 Timestamp('2021-03-14 02:05:00', freq='5T'),
 Timestamp('2021-03-14 02:10:00', freq='5T'),
 Timestamp('2021-03-14 02:15:00', freq='5T'),
 Timestamp('2021-03-14 02:20:00', freq='5T'),
 Timestamp('2021-03-14 02:25:00', freq='5T'),
 Timestamp('2021-03-14 02:30:00', freq='5T'),
 Timestamp('2021-03-14 02:35:00', freq='5T'),
 Timestamp('2021-03-14 02:40:00', freq='5T'),
 Timestamp('2021-03-14 02:45:00', freq='5T'),
 Timestamp('2021-03-14 02:50:00', freq='5T'),
 Timestamp('2021-03-14 02:55:00', freq='5T'),
 Timestamp('2021-10-18 05:05:00', freq='5T'),
 Timestamp('2021-12-19 22:00:00', freq='5T'),
 Timestamp('2021-12-19 22:05:00', freq='5T'),
 Timestamp('2021-12-19 22:10:00', freq='5T'),
 Timestamp('2021-12-19 22:15:00', freq='5T'),
 Timestamp('2021-12-19 22:20:00', freq='5T'),
 Timestamp('2021-12-19 22:25:00', freq='5T'),
 Timestamp('2021-12-19 22:30:00', freq='5T'),
 Timestamp('2021-12-19 22:35:00', freq='5T'),
 Timestamp('2021-12-19 22:40:00', 

In [64]:
full_speed_df = full_speed_df.reindex(pd.date_range('2021-01-01', '2022-01-01', freq='5T'))

**Imputation**

In [65]:
full_speed_df = full_speed_df.interpolate(method='linear', limit_direction='forward', axis=0)

In [66]:
# ## save adj_mat 
# with open('../data/processed/rdp_ds/adj_mat.dat', 'wb')  as f:
#     pickle.dump(adj_mat_subs, f)

# ## save mapper for adj_mat index to station ID
# with open('../data/processed/rdp_ds/adj_mat_ind_station_mapper.dat', 'wb') as f:
#     pickle.dump({station_ind_mapper_subs[station]: station for station in station_ind_mapper_subs}, f)

# ## save full speed df concatenated for all days in 2021
# with open('../data/processed/rdp_ds/speeds.dat', 'wb')  as f:
#     pickle.dump(full_speed_df, f)