# Data Preparation

**Load Station Metadata**

In [1]:
import pickle
import pandas as pd

In [2]:
# Read the pickled station metadata back from disk (binary mode is required for pickle).
# NOTE(review): pickle.load can execute arbitrary code, so only load files you trust.
with open('../data/loaded/meta.dat', 'rb') as meta_file:
    meta = pickle.load(meta_file)
meta

Unnamed: 0,ID,Fwy,Dir,Latitude,Longitude,Length,Type,Lanes,Abs_PM
0,715898,5,S,33.880183,-118.021787,0.430,ML,3,117.280
1,715900,5,S,33.882892,-118.026822,,OR,1,117.630
2,715901,5,N,33.883400,-118.027451,,OR,1,117.743
3,715903,5,N,33.886992,-118.034125,,OR,1,118.193
4,715904,5,S,33.892489,-118.044573,,OR,1,118.840
...,...,...,...,...,...,...,...,...,...
4878,777066,405,S,34.161152,-118.469631,0.455,ML,3,63.272
4879,777067,405,N,34.161136,-118.469426,0.790,HV,1,63.272
4880,777068,405,S,34.161152,-118.469631,0.790,HV,1,63.272
4881,777168,10,E,34.068993,-118.149756,0.450,HV,1,20.897


**Downsample Stations Using the Ramer–Douglas–Peucker (RDP) Algorithm**

In [3]:
from simplification.cutil import simplify_coords

# Feed the (Latitude, Longitude) pairs of every station, in meta's row order, to
# simplify_coords with a tolerance of 0.05 and keep the points it returns.
# NOTE(review): the tolerance is presumably expressed in coordinate degrees — confirm.
coord_columns = ['Latitude', 'Longitude']
simplified_points = simplify_coords(meta[coord_columns].values, 0.05)
ds_coords = pd.DataFrame(simplified_points, columns=coord_columns)
ds_coords

Unnamed: 0,Latitude,Longitude
0,33.880183,-118.021787
1,34.111314,-118.264526
2,34.120764,-117.888430
3,34.118291,-118.270166
4,34.358847,-118.554053
...,...,...
687,34.055122,-118.212293
688,33.873704,-118.219282
689,34.024458,-118.239564
690,34.161152,-118.469631


**Compute Pairwise Distances Based on Haversine Distance**

In [4]:
from haversine import haversine
from tqdm.notebook import tqdm
import time

In [None]:
# Stations whose exact (Latitude, Longitude) pair survived the downsampling step.
# The inner merge keeps every station sharing a retained coordinate, so several
# station IDs may map to the same point.
ds_stations = meta.merge(ds_coords, on=['Latitude', 'Longitude'])[['ID', 'Latitude', 'Longitude']]

# Pull the columns out once as plain Python lists. Calling iterrows() inside the
# nested loop would build a new Series for every (row, row) combination, which
# dominates the runtime of this O(n^2) computation.
station_ids = ds_stations['ID'].tolist()
station_points = list(zip(ds_stations['Latitude'].tolist(), ds_stations['Longitude'].tolist()))
n_ds_stations = len(station_ids)

# Manually driven progress bars: one tick per outer station, and an inner bar
# that is reset and reused for every outer station.
outer_loop = tqdm(total=n_ds_stations)
inner_loop = tqdm(total=n_ds_stations)

id1 = []
id2 = []
distances = []
for pos1 in range(n_ds_stations):
    inner_loop.refresh()  # force print final state
    inner_loop.reset()    # reuse bar
    outer_loop.update()   # update outer tqdm
    point1 = station_points[pos1]
    for pos2 in range(n_ds_stations):
        # Skip pairing a row with itself. Both orderings (a, b) and (b, a) are
        # still emitted, so every station appears in the 'Station 1' role.
        if pos1 == pos2:
            continue
        id1.append(station_ids[pos1])
        id2.append(station_ids[pos2])
        distances.append(haversine(point1, station_points[pos2]))
        inner_loop.update()  # update inner tqdm

# Release the progress-bar widgets once the computation is finished.
inner_loop.close()
outer_loop.close()


In [48]:
# Long-format table with one row per ordered station pair; both station ID
# columns are cast to integers in a single astype call.
pairwise_dist = pd.DataFrame(
    {'Station 1': id1, 'Station 2': id2, 'Distance': distances}
).astype({'Station 1': int, 'Station 2': int})
pairwise_dist

Unnamed: 0,Station 1,Station 2,Distance
0,715898,715910,6.220256
1,715898,759518,6.220256
2,715898,715928,12.980418
3,715898,716895,12.980418
4,715898,715935,15.814186
...,...,...,...
2898501,777169,776843,24.049210
2898502,777169,776845,24.049210
2898503,777169,776981,9.641063
2898504,777169,777066,31.149929


**Construct Adjacency Matrix**

In [76]:
## station ID -> row/column position in adj_mat, assigned in ascending ID order
unique_stations = pairwise_dist['Station 1'].sort_values().unique()
station_ind_mapper = dict(zip(unique_stations, range(len(unique_stations))))
len(station_ind_mapper)

1428

In [17]:
import numpy as np
from tqdm import tqdm

In [87]:
# Square distance matrix with one row/column per station seen in 'Station 1'.
# NOTE(review): the matrix starts as all ones, so any cell that no pair writes
# to keeps the value 1 (this includes the diagonal unless pairwise_dist contains
# self-pairs) — confirm that 1 is the intended self-distance.
n_nodes = pairwise_dist['Station 1'].nunique()
adj_mat = np.ones((n_nodes, n_nodes))

# Translate station IDs to matrix positions for the whole table at once instead
# of walking millions of rows with iterrows().
row_lookup = pairwise_dist['Station 1'].map(station_ind_mapper)
col_lookup = pairwise_dist['Station 2'].map(station_ind_mapper)
if row_lookup.isna().any() or col_lookup.isna().any():
    # Mirror the KeyError a direct dict lookup would raise for an unmapped station.
    raise KeyError('pairwise_dist contains a station missing from station_ind_mapper')
row_idx = np.asarray(row_lookup, dtype=np.intp)
col_idx = np.asarray(col_lookup, dtype=np.intp)
pair_dist = pairwise_dist['Distance'].to_numpy()

# Two bulk writes (forward, then mirrored), so the bar total is exactly twice
# the number of pairs and the bar can actually reach 100%.
progress = tqdm(total=2 * len(pairwise_dist))

# Forward direction: (Station 1, Station 2) -> Distance.
adj_mat[row_idx, col_idx] = pair_dist
progress.update(len(pairwise_dist))

# Mirrored direction, applied after the forward pass so it takes precedence,
# matching the order of the two sequential passes this replaces.
adj_mat[col_idx, row_idx] = pair_dist
progress.update(len(pairwise_dist))
progress.close()

HBox(children=(FloatProgress(value=0.0, max=4078368.0), HTML(value='')))

**Load Station Speed Data**

In [8]:
from glob import glob
# Collect every gzip-compressed file found in the PeMS data folder.
pems_pattern = '../data/pems/*.gz'
files = glob(pems_pattern)
len(files)

365

In [99]:
# Read the field dictionary with explicit column names. Because names are given
# and no header row is declared, the file's first line is parsed as data; it is
# dropped here (presumably it is the file's own header line) and the index is
# renumbered from 0.
data_dict = (
    pd.read_csv('../data/pems/station_dict.csv', names=['Feature', 'Description', 'Unit'])
    .iloc[1:]
    .reset_index(drop=True)
)

**Construct Pivot Tables for Each Day for All Downsampled (DS) Stations**

In [None]:
import gzip
def _daily_speed_pivot(path):
    """Build the timestamp-by-station table for a single gzip-compressed CSV.

    Reads columns 0, 1 and 11 of the header-less file at ``path``, keeps only
    rows whose column 1 value is one of ``ds_stations['ID']``, and pivots so
    that column 0 becomes the (datetime) index, column 1 the columns, and
    column 11 the cell values (pivot_table's default aggregation applies when
    an index/column pair occurs more than once).
    """
    with gzip.open(path) as f:
        raw = pd.read_csv(f, header=None, usecols=[0, 1, 11])
    kept = raw[raw[1].isin(ds_stations['ID'])]
    pivot = kept.pivot_table(values=11, index=0, columns=1)
    # Drop the axis names inherited from the numeric source column labels.
    pivot.index.name = None
    pivot.columns.name = None
    pivot.index = pd.to_datetime(pivot.index)
    return pivot


# One pivot table per input file, in the order the files were listed.
dfs = []
for file in tqdm(files):
    dfs.append(_daily_speed_pivot(file))

**Merge Station Speed Pivot Table for All Days**

In [None]:
# Stack the per-file tables row-wise, then order the rows by their datetime index.
full_speed_df = pd.concat(dfs)
full_speed_df = full_speed_df.sort_index()
full_speed_df

**Remove Stations from Adj. Matrix that Don't Appear in Full Speed Data**

In [100]:
# Keep only the stations that have a column in the speed table. Values are still
# the ORIGINAL adj_mat positions at this point; iteration order follows
# station_ind_mapper, so the relative order of the surviving stations is unchanged.
station_ind_mapper_subs = {
    station: ind
    for station, ind in station_ind_mapper.items()
    if station in full_speed_df.columns
}

# Use a set for the membership test: `i not in dict.values()` rescans every
# value for each index, which is needlessly quadratic.
kept_inds = set(station_ind_mapper_subs.values())
ind_to_delete = [i for i in range(adj_mat.shape[0]) if i not in kept_inds]
len(ind_to_delete)

668

In [101]:
# Remove the unwanted stations from both axes: first their rows, then their columns.
rows_removed = np.delete(adj_mat, ind_to_delete, axis=0)
adj_mat_subs = np.delete(rows_removed, ind_to_delete, axis=1)
adj_mat_subs.shape

(760, 760)

In [102]:
# Renumber the surviving stations 0..n-1 in their existing iteration order, which
# replaces the original adj_mat positions held as values with consecutive ones.
station_ind_mapper_subs = dict(
    zip(station_ind_mapper_subs, range(len(station_ind_mapper_subs)))
)

**Verify Removal Was Successful**

In [103]:
# Spot-check a single station (the fourth entry of the reduced mapper): its row in
# the full matrix, with the deleted columns removed, must equal its row in the
# reduced matrix.
station_test, ind_test = list(station_ind_mapper_subs.items())[3]
expected_row = np.delete(adj_mat[station_ind_mapper[station_test]], ind_to_delete, 0)
all(expected_row == adj_mat_subs[ind_test])

True

**Imputation**

In [19]:
# Forward-fill missing readings down each station column. DataFrame.ffill() is the
# supported spelling; fillna(method='ffill') is deprecated in current pandas.
# Values missing before a station's first observation have nothing to copy from
# and therefore remain NaN.
full_speed_df = full_speed_df.ffill()

In [20]:
## persist the reduced adjacency matrix
with open('../data/processed/rdp_ds/adj_mat.dat', 'wb') as adj_file:
    pickle.dump(adj_mat_subs, adj_file)

## persist the inverse lookup: adj_mat index -> station ID
ind_station_mapper = {ind: station for station, ind in station_ind_mapper_subs.items()}
with open('../data/processed/rdp_ds/adj_mat_ind_station_mapper.dat', 'wb') as mapper_file:
    pickle.dump(ind_station_mapper, mapper_file)

## persist the speed table concatenated across all loaded days
with open('../data/processed/rdp_ds/speeds.dat', 'wb') as speeds_file:
    pickle.dump(full_speed_df, speeds_file)