In [None]:
# (Leftover scratch cell: the bare name `test` was undefined and raised NameError on Restart & Run All.)

<a id='top'></a>

# Table of Contents

1. [EDA for Station Metadata](#eda_station_meta)
2. [EDA for Speed Data](#eda_station_speed)
3. [EDA for Processed Data](#eda_processed_data)

<a id='eda_station_meta'></a>

# EDA for Station Metadata

**Load**

In [None]:
import pickle
import numpy as np

In [None]:
# Deserialize the station metadata from disk and display it.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('./data/loaded/meta.dat', 'rb') as meta_file:
    meta = pickle.load(meta_file)

meta

In [None]:
import plotly.express as px

In [None]:
# Number of stations per freeway. The labels are cast to int so the bars sort
# numerically, then back to str before plotting.
station_counts = meta['Fwy'].value_counts()
station_counts.index = station_counts.index.astype(int)
fwy_freq = station_counts.sort_index()
fwy_freq.index = fwy_freq.index.astype(str)

# plotly's update_* methods return the figure, so the styling can be chained.
fig = (
    px.bar(fwy_freq, title='Frequency of Stations Across Highways in LA District 7')
    .update_xaxes(title='Fwy #')
    .update_yaxes(title='Freq')
    .update_layout(showlegend=False)
)
fig

In [None]:
# Number of stations per travel direction.
dir_counts = meta['Dir'].value_counts()
fig = (
    px.bar(dir_counts, title='Frequency of Stations of Different Highway Directions in LA District 7')
    .update_xaxes(title='Direction')
    .update_yaxes(title='Freq')
    .update_layout(showlegend=False)
)
fig

In [None]:
# Number of stations per value of the 'Type' column.
type_counts = meta['Type'].value_counts()
fig = (
    px.bar(type_counts, title='Frequency of Stations of Different Highway Types in LA District 7')
    .update_xaxes(title='Type')
    .update_yaxes(title='Freq')
    .update_layout(showlegend=False)
)
fig

**Build Map Visualizing all Stations and General Statistics**

In [None]:
from folium import Map
import folium.plugins as plugins
import folium

In [None]:
# Map every station in `meta`: one marker per row, drawn with the icon for its
# direction and a popup table of the station's details.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
dir_mapper = {'S': 'south', 'N': 'north', 'E': 'east', 'W': 'west'}
m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes']

for ind, row in meta.iterrows():
    direction = dir_mapper[row['Dir']]
    icon = folium.features.CustomIcon(f"./data/icons/{direction}.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)
#m.save('./plots/station_map.html')

**Downsample Stations Using RDP**

In [None]:
from simplification.cutil import (
    simplify_coords,
    simplify_coords_idx,
    simplify_coords_vw,
    simplify_coords_vw_idx,
    simplify_coords_vwp,
)
import pandas as pd

In [None]:
# Run the station coordinates (in `meta` row order) through simplify_coords with
# an epsilon of 0.01 and tabulate the points that are kept.
station_coords = meta[["Latitude", 'Longitude']].values
kept_coords = simplify_coords(station_coords, 0.01)
ds_coords = pd.DataFrame(kept_coords, columns=['Latitude', 'Longitude'])
ds_coords

**Downsample Stations Using Rules**

All Northbound Stations

In [None]:
# Map only the northbound stations (Dir == 'N'), all drawn with the north icon.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes']

northbound_meta = meta[meta['Dir'] == 'N']
for ind, row in northbound_meta.iterrows():
    icon = folium.features.CustomIcon("./data/icons/north.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)

After Downsampling

In [None]:
# Rule-based downsampling of the northbound stations: walk them in
# (Latitude, Longitude) order and keep a station when it is more than 0.1 mi
# from the previous one along either axis.
# 1 deg lat = 69 mi
# 1 deg long = 54 mi
miles_per_deg_lat = 69
miles_per_deg_long = 54
min_gap_miles = 0.1

northbound_coords = meta.loc[meta['Dir'] == 'N', ['Latitude', 'Longitude']].sort_values(['Latitude', 'Longitude'])
# Compare magnitudes: the rows are ordered by latitude, so consecutive longitude
# differences can be negative, and a signed comparison against a positive
# threshold would ignore those steps no matter how large they are.
distance_between_stations = northbound_coords.diff().abs()
far_enough = (
    (distance_between_stations['Latitude'] * miles_per_deg_lat > min_gap_miles)
    | (distance_between_stations['Longitude'] * miles_per_deg_long > min_gap_miles)
)
# The first row has no predecessor (NaN difference) and is therefore never kept.
stations = distance_between_stations[far_enough].index.tolist()
#stations = set(stations).union(set([s - 1 for s in stations]))
len(stations)

In [None]:
# Map the stations whose `meta` index label is in `stations` (the downsampled
# selection), all drawn with the north icon.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes']

kept_meta = meta[meta.index.isin(stations)]
for ind, row in kept_meta.iterrows():
    icon = folium.features.CustomIcon("./data/icons/north.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)

<a id='eda_station_speed'></a>

# EDA for Speed Data

**Data Dictionary**

In [None]:
from glob import glob

# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# that positional access in later cells (files[0], files[i] for i in
# range(151, 181)) picks the same files on every machine.
# NOTE(review): this assumes the file names sort chronologically — confirm the naming scheme.
files = sorted(glob('./data/pems/*.gz'))
len(files)

In [None]:
# Column dictionary for the station files: read with explicit column names, then
# drop the first row (presumably the file's own header line, read here as data)
# and renumber the remaining rows from zero.
data_dict_raw = pd.read_csv('./data/pems/station_dict.csv', names=['Feature', 'Description', 'Unit'])
data_dict = data_dict_raw.iloc[1:].reset_index(drop=True)
data_dict

**Missing Values**

In [None]:
import gzip
from collections import Counter

# Tally missing values per file and per station for the three measurements.
#   missing_vals[file][attr]            -> number of null readings in that file
#   all_stations_missing[attr][station] -> number of null readings across all files
#   total_vals                          -> number of rows loaded across all files
measured_attrs = ["Total Flow", "Avg Occupancy", "Avg Speed"]
kept_columns = ['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']

all_stations_missing = {attr: Counter() for attr in measured_attrs}

missing_vals = {}
total_vals = 0

for path in files:
    print(f'Filename: {path}')
    print(f'Total Values Loaded: {total_vals}')
    with gzip.open(path) as f:
        df = pd.read_csv(f, index_col=False, names=data_dict['Feature'].values.tolist())[kept_columns]
        # df = pd.read_csv(f, index_col=False, usecols=range(17), header=None)
        # df = df.set_axis(data_dict['Feature'].values.tolist(), axis=1)[['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']]

    # Count each file's rows exactly once. This used to sit inside the
    # per-attribute loop, which tripled the total and understated every
    # proportion derived from it by a factor of three.
    total_vals += len(df)

    missing_vals[path] = {}
    for attr in measured_attrs:
        is_missing = df[attr].isnull()
        missing_vals[path][attr] = is_missing.sum()
        # Per-station count of null readings for this attribute in this file.
        stations_missing = Counter(dict(df['Station'][is_missing].value_counts()))
        all_stations_missing[attr].update(stations_missing)
       
    

In [None]:
# Add up the per-file missing-value counts and report them both as totals and
# as a share of all rows loaded.
total_missing_flow = sum(day["Total Flow"] for day in missing_vals.values())
total_missing_speed = sum(day["Avg Speed"] for day in missing_vals.values())
total_missing_occ = sum(day["Avg Occupancy"] for day in missing_vals.values())

print(f'Total Missing Flow Values: {total_missing_flow}')
print(f'Total Missing Speed Values: {total_missing_speed}')
print(f'Total Missing Occupancy Values: {total_missing_occ}')

if total_vals == 0:
    # Nothing was loaded (e.g. no input files were found); avoid dividing by zero.
    print('No values were loaded, so the proportions are undefined.')
else:
    print(f'Proportion of Missing Speed Values: {total_missing_speed / total_vals}')
    print(f'Proportion of Missing Flow Values: {total_missing_flow / total_vals}')
    print(f'Proportion of Missing Occupancy Values: {total_missing_occ / total_vals}')

In [None]:
import matplotlib.pyplot as plt

# Histogram of how many 'Avg Occupancy' readings each station is missing.
# Work on a copy so the shared tallies in `all_stations_missing` are left
# untouched and re-running the cell gives the same result.
missing_ref = all_stations_missing['Avg Occupancy'].copy()

# Stations with no missing readings never appear in the tally; give them an
# explicit zero. Only the distinct station ids are visited, not every row.
# NOTE(review): `df` is whatever frame was loaded last, so only the stations
# present in that frame are zero-filled — confirm that is intended.
for station in df['Station'].unique().tolist():
    missing_ref.setdefault(station, 0)

# Pass a concrete list: a dict view is not a sequence and is not handled
# uniformly across matplotlib versions.
n, bins, patches = plt.hist(list(missing_ref.values()), 15, facecolor='blue', alpha=0.5)
plt.xlabel('Number of Missing Values')
plt.ylabel('Station Frequency')
plt.title('Histogram of Missing Values Per Station')
plt.show()

**Top 5 Sensors w/ Highest Avg. Speed on 01/05/2021**

In [None]:
import gzip
# load one file just to explore
explore_columns = ['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']
all_columns = data_dict['Feature'].values.tolist()
with gzip.open(files[0]) as day_file:
    df = pd.read_csv(day_file, index_col=False, names=all_columns)[explore_columns]
    # df = pd.read_csv(f, index_col=False, usecols=range(17), header=None)
    # df = df.set_axis(data_dict['Feature'].values.tolist(), axis=1)[['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']]
df.head()

In [None]:
# Direction code -> base name of the matching icon file under ./data/icons/.
dir_mapper = dict(N='north', E='east', S='south', W='west')

In [None]:
# The five stations with the highest mean 'Avg Speed' in `df`, joined to their
# metadata and drawn on the map with the speed included in each popup.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
mean_speed_by_station = df.groupby('Station')['Avg Speed'].mean().dropna()
high_speed_sensors = mean_speed_by_station.sort_values().tail(5)
high_speed_sensors_meta = meta.merge(high_speed_sensors, left_on='ID', right_index=True)

m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes', 'Avg Speed']

for ind, row in high_speed_sensors_meta.iterrows():
    direction = dir_mapper[row['Dir']]
    icon = folium.features.CustomIcon(f"./data/icons/{direction}.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)

**Top 5 Sensors w/ Lowest Avg. Speed on 01/05/2021**

In [None]:
# The five stations with the lowest mean 'Avg Speed' in `df`, joined to their
# metadata and drawn on the map with the speed included in each popup.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
mean_speed_by_station = df.groupby('Station')['Avg Speed'].mean().dropna()
low_speed_sensors = mean_speed_by_station.sort_values().head(5)
low_speed_sensors_meta = meta.merge(low_speed_sensors, left_on='ID', right_index=True)

m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes', 'Avg Speed']

for ind, row in low_speed_sensors_meta.iterrows():
    direction = dir_mapper[row['Dir']]
    icon = folium.features.CustomIcon(f"./data/icons/{direction}.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)


**How Does Speed Change Over Time for a Particular Station on 01/05/2021?**

In [None]:
# Pick the station with the largest number of non-null speed readings and
# preview its rows.
speed_reading_counts = df.groupby('Station')['Avg Speed'].count()
station_most_speed_readings = speed_reading_counts.idxmax()
df[df['Station'] == station_most_speed_readings].head()

In [None]:
import plotly.express as px

# Speed over time for the station selected above.
selected_station_rows = df[df['Station'] == station_most_speed_readings]
px.line(
    selected_station_rows,
    x='Timestamp',
    y='Avg Speed',
    title=f'Avg Speed vs. Time for Station {station_most_speed_readings}',
)

In [None]:
# Speed over time again, this time for the hard-coded station 760375 and drawn
# with the pandas plotting API.
station_speed = df.loc[df['Station'] == 760375, ['Timestamp', 'Avg Speed']]
station_speed.set_index('Timestamp').plot(figsize=[20, 5], title='Average Speed', color='tab:blue')


**How Does Occupancy Change Over Time for a Particular Station?**

In [None]:
# Occupancy over time for the hard-coded station 760375.
station_occupancy = df.loc[df['Station'] == 760375, ['Timestamp', 'Avg Occupancy']]
station_occupancy.set_index('Timestamp').plot(figsize=[20, 5], title='Average Occupancy', color='tab:green')


**How Does Flow Change Over Time for a Particular Station?**

In [None]:
# Total flow over time for the hard-coded station 760375.
station_flow = df.loc[df['Station'] == 760375, ['Timestamp', 'Total Flow']]
station_flow.set_index('Timestamp').plot(figsize=[20, 5], title='Flow', color='tab:orange')


**How Does Speed Change with Postmile (PM)?**

In [None]:
def map_to_time_group(time):
    """Label a timestamp by commute period.

    Returns 'Morning Commute' when ``time.hour`` is 7, 8 or 9, 'Evening Commute'
    when it is 16, 17 or 18, and None for every other hour. Only the ``hour``
    attribute of ``time`` is read.
    """
    hour = time.hour
    if 7 <= hour <= 9:
        return 'Morning Commute'
    if 16 <= hour <= 18:
        return 'Evening Commute'
    return None
    
# Readings in `df` for the Fwy 5 northbound stations, each joined to its
# station's Abs_PM and labelled with the commute period of its timestamp.
is_fwy_5_north = (meta['Fwy'] == 5) & (meta['Dir'] == 'N')
hwy_5_nb_stations_pm = meta.loc[is_fwy_5_north, ['ID', 'Abs_PM']].drop_duplicates()

on_route = df['Station'].isin(hwy_5_nb_stations_pm['ID'])
route = (
    df[on_route]
    .reset_index(drop=True)
    .merge(hwy_5_nb_stations_pm, left_on='Station', right_on='ID')
)
reading_times = pd.to_datetime(route['Timestamp'])
route['Time Group'] = reading_times.map(map_to_time_group)

In [None]:
# Speed against Abs_PM for the readings labelled 'Evening Commute'.
# NOTE(review): the title's date (6/1/21) differs from the 01/05/2021 used in
# the headings above — confirm which day `df` actually holds.
evening_route = route[route['Time Group'] == 'Evening Commute']
evening_route.plot.scatter(
    x='Abs_PM',
    y='Avg Speed',
    color='tab:purple',
    label='Evening Commute',
    figsize=(20,5),
    title='Speed Along Fwy 5 Route (6/1/21 4pm-7pm)',
)

**Speed vs. Time of Day for Month of June**

In [None]:
from tqdm.notebook import tqdm

In [None]:
# load data for entire month of June
# NOTE(review): indices 151-180 are June only if `files` holds one file per day
# of the year in chronological order — confirm (glob() guarantees no order).

# `colnames`, `colnames_subset` and `northbound_stations` were used here without
# being defined in any visible cell, so this cell raised NameError on a fresh
# kernel. Define them next to their use; the first two follow the pattern the
# earlier loading cells use.
colnames = data_dict['Feature'].values.tolist()
colnames_subset = ['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']
# NOTE(review): assumed to be the Fwy 5 northbound stations selected earlier
# (the plot built from this data is titled 'Speed Along Fwy 5 ...') — confirm.
northbound_stations = hwy_5_nb_stations_pm['ID'].tolist()

data_list = []
for i in tqdm( range(151, 181) ):
    with gzip.open(files[i]) as f:
        day = pd.read_csv(f, index_col=False, names=colnames)[colnames_subset]
        # df = pd.read_csv(f, index_col=False, usecols=range(17), header=None)
        # df = df.set_axis(data_dict['Feature'].values.tolist(), axis=1)[['Timestamp', 'Station', 'Station Length', 'Samples', 'Total Flow', 'Avg Occupancy', 'Avg Speed', 'Lane N Samples', 'Lane N Flow', 'Lane N Avg Occ', 'Lane N Avg Speed', 'Lane N Observed']]

    # Vectorised membership test instead of a Python-level list built per row.
    sub = day[day['Station'].isin(northbound_stations)].reset_index(drop=True)
    # Parse timestamps only for the rows that are kept.
    sub['Timestamp'] = pd.to_datetime(sub['Timestamp'])
    sub = sub.merge(meta[['ID', 'Abs_PM']], left_on='Station', right_on='ID')
    data_list.append(sub)
df = pd.concat(data_list)

In [None]:
# compute average speed for different hours of the day along the route
# Extract the hour of every reading once, vectorised, instead of rebuilding a
# Python list over all rows for each hour of interest. A plain array is used as
# the mask so the frame's (repeating) index labels play no part in the selection.
hour_of_reading = pd.to_datetime(df['Timestamp']).dt.hour.to_numpy()

# Select 'Avg Speed' before averaging so only that column is aggregated.
d = {
    h: df[hour_of_reading == h].groupby('Abs_PM')['Avg Speed'].mean()
    for h in [9, 12, 15, 18, 21]
}

pd.DataFrame(d).plot(figsize=(20,5), title='Speed Along Fwy 5 During Different Times of Day')

**Correlations**

In [None]:
import numpy as np

In [None]:
# correlation between speed and occupancy, computed per station and then averaged
stations_to_check = [760375, 769926, 769402, 776657, 716237, 774946, 771690, 716312]
corrs = [
    df[df['Station'] == station][['Avg Speed', 'Avg Occupancy']].corr().loc['Avg Occupancy', 'Avg Speed']
    for station in stations_to_check
]

print(np.mean(corrs))

In [None]:
# correlation between rush hour and occupancy
# rush hour is a 0/1 indicator: 1 when the reading's hour is 7-9 or 16-18 (inclusive)
corrs = []
for s in [760375, 769926, 769402, 776657, 716237, 774946, 771690, 716312]:
    station_rows = df[df['Station'] == s]
    hour = pd.to_datetime(station_rows['Timestamp']).dt.hour
    in_rush_hour = hour.between(7, 9) | hour.between(16, 18)
    # Attach the indicator positionally, as a new frame, leaving `df` untouched.
    df_sub = station_rows.assign(**{'Rush Hour': in_rush_hour.astype(int).to_numpy()})
    corrs.append(df_sub[['Rush Hour', 'Avg Occupancy']].corr().loc['Avg Occupancy', 'Rush Hour'])

print(np.mean(corrs))

In [None]:
import statsmodels.api as sm
import scipy.stats as stats

In [None]:
# Least-squares fit relating Abs_PM and Avg Speed on the route readings.
# NOTE(review): statsmodels' signature is OLS(endog, exog) and no intercept is
# added automatically, so this models Abs_PM as a multiple of Avg Speed with no
# constant term. If the question is how speed varies with Abs_PM, the arguments
# are probably swapped and sm.add_constant is missing — confirm before relying
# on the result.
reg_data = route[['Abs_PM', 'Avg Speed']].dropna()
model = sm.OLS(reg_data['Abs_PM'], reg_data['Avg Speed'])
results = model.fit()
# `params` is a label-indexed Series; take the single coefficient by position
# explicitly (integer keys as positional lookups on a labelled Series are deprecated).
slope = results.params.iloc[0]

In [None]:
# t statistic: the coefficient divided by its standard error (results.bse)
std_err = results.bse
t_value = slope / std_err
# upper-tail probability of a t distribution with the residual degrees of freedom
p_value = stats.t.sf(t_value, results.df_resid)
p_value

In [None]:
from datetime import datetime

In [None]:
# Least-squares fit relating the reading time and Avg Speed on the route readings.
# NOTE(review): statsmodels' signature is OLS(endog, exog) and no intercept is
# added automatically, so this models the timestamp as a multiple of Avg Speed
# with no constant term. If the question is how speed varies with time, the
# arguments are probably swapped and sm.add_constant is missing — confirm
# before relying on the result.
reg_data = route[['Timestamp', 'Avg Speed']].dropna()
# Turn each timestamp into a plain number (seconds since the epoch, times 1000).
reg_data['Timestamp'] = pd.to_datetime(reg_data['Timestamp']).map(lambda x: datetime.timestamp(x)*1000)

model = sm.OLS(reg_data['Timestamp'], reg_data['Avg Speed'])
results = model.fit()
# `params` is a label-indexed Series; take the single coefficient by position
# explicitly (integer keys as positional lookups on a labelled Series are deprecated).
slope = results.params.iloc[0]

In [None]:
# t statistic: the coefficient divided by its standard error (results.bse)
std_err = results.bse
t_value = slope / std_err
# upper-tail probability of a t distribution with the residual degrees of freedom
p_value = stats.t.sf(t_value, results.df_resid)
p_value

<a id='eda_processed_data'></a>

# EDA for Processed Data

In [None]:
import pickle
import pandas as pd
import plotly.express as px

In [None]:
 with open('./data/processed/adj_mat.dat', 'rb')  as f:
    adj_mat = pickle.load(f)

with open('./data/processed/adj_mat_ind_station_mapper.dat', 'rb') as f:
    ind_station_mapper = pickle.load(f)

with open('./data/processed/speeds.dat', 'rb') as f:
    speed_df = pickle.load(f)

**Stations in Processed Data**

In [None]:
# Map the stations that appear in the processed data: reload the metadata, keep
# the rows whose ID is among the values of `ind_station_mapper`, and draw one
# marker per remaining station.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
# NOTE(review): the tile URL carries an access token in plain text; consider
# loading it from an environment variable instead of committing it.
with open('./data/loaded/meta.dat', 'rb') as meta_file:
    meta = pickle.load(meta_file)
in_processed_data = meta['ID'].isin(ind_station_mapper.values())
meta_subs = meta[in_processed_data].reset_index(drop=True)
dir_mapper = {'N': 'north', 'E': 'east', 'S': 'south', 'W': 'west'}

m = Map(
    location=(34.0522,-118.2437),
    tiles='https://tile.jawg.io/jawg-dark/{z}/{x}/{y}{r}.png?access-token=yxQukjQJyY3mRrF6htcGR22i1QJ6BP6wslSe2Cmq2k4aT8S0wbDtYMEaPhc8s240',
    attr='<a href="http://jawg.io" title="Tiles Courtesy of Jawg Maps" target="_blank">&copy; <b>Jawg</b>Maps</a> &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
    control_scale=True,
    max_bounds=True,
    prefer_canvas=True,
    zoom_start=10,
)

# Popup pieces that do not depend on the row.
style = 'font-family: Impact, Haettenschweiler, "Franklin Gothic Bold", Charcoal, "Helvetica Inserat", "Bitstream Vera Sans Bold", "Arial Black", "sans serif";'
popup_open = f'<body style="{style}"><div align="center", style="background-color: #BAD6FF; font-family: Arial">'
popup_close = '</div></body>'
popup_fields = ['ID', 'Fwy', 'Length', 'Type', 'Lanes']

for ind, row in meta_subs.iterrows():
    direction = dir_mapper[row['Dir']]
    icon = folium.features.CustomIcon(f"./data/icons/{direction}.png", icon_size=(15, 15))
    # One-column table (field name -> value) of the selected fields.
    details = row.to_frame().T[popup_fields].T
    html = popup_open + details.to_html(
        justify="center",
        header=False,
        index=True,
        index_names=False,
        col_space=300,
        classes="table-condensed table-responsive table-success",
    ) + popup_close
    popup = folium.Popup(html, max_width=300)
    position = row[['Latitude', 'Longitude']].values.tolist()
    folium.Marker(position, icon=icon, popup=popup).add_to(m)
#m.save('./plots/stations_processed.html')

In [None]:
# Percentage of missing values in each column of speed_df, computed once and
# reused for both the summary table and the violin plot.
missing_pct = speed_df.isna().sum() / speed_df.shape[0] * 100
display(missing_pct.describe()) # descriptive stats

fig = px.violin(missing_pct, title='Distribution of % of Missing Speeds for Stations')
fig.update_xaxes(title="")
fig.update_yaxes(title="%")

In [None]:
# Mean speed per calendar day: average each column within every (month, day)
# group of the index, then average across the columns.
calendar_day = [speed_df.index.month, speed_df.index.day]
daily_mean_speed = speed_df.groupby(calendar_day).mean().mean(axis=1)

fig = px.box(daily_mean_speed, title='Distribution of Avg. Speed/Day in 2021')
fig.update_xaxes(title="")
fig.update_yaxes(title="Speed (mph)")

In [None]:
# Violin plot of all the values in five randomly chosen rows of speed_df.
# A fixed random_state makes the sample, and therefore the figure, reproducible.
# NOTE(review): the title says "in 2021" but only five sampled rows are plotted —
# confirm the sample size is intended.
sampled_speeds = speed_df.sample(5, random_state=42).values.flatten()
fig = px.violin(sampled_speeds, title='Distribution of Speed in 2021')
fig.update_xaxes(title="")
fig.update_yaxes(title="Speed (mph)")

**[Navigate to the Top](#top)**