# Big Data
## Traffic Counts (2022)

- Created June 2024 to compare 2022 traffic counts  
- Caltrans' latest available AADT counts are from Federal FY 2022 (Ending September 30, 2022)

## Focusing on Hwy 50

In [1]:
# import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import folium
from folium import plugins
from folium.plugins import MarkerCluster
from io import BytesIO
import base64
import altair as alt
from altair_saver import save # to save the PNG file(s)

In [2]:
from siuba import *
from calitp_data_analysis.sql import to_snakecase

In [3]:
from calitp_data_analysis import get_fs
import gcsfs as fs
fs = get_fs()
import geopandas as gpd



In [4]:
# Google Cloud Storage folders that hold the Highway 50 input files
path = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_ct_hwy50/" # folder of the Caltrans CSV
path1 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_rep_hwy50/" # folder of the Replica CSV
#path2 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/"

In [5]:
# Build the full path of each dataset
# NOTE: at this point these names hold path strings; the loading cell further down rebinds the same names to DataFrames
ct_df01 = f'{path}ct-hwy50-traffic-counts-2022.csv' # Caltrans 2022 data
rep_df01 = f'{path1}replica-hwy50corridorfall2022-06_26_24-network_link_layer.csv' # Replica 2022 data
#stl_df01 = f'{path2}stl_hwy_50_corridor_2022_np.csv' # StreetLight 2022 data 
    # The StreetLight download did not make it easy to identify the individual locations. The locations were downloaded individually and concatenated - this takes place in the StreetLight section of the notebook

In [6]:
def getData(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Notes
    -----
    Each call installs an "ignore" filter for all warnings. The filter is
    not removed afterwards, so warnings stay silenced for the rest of the
    session, not only during the read.
    """
    # Silence warnings (process-wide; see Notes)
    warnings.filterwarnings(action="ignore")

    return pd.read_csv(filepath_or_buffer=path)

In [7]:
# Read each CSV into a DataFrame
# The names are rebound from path strings to DataFrames here, so re-running this cell on its own
# (without first re-running the cell that builds the paths) passes a DataFrame to getData
ct_df01 = getData(ct_df01)
rep_df01 = getData(rep_df01)
#stl_df01 = getData(stl_df01) # This version of the data was not utilized

In [8]:
def clean_headers(df):
    """Normalise the column names of ``df``.

    Every column name has its space characters removed and is converted to
    lowercase. Other characters (underscores, punctuation) are left alone.

    The rename is applied to ``df`` itself, and that same object is returned.
    """
    df.columns = [name.replace(" ", "").lower() for name in df.columns]
    return df

In [9]:
# Normalise the column names (spaces removed, lowercase) using the clean_headers function
ct_df01 = clean_headers(ct_df01)
rep_df01 = clean_headers(rep_df01)
#stl_df01 = clean_headers(stl_df01) # This version of the data was not utilized

## Caltrans Traffic Census Data
- Caltrans Traffic Census counts follow the Federal Fiscal Year (October 1, 2021 - September 30, 2022)

In [10]:
# Caltrans Data Cleaning:
    # The Caltrans GIS Attributes table was downloaded after the locations along highway 50 were selected
    # The ['OBJECTID'] column contained unique Object IDs, but also duplicated the values listed in the various AADT columns
    # The Post Miles listed in the ['PM'] field were duplicated. To clean this, the duplicate records (based on duplicate 'PM' values) were manually removed
        # from the Excel file prior to uploading the data to Google Cloud Storage

In [11]:
# Keep only the Caltrans Traffic Census columns used in this analysis.
# .copy() makes the subset an independent frame, so the columns added in the
# following cell are written to it rather than to a slice of the loaded table.
ct_df01 = ct_df01[['route', 'objectid*', 'district', 'county', 'pm', 'location_description', 'back_aadt', 'ahead_aadt']].copy()

In [12]:
# Adding a column for the federal fiscal year (stored as the text '2022')
ct_df01['year'] = '2022'

# Adding a column to identify the data source (used later to tell the platforms apart)
ct_df01['source'] = 'Caltrans Traffic Census'

In [13]:
def filter_ct_traffic_census(df, county_order=None):
    """Keep only the ten Highway 50 count locations (Folsom to Echo Summit).

    Parameters
    ----------
    df : pandas.DataFrame
        Caltrans Traffic Census records with a 'location_description' column.
    county_order : optional
        Not used by the function. Kept so that existing calls which pass it
        continue to work.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'location_description' is one of the ten
        locations, with their original index labels. The result is an
        independent copy, so columns can be added to it safely.
    """
    filtered_locations = [
        "NIMBUS ROAD",
        "PRAIRIE CITY ROAD",
        "LATROBE ROAD",
        "CAMERON PARK",
        "MISSOURI FLAT ROAD",
        "PLACERVILLE, MOSQUITO ROAD OVERHEAD (BROADWAY)",
        "JUNCTION OLD HIGHWAY, CAMINO, WEST",
        "SLY PARK ROAD",
        "ICEHOUSE ROAD",
        "ECHO LAKE ROAD",
    ]
    is_hwy50_location = df['location_description'].isin(filtered_locations)

    # Copy the selection: later steps add columns to the returned frame in place,
    # and doing that on an un-copied selection is a chained assignment.
    return df[is_hwy50_location].copy()

In [14]:
# Filter the Caltrans Traffic Census data to only include the rows whose 'location_description' is one of the ten Highway 50 locations
ct_df01_filtered = filter_ct_traffic_census(ct_df01)

In [15]:
def ct_tc_location_order(df):
    """Add a 'location_order' column giving each location's bar-chart position.

    The position (0-9) is looked up from 'location_description'. Descriptions
    that are not in the list get a missing value. The column is added to
    ``df`` itself, and that same object is returned.
    """
    # Caltrans location descriptions, listed in the order they should be drawn
    chart_sequence = [
        'NIMBUS ROAD',
        'PRAIRIE CITY ROAD',
        'LATROBE ROAD',
        'CAMERON PARK',
        'MISSOURI FLAT ROAD',
        'PLACERVILLE, MOSQUITO ROAD OVERHEAD (BROADWAY)',
        'JUNCTION OLD HIGHWAY, CAMINO, WEST',
        'SLY PARK ROAD',
        'ICEHOUSE ROAD',
        'ECHO LAKE ROAD',
    ]
    position_by_description = {
        description: position for position, description in enumerate(chart_sequence)
    }

    df['location_order'] = df['location_description'].map(position_by_description)
    return df

In [16]:
# Add a field called 'location_order' that sets the order in which the locations should appear (for the bar chart)
ct_df01_filtered = ct_tc_location_order(ct_df01_filtered)

In [17]:
def ct_tc_location_rename(df):
    """Add a 'location' column with the short location names shared by all sources.

    The short name is looked up from 'location_description'. Descriptions
    without an entry get a missing value. The column is added to ``df``
    itself, and that same object is returned.
    """
    # (Caltrans description, short name used by the other datasets)
    description_to_short_name = dict([
        ('NIMBUS ROAD', 'Nimbus Road'),
        ('PRAIRIE CITY ROAD', 'Prairie City Road'),
        ('LATROBE ROAD', 'Latrobe Road'),
        ('CAMERON PARK', 'Cameron Park'),
        ('MISSOURI FLAT ROAD', 'Missouri Flat Road'),
        ('PLACERVILLE, MOSQUITO ROAD OVERHEAD (BROADWAY)', 'Mosquito Road'),
        ('JUNCTION OLD HIGHWAY, CAMINO, WEST', 'Camino'),
        ('SLY PARK ROAD', 'Sly Park Road'),
        ('ICEHOUSE ROAD', 'Ice House Road'),
        ('ECHO LAKE ROAD', 'Echo Lake Road'),
    ])

    df['location'] = df['location_description'].map(description_to_short_name)
    return df

In [18]:
# Create a field called 'location' holding the short location names that replace the default values of the 'location_description' column
ct_df01_filtered = ct_tc_location_rename(ct_df01_filtered)

In [19]:
def ct_ahead_back_bar_chart(df):
    """Build a side-by-side bar chart of back vs. ahead AADT per Caltrans location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location_order', 'location_description', 'back_aadt'
        and 'ahead_aadt'.

    Returns
    -------
    altair.Chart
        One pair of bars (back_aadt, ahead_aadt) per location, with the
        locations laid out along the x-axis in ascending 'location_order'.
    """
    group_keys = ['location_order', 'location_description']
    aadt_columns = ['back_aadt', 'ahead_aadt']

    # Average only the AADT columns. Calling .mean() on the whole grouped frame
    # also tries to average the text columns, which raises a TypeError since the
    # numeric_only default changed in pandas 2.0.
    df_grouped = df.groupby(group_keys)[aadt_columns].mean().reset_index()

    # Melt to long format: one row per (location, value type).
    # 'location_order' is kept as an id column because the x-axis sort below
    # reads it from the chart data.
    df_melted = df_grouped.melt(id_vars=group_keys, value_vars=aadt_columns,
                                var_name='value_type', value_name='value')

    chart = alt.Chart(df_melted).mark_bar().encode(
        x=alt.X('location_description:N', title='Caltrans Traffic Census Location',
                # Each location repeats the same order value on every row; 'min' reduces it to one sort key
                sort=alt.EncodingSortField(field='location_order', op='min', order='ascending')),
        y=alt.Y('value:Q', title='Value'),
        color=alt.Color('value_type:N', title='Value Type'),
        xOffset='value_type:N', # Offset bars by back_aadt and ahead_aadt to make them side-by-side
        tooltip=[
            alt.Tooltip('location_description:N', title='Caltrans Traffic Census Location'),
            alt.Tooltip('value_type:N', title='Value Type'),
            alt.Tooltip('value:Q', title='Value'),
        ]
    ).properties(
        width=1000, # Width of the whole chart
        height=500 # Height of the whole chart
    ).configure_view(
        strokeOpacity=0 # Hides the outline drawn around the plotting area
    ).configure_axis(
        labelFontSize=12,
        titleFontSize=14
    ).configure_header(
        titleFontSize=14,
        labelFontSize=12
    )

    return chart

In [20]:
# Use ct_ahead_back_bar_chart to create a bar chart that compares the ahead and back AADT values for each location along Highway 50
ct_tc_chart = ct_ahead_back_bar_chart(ct_df01_filtered)
ct_tc_chart

In [21]:
#save(ct_tc_chart, 'chart_hwy50_ct_tc_chart.png', method='node')

In [22]:
def calculate_ct_tc_volume(df):
    """Add a 'volume' column: the mean of 'ahead_aadt' and 'back_aadt' per row.

    The column is added to ``df`` itself, and that same object is returned.
    """
    combined_aadt = df['ahead_aadt'].add(df['back_aadt'])
    df['volume'] = combined_aadt.div(2)
    return df

In [23]:
# Add a column called 'volume' holding the average of back_aadt and ahead_aadt for each record
ct_df01_filtered = calculate_ct_tc_volume(ct_df01_filtered)

In [24]:
# Keep only the columns of ct_df01_filtered that the remaining charts and the cross-platform comparison use
ct_df01_filtered = ct_df01_filtered[['year', 'source', 'route', 'location', 'volume', 'location_order', 'back_aadt', 'ahead_aadt']]

In [25]:
def ct_ahead_back_bar_chart_v1(df):
    """Build a grouped bar chart of back AADT, ahead AADT and volume per location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location_order', 'location', 'back_aadt', 'ahead_aadt'
        and 'volume'.

    Returns
    -------
    altair.Chart
        Three bars (back_aadt, ahead_aadt, volume) per location, with the
        locations laid out along the x-axis in ascending 'location_order'.
    """
    group_keys = ['location_order', 'location']
    value_columns = ['back_aadt', 'ahead_aadt', 'volume']

    # Average only the charted columns. Calling .mean() on the whole grouped
    # frame also tries to average the text columns, which raises a TypeError
    # since the numeric_only default changed in pandas 2.0.
    df_grouped = df.groupby(group_keys)[value_columns].mean().reset_index()

    # Melt to long format: one row per (location, value type).
    # 'location_order' is kept as an id column because the x-axis sort below
    # reads it from the chart data.
    df_melted = df_grouped.melt(id_vars=group_keys, value_vars=value_columns,
                                var_name='value_type', value_name='value')

    chart = alt.Chart(df_melted).mark_bar().encode(
        x=alt.X('location:N', title='Caltrans Traffic Census Location',
                # Each location repeats the same order value on every row; 'min' reduces it to one sort key
                sort=alt.EncodingSortField(field='location_order', op='min', order='ascending')),
        y=alt.Y('value:Q', title='Value'),
        color=alt.Color('value_type:N', title='Value Type'),
        xOffset='value_type:N', # Offset bars by value type to make them side-by-side
        tooltip=[
            alt.Tooltip('location:N', title='Caltrans Traffic Census Location'),
            alt.Tooltip('value_type:N', title='Value Type'),
            alt.Tooltip('value:Q', title='Value'),
        ]
    ).properties(
        width=1000, # Width of the whole chart
        height=500 # Height of the whole chart
    ).configure_view(
        strokeOpacity=0 # Hides the outline drawn around the plotting area
    ).configure_axis(
        labelFontSize=12,
        titleFontSize=14
    ).configure_header(
        titleFontSize=14,
        labelFontSize=12
    )

    return chart

In [26]:
# Use ct_ahead_back_bar_chart_v1 to create a bar chart that compares the back, ahead and averaged volume values for each location along Highway 50
ct_tc_chart_v1 = ct_ahead_back_bar_chart_v1(ct_df01_filtered)
ct_tc_chart_v1

In [27]:
#save(ct_tc_chart_v1, 'chart_hwy50_ct_tc_chart.png', method='node')

In [28]:
# Export Caltrans Traffic Census Data to a CSV
#ct_df01_filtered.to_csv('caltrans_tc_hwy50_2022.csv', index=False)

In [29]:
# package altair == a good package for data visualizations
# DDS has a style guide, cal-itp/data-analysis == set color scheme (diverging, different gradients)

# ipywidgets == another great option for data visualizations, although not as great for sharing (but good with sharing your screen)

## Replica Traffic Counts
- Replica Study Name == 'hwy-50-corridor-fall-2022'  
- Replica data Fall 2022  
        - Selected to best match the latest available Caltrans aadt data == 2022
- Filter  
        - (Primary Mode Filter == 'Commercial vehicle (freight)', 'Private auto')  
- Typical Day == Thursday


In [30]:
# Aggregate the Replica link records: one row per combination of
# 'osmid', 'networklinkid', 'direction', 'startlat' and 'startlon'
rep_df01 = rep_df01.groupby(['osmid', 'networklinkid', 'direction', 'startlat', 'startlon'], as_index=False).agg(
    total_trip_count=pd.NamedAgg(column='trip_count', aggfunc='sum'),  # trips summed over the group
    record_count=pd.NamedAgg(column='osmid', aggfunc='count')  # number of records counted in the group
)

# Calculate the average trip count per group (total trips divided by the number of records)
rep_df01['average_trip_count'] = rep_df01['total_trip_count']/rep_df01['record_count']

In [31]:
def filter_by_networklinkid(df):
    """Keep only the Replica network links at the ten Highway 50 locations.

    Parameters
    ----------
    df : pandas.DataFrame
        Replica link records with a 'networklinkid' column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'networklinkid' is one of the twenty listed
        IDs (two per location), with their original index labels. The result
        is an independent copy, so columns can be added to it safely.
    """
    # East/West notes follow the direction mapping used by classify_location.
    # NOTE(review): the two Nimbus Road notes were previously labelled the other
    # way round here - confirm the directions against the Replica network.
    filtered_locations = [6318569418755860454, # Nimbus Road West
                          4919446025123668139, # Nimbus Road East
                          10164004687761220118, # Prairie City Road West
                          473377144358464128, # Prairie City Road East
                          4204181547633624892, # Latrobe West
                          13282593593483289893, # Latrobe East
                          5579769166083565573, # Cameron Park West
                          13239198527686004252, # Cameron Park East
                          16245877946723553755, # Missouri Flat Road West
                          17839000377633739362, # Missouri Flat Road East
                          16686269959836354342, # Mosquito Road West
                          658632816435451376, # Mosquito Road East
                          15035593707759148873, # Camino West
                          17049649116298182417, # Camino East
                          9905724479785401242, # Sly Park West
                          11948843553019158844, # Sly Park East
                          13112708779498904298, # Ice House West
                          11949881834776692762, # Ice House East
                          13151939145806387587, # Echo West
                          8898698682539946478, # Echo East
                         ]
    is_selected_link = df['networklinkid'].isin(filtered_locations)

    # Copy the selection: later steps add columns to the returned frame in place,
    # and doing that on an un-copied selection is a chained assignment.
    return df[is_selected_link].copy()

In [32]:
# Keep only the Replica links at the Highway 50 comparison locations, using the filter_by_networklinkid function
rep_df01_filtered = filter_by_networklinkid(rep_df01)

In [33]:
def classify_location(df):
    """Label each Replica link with its location, direction and chart position.

    Three columns are derived from 'networklinkid' and added to ``df`` itself:

    - 'location': the location name for the link
    - 'ew_direction': 'East' or 'West' for the link
    - 'location_order': the bar-chart position (0-9) of the location

    Link IDs that are not in the table get missing values. The same ``df``
    object is returned.
    """
    # (Network Link ID, location name, East-West direction)
    link_table = [
        (6318569418755860454, 'Nimbus Road', 'West'),
        (4919446025123668139, 'Nimbus Road', 'East'),
        (10164004687761220118, 'Prairie City Road', 'West'),
        (473377144358464128, 'Prairie City Road', 'East'),
        (4204181547633624892, 'Latrobe Road', 'West'),
        (13282593593483289893, 'Latrobe Road', 'East'),
        (5579769166083565573, 'Cameron Park', 'West'),
        (13239198527686004252, 'Cameron Park', 'East'),
        (16245877946723553755, 'Missouri Flat Road', 'West'),
        (17839000377633739362, 'Missouri Flat Road', 'East'),
        (16686269959836354342, 'Mosquito Road', 'West'),
        (658632816435451376, 'Mosquito Road', 'East'),
        (15035593707759148873, 'Camino', 'West'),
        (17049649116298182417, 'Camino', 'East'),
        (9905724479785401242, 'Sly Park Road', 'West'),
        (11948843553019158844, 'Sly Park Road', 'East'),
        (13112708779498904298, 'Ice House Road', 'West'),
        (11949881834776692762, 'Ice House Road', 'East'),
        (13151939145806387587, 'Echo Lake Road', 'West'),
        (8898698682539946478, 'Echo Lake Road', 'East'),
    ]
    location_by_link = {link_id: place for link_id, place, _ in link_table}
    direction_by_link = {link_id: heading for link_id, _, heading in link_table}

    # Location names, listed in the order they should be drawn in the bar charts
    chart_sequence = [
        'Nimbus Road',
        'Prairie City Road',
        'Latrobe Road',
        'Cameron Park',
        'Missouri Flat Road',
        'Mosquito Road',
        'Camino',
        'Sly Park Road',
        'Ice House Road',
        'Echo Lake Road',
    ]
    position_by_location = {place: position for position, place in enumerate(chart_sequence)}

    df['location'] = df['networklinkid'].map(location_by_link)
    df['ew_direction'] = df['networklinkid'].map(direction_by_link)
    df['location_order'] = df['location'].map(position_by_location)

    return df

In [34]:
# Use the Network Link ID to add the 'location', 'ew_direction' and 'location_order' columns for each segment
rep_df01_filtered = classify_location(rep_df01_filtered)

In [35]:
# Add year column (stored as the text '2022')
rep_df01_filtered['year'] = '2022'

# Add Route column
rep_df01_filtered['route'] = '50'

# Add Source column to identify the data source
rep_df01_filtered['source'] = 'Replica'

In [36]:
def stacked_bar_chart(df):
    """Build a bar chart of Replica traffic volume per location.

    ``df`` needs 'location', 'location_order', 'average_trip_count' and
    'ew_direction'. Rows are drawn in ascending 'location_order'. A
    'total_volume' value (the sum of 'average_trip_count' per location) is
    computed on a sorted copy and shown in the tooltip; the frame passed in
    is not modified.

    Direction is deliberately not colour-coded: the earlier note on this
    chart says colour made it confusing to compare with the Caltrans AADT
    chart.
    """
    ordered = df.sort_values(by='location_order')
    ordered['total_volume'] = ordered.groupby('location')['average_trip_count'].transform('sum')

    # Locations in the order they first appear after sorting; fixes the x-axis order
    x_sequence = ordered['location'].unique().tolist()

    bars = alt.Chart(ordered).mark_bar()
    encoded = bars.encode(
        x=alt.X('location:O', sort=x_sequence, title='Replica Traffic Volume Locations'),
        y=alt.Y('average_trip_count:Q', title='Replica Traffic Volume'),
        tooltip=['location', 'total_volume', 'ew_direction', 'average_trip_count'],
    )
    sized = encoded.properties(width=1000, height=500)

    return (
        sized
        .configure_view(strokeOpacity=0)
        .configure_axis(labelFontSize=12, titleFontSize=14)
        .configure_header(labelFontSize=12, titleFontSize=14)
    )

In [37]:
# Display the Replica bar chart; both directions are stacked in one bar per location, without separate colors for the directions
stacked_bar_chart(rep_df01_filtered)

In [38]:
# Calculate the total Traffic Volume for each location (sum of 'average_trip_count' over the location's rows)
# Every row of a location receives the same total
rep_df01_filtered['volume'] = rep_df01_filtered.groupby('location')['average_trip_count'].transform('sum')

In [39]:
# Create a subset of the data to include only the fields that are needed to compare volumes across platforms
rep_df01_filtered = rep_df01_filtered[['year', 'source', 'route', 'location', 'volume', 'location_order']]

In [40]:
# Keep one row per location; 'volume' already holds the per-location total on every row, so no volume is lost
rep_df01_filtered = rep_df01_filtered.drop_duplicates(subset=['location'])

In [41]:
#rep_df01_filtered.to_csv('replica_hwy50_2022.csv', index=False)

## StreetLight

In [42]:
# Identify path to StreetLight's 2022 Network Performance Data
# Each location was downloaded individually to make it easier to identify the locations of the records
#stl_path = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/"

# This method was not working for me at this time

In [43]:
# Identify the StreetLight 2022 Highway 50 datasets (one file per location)
# NOTE: these names hold path strings here; the next cell rebinds the same names to DataFrames
stl_2022_df10 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760584_hwy_50_corridor_2022_Echo_network_performance.csv"
stl_2022_df09 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760586_hwy_50_corridor_2022_Ice_House_network_performance.csv"
stl_2022_df08 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760589_hwy_50_corridor_2022_Sly_Park_network_performance.csv"
stl_2022_df07 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760590_hwy_50_corridor_2022_Camino_network_performance.csv"
stl_2022_df06 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760592_hwy_50_corridor_2022_Mosquito_Road_network_performance.csv"
stl_2022_df05 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760593_hwy_50_corridor_2022_Missouri_Flat_network_performance.csv"
stl_2022_df04 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760595_hwy_50_corridor_2022_Cameron_Park_network_performance.csv"
stl_2022_df03 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760598_hwy_50_corridor_2022_Latrobe_Road_network_performance.csv"
stl_2022_df02 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760599_hwy_50_corridor_2022_Prairie_City_network_performance.csv"
stl_2022_df01 = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_stl_hwy50/2022_stl_hwy50/1760601_hwy_50_corridor_2022_Nimbus_Road_network_performance.csv"

In [44]:
# Read each StreetLight CSV into a DataFrame (each name is rebound from its path string to the loaded data)
stl_2022_df10 = getData(stl_2022_df10) # Echo Lake Road
stl_2022_df09 = getData(stl_2022_df09) # Ice House Road
stl_2022_df08 = getData(stl_2022_df08) # Sly Park Road
stl_2022_df07 = getData(stl_2022_df07) # Camino
stl_2022_df06 = getData(stl_2022_df06) # Mosquito Road
stl_2022_df05 = getData(stl_2022_df05) # Missouri Flat Road
stl_2022_df04 = getData(stl_2022_df04) # Cameron Park
stl_2022_df03 = getData(stl_2022_df03) # Latrobe Road
stl_2022_df02 = getData(stl_2022_df02) # Prairie City Road
stl_2022_df01 = getData(stl_2022_df01) # Nimbus Road

In [45]:
# Label every StreetLight dataset with its location name ('location') and the
# position it takes in the StreetLight bar chart ('location_order').
# Positions run from Echo Lake Road (0) to Nimbus Road (9); the StreetLight chart
# sorts this field in descending order.
stl_frames_and_labels = [
    (stl_2022_df10, 'Echo Lake Road'),
    (stl_2022_df09, 'Ice House Road'),
    (stl_2022_df08, 'Sly Park Road'),
    (stl_2022_df07, 'Camino'),
    (stl_2022_df06, 'Mosquito Road'),
    (stl_2022_df05, 'Missouri Flat Road'),
    (stl_2022_df04, 'Cameron Park'),
    (stl_2022_df03, 'Latrobe Road'),
    (stl_2022_df02, 'Prairie City Road'),
    (stl_2022_df01, 'Nimbus Road'),
]
for stl_position, (stl_frame, stl_label) in enumerate(stl_frames_and_labels):
    stl_frame['location'] = stl_label
    stl_frame['location_order'] = stl_position

# Remove the helper names so the notebook namespace is left as it was
del stl_frames_and_labels, stl_position, stl_frame, stl_label

In [46]:
# Concatenate the ten StreetLight datasets into one DataFrame (each dataset keeps its own row labels)
stl_df22 = pd.concat([stl_2022_df01, stl_2022_df02, stl_2022_df03, stl_2022_df04, stl_2022_df05, stl_2022_df06, stl_2022_df07, stl_2022_df08, stl_2022_df09, stl_2022_df10]) 

In [47]:
# Add the year to the StreetLight Dataset (stored as the text '2022')
stl_df22['year'] = '2022'

# Add Route column
stl_df22['route'] = '50'

# Add Source column to identify the data source
stl_df22['source'] = 'StreetLight'

In [48]:
# Clean the headers (remove spaces and make everything lowercase) using the clean_headers function
stl_df22 = clean_headers(stl_df22)

In [49]:
# Create a subset of the StreetLight DataFrame with the columns used for the StreetLight chart
stl_df22 = stl_df22[['year', 'route', 'source', 'zonename', 'zoneisbi-direction', 'averagedailysegmenttraffic(stlvolume)', 'location', 'location_order']]

In [50]:
def stl_volume_bar_chart(df):
    """Build a bar chart of StreetLight Highway 50 traffic volume per location.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location', 'location_order' and
        'averagedailysegmenttraffic(stlvolume)'.

    Returns
    -------
    altair.Chart
        One bar per location showing the summed volume of that location's
        rows, with locations sorted by 'location_order' in descending order.
    """
    volume_column = 'averagedailysegmenttraffic(stlvolume)'

    # Sum only the volume column per location. Summing the whole grouped frame
    # would also "add up" the text columns (year, source, zone name, ...) by
    # joining their strings, none of which the chart uses.
    df_grouped = df.groupby(['location', 'location_order'], as_index=False)[volume_column].sum()

    # Create the bar chart
    chart = alt.Chart(df_grouped).mark_bar().encode(
        x=alt.X('location:N', title='StreetLight Volume Hwy 50 2022', sort=alt.EncodingSortField(field='location_order', order='descending')),
        y=alt.Y('averagedailysegmenttraffic(stlvolume):Q', title='Volume'),
        tooltip=[
            alt.Tooltip('location:N', title='StreetLight Volume 2022'),
            alt.Tooltip('averagedailysegmenttraffic(stlvolume):Q', title='Volume')
        ]
    ).properties(
        width=1000, # Width of the whole chart
        height=500, # Height of the whole chart
    ).configure_view(
        strokeOpacity=0 # Hides the outline drawn around the plotting area
    ).configure_axis(
        labelFontSize=12,
        titleFontSize=14
    )

    return chart

In [51]:
# Use stl_volume_bar_chart to chart the StreetLight volume for each location along Highway 50
stl_22_chart = stl_volume_bar_chart(stl_df22)
stl_22_chart

In [52]:
# Rename the 'averagedailysegmenttraffic(stlvolume)' field to 'volume' so it matches the Caltrans and Replica subsets
stl_df22 = stl_df22.rename(columns={'averagedailysegmenttraffic(stlvolume)': 'volume'})

In [53]:
# Create a subset of the StreetLight data with the fields shared with the other sources.
# 'location_order' is left out here and is recomputed from 'location' in a later cell.
# .copy() makes the subset an independent frame, so that later column is written
# to it rather than to a slice.
stl_df22 = stl_df22[['year', 'source', 'route', 'location', 'volume']].copy()

In [54]:
def classify_location(df):
    """Add a 'location_order' column giving each location's bar-chart position.

    The position (0-9) is looked up from the 'location' column; names that
    are not in the list get a missing value. The column is added to (or
    overwritten on) ``df`` itself, and that same object is returned.

    Note: this definition replaces the earlier ``classify_location`` for the
    rest of the notebook. Unlike the earlier one, it does not read
    'networklinkid' and does not create 'location' or 'ew_direction'.
    """
    # Location names, listed in the order they should be drawn in the bar charts
    chart_sequence = [
        'Nimbus Road',
        'Prairie City Road',
        'Latrobe Road',
        'Cameron Park',
        'Missouri Flat Road',
        'Mosquito Road',
        'Camino',
        'Sly Park Road',
        'Ice House Road',
        'Echo Lake Road',
    ]
    position_by_location = {place: position for position, place in enumerate(chart_sequence)}

    df['location_order'] = df['location'].map(position_by_location)
    return df

In [55]:
# Add 'location_order' to the StreetLight data from its 'location' names (uses the classify_location defined in the cell above)
stl_df22 = classify_location(stl_df22)

In [56]:
#stl_df22.to_csv('streetlight_hwy50_volume.csv', index=False)

## Cross-Platform Comparison

In [57]:
def combine_dataframes(df1, df2, df3):
    """Stack three DataFrames, keeping only the columns all of them share.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        The frames to combine.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df1``, then ``df2``, then ``df3``, restricted to the
        shared columns and re-indexed from 0. Columns appear in the order
        they have in ``df1``.
    """
    # Walk df1's columns instead of intersecting sets: a set has no defined
    # order, so the column order could differ from one run to the next.
    common_columns = [
        column for column in df1.columns
        if column in df2.columns and column in df3.columns
    ]

    # Concatenate the dataframes
    combined_df = pd.concat(
        [df1[common_columns], df2[common_columns], df3[common_columns]],
        ignore_index=True,
    )

    return combined_df

In [58]:
# Use the combine_dataframes function to combine the subset dataframes created for Caltrans, Replica, and StreetLight into one dataframe
# Only the columns present in all three subsets are kept
combined_2022_aadt = combine_dataframes(ct_df01_filtered, rep_df01_filtered, stl_df22)

In [59]:
# Use the classify_location function (the later definition) to recompute 'location_order' from 'location' for the combined data
combined_2022_aadt = classify_location(combined_2022_aadt)

In [60]:
def add_total_volume_column(df):
    """Add a 'total_volume' column: the sum of 'volume' per location and source.

    Every row receives the total of its own ('location', 'source') group.
    The column is added to ``df`` itself, and that same object is returned.
    """
    per_location_and_source = df.groupby(['location', 'source'])
    df['total_volume'] = per_location_and_source['volume'].transform('sum')
    return df

In [61]:
# Add a 'total_volume' column that totals the traffic volumes per location and source
# This was needed because StreetLight provided East and West values (Highway 50 runs east-west)
combined_2022_aadt = add_total_volume_column(combined_2022_aadt)

In [62]:
# Create a subset of the combined data that keeps 'total_volume' and drops the per-row 'volume' column
# This step was needed primarily because StreetLight had 2 rows per location, and if only one of those rows' volumes were left it would be misleading
combined_2022_aadt = combined_2022_aadt[['year', 'source', 'route', 'location', 'total_volume', 'location_order']]

In [63]:
def drop_duplicates_for_specific_source(df, source_value, column_name):
    """De-duplicate the rows of one source and leave all other rows untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'source' column and the column named by ``column_name``.
    source_value
        Value of 'source' whose rows are de-duplicated.
    column_name : str
        Column checked for duplicates; the first row of each value is kept.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows of ``source_value`` first, followed by every
        other row in its original order, re-indexed from 0.
    """
    from_source = df['source'] == source_value

    deduplicated = df[from_source].drop_duplicates(subset=[column_name])
    untouched = df[~from_source]

    return pd.concat([deduplicated, untouched], ignore_index=True)

In [64]:
# Use the drop_duplicates_for_specific_source function to keep one row per location for the StreetLight records
# (after this step the StreetLight rows come first in the combined data)
combined_2022_aadt = drop_duplicates_for_specific_source(combined_2022_aadt, 'StreetLight', 'location')

In [65]:
def create_grouped_bar_chart(df):
    """Build one small bar chart per location comparing 'total_volume' by source.

    ``df`` needs 'location', 'location_order', 'source' and 'total_volume'.
    Each location becomes a column facet (ordered by 'location_order') with
    one bar per source; all facets share the same y scale.
    """
    # Location names in ascending 'location_order'; fixes the left-to-right facet order
    unique_pairs = df[['location', 'location_order']].drop_duplicates()
    facet_sequence = unique_pairs.sort_values('location_order')['location'].tolist()

    encodings = dict(
        x=alt.X('source:N', title='Source'),
        y=alt.Y('total_volume:Q', title='Volume'),
        color='source:N',
        column=alt.Column('location:N', sort=facet_sequence, title='Location'),
        tooltip=['location:N', 'source:N', 'total_volume:Q'],
    )

    bars = alt.Chart(df).mark_bar().encode(**encodings)
    sized = bars.properties(width=100, height=500)  # size of each facet

    return sized.resolve_scale(y='shared')

In [66]:
# Use create_grouped_bar_chart to compare the Caltrans, Replica, and StreetLight volumes at each location
combined_2022_aadt_chart = create_grouped_bar_chart(combined_2022_aadt)
combined_2022_aadt_chart

In [67]:
#combined_2022_aadt.to_csv('combined_hwy50_volume.csv', index=False)

In [68]:
#save(combined_2022_aadt_chart, 'chart_hwy50.png', method='node')