# Big Data AADT Comparison Across Platforms
- with PeMS


In [1]:
# import modules
import pandas as pd
import warnings
import altair as alt
from altair_saver import save # To save the PNG file(s)

In [2]:
# Define the path to the data on Google Cloud Storage (GCS)
path = "gs://calitp-analytics-data/data-analyses/big_data/compare_traffic_counts/3_us50_all_wPeMS/"

In [3]:
# Build the full GCS path to each source CSV
# (df01-df04 hold path strings here; they are rebound to the loaded DataFrames in In [5])
df01 = f'{path}pems_output.csv' # PeMS data
df02 = f'{path}traffic_census_counts_hwy50.csv' # Traffic Census data
df03 = f'{path}replica_us_50_aadt.csv' # Replica
df04 = f'{path}stl_us_50_aadt.csv' # StreetLight

In [4]:
# create a function to import the data from a csv file
def getData_and_cleanheaders(path):
    """
    Read a CSV into a DataFrame and normalize its column headers.

    Every header has its space characters removed and is converted to lower case.

    Parameters:
    path: Location of the CSV file; handed straight to pd.read_csv.

    Returns:
    pd.DataFrame: The loaded data with the cleaned column names.
    """
    # NOTE: this installs a process-wide "ignore" filter, so warnings stay
    # silenced for everything that runs after the first call too.
    warnings.filterwarnings("ignore")

    frame = pd.read_csv(path)

    # Strip spaces and lower-case every header in a single pass
    frame.columns = [header.replace(" ", "").lower() for header in frame.columns]

    return frame

In [5]:
# Pull in and clean the headers
df01 = getData_and_cleanheaders(df01) # PeMS
df02 = getData_and_cleanheaders(df02) # Traffic Census
df03 = getData_and_cleanheaders(df03) # Replica
df04 = getData_and_cleanheaders(df04) # StreetLight

## Caltrans PeMS

In [6]:
# Subset the PeMS data:
#   1. keep rows whose route is 'us50' and whose county is 'sac' or 'ed'
#      (both compared in lower case)
#   2. keep only the relevant columns
#   3. keep the first row for each postmile ('pm') value
#   4. renumber the index from 0
# NOTE(review): step 3 keeps whichever row appears first for a postmile,
# regardless of its 'year' or 'dir' -- confirm that is the intended record.
df01 = (
    df01.loc[
        (df01['rte'].str.lower() == 'us50')
        & (df01['county'].str.lower().isin(['sac', 'ed'])),
        ['rte', 'year', 'pm', 'aadt', 'dir'],
    ]
    .drop_duplicates(subset=['pm'])
    .reset_index(drop=True)
)

In [7]:
# Create a function that labels each postmile with its location description
def location_description(df):
    """
    Add a 'location_description' column derived from the 'pm' (postmile) column.

    Parameters:
    df (pd.DataFrame): Data with a 'pm' column holding postmile strings.

    Returns:
    pd.DataFrame: The same DataFrame object, with the new column written onto it.
                  Postmiles that are not in the lookup table get NaN.
    """
    # Postmile (exact string as stored in 'pm') -> location name
    postmile_to_name = {
        'L2.43': 'JCT. RTES. 51/99',
        'R2.131': 'SACRAMENTO, 59TH STREET',
        'R3.674': 'JCT. RTE. 16',
        'R5.336': 'SACRAMENTO, WATT AVENUE',
        'R7.746': 'BRADSHAW ROAD',
        'R10.919': 'ZINFANDEL DRIVE',
        '17.008': 'FOLSOM BOULEVARD/NATOMA',
        'R8.564': 'SHINGLE SPRINGS',
        '16.99': 'WEST PLACERVILLE',
        '17.667': 'PLACERVILLE, JCT. RTE. 49',
        '20.741': 'NEW TOWN ROAD',
        'R25.949': 'EAST CAMINO ROAD',
        'R31.299': 'SLY PARK ROAD',
        '65.619': 'ECHO LAKE ROAD',
        '72.71': 'SAWMILL ROAD',
        '78.42': 'SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD',
        '80.14': 'SOUTH LAKE TAHOE, PARK AVENUE',
    }

    # Look up every postmile and store the result as 'location_description'
    df['location_description'] = df['pm'].map(postmile_to_name)
    return df

# Create a function to identify the location order
def location_order(df):
    """
    Add a numeric 'location_order' column based on 'location_description'.

    Parameters:
    df (pd.DataFrame): Data with a 'location_description' column.

    Returns:
    pd.DataFrame: The same DataFrame object, with the new column written onto it.
                  Descriptions that are not in the list below get NaN.
    """
    # A location's position in this list (starting at 0) is its order value
    ordered_locations = [
        'JCT. RTES. 51/99',
        'SACRAMENTO, 59TH STREET',
        'JCT. RTE. 16',
        'SACRAMENTO, WATT AVENUE',
        'BRADSHAW ROAD',
        'ZINFANDEL DRIVE',
        'FOLSOM BOULEVARD/NATOMA',
        'SHINGLE SPRINGS',
        'WEST PLACERVILLE',
        'PLACERVILLE, JCT. RTE. 49',
        'NEW TOWN ROAD',
        'EAST CAMINO ROAD',
        'SLY PARK ROAD',
        'ECHO LAKE ROAD',
        'SAWMILL ROAD',
        'SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD',
        'SOUTH LAKE TAHOE, PARK AVENUE',
    ]
    rank_by_location = {name: rank for rank, name in enumerate(ordered_locations)}

    # Translate each description into its rank
    df['location_order'] = df['location_description'].map(rank_by_location)
    return df

In [8]:
# Create a function to add the source and route to the dataframe
def add_source_route(df, source, route):
    """
    Stamp every row of df with a constant 'source' and 'route' value.

    Parameters:
    df (pd.DataFrame): Data to label; the columns are written onto this object.
    source: Value stored in the 'source' column.
    route: Value stored in the 'route' column.

    Returns:
    pd.DataFrame: The same DataFrame object that was passed in.
    """
    # 'source' is written first, then 'route', so new columns land in that order
    for column_name, constant in (('source', source), ('route', route)):
        df[column_name] = constant

    return df

In [9]:
df01 = add_source_route(df01, 'Caltrans PeMS', '50')

In [10]:
# Add a 'location_description' column
df01 = location_description(df01)

# create a new column called 'location_order' using the 'location_order' function
df01 = location_order(df01)

# Create a subset, pulling in only relevant columns
df01 = df01[['location_order', 'year', 'source', 'route', 'location_description', 'aadt']]

# Reset the index
df01 = df01.reset_index(drop=True)

# View the dataframe
df01

Unnamed: 0,location_order,year,source,route,location_description,aadt
0,0,2007,Caltrans PeMS,50,JCT. RTES. 51/99,253094.0
1,1,2007,Caltrans PeMS,50,"SACRAMENTO, 59TH STREET",215489.0
2,2,2007,Caltrans PeMS,50,JCT. RTE. 16,186392.0
3,3,2007,Caltrans PeMS,50,"SACRAMENTO, WATT AVENUE",183867.0
4,4,2007,Caltrans PeMS,50,BRADSHAW ROAD,177096.0
5,5,2007,Caltrans PeMS,50,ZINFANDEL DRIVE,148977.0
6,6,2007,Caltrans PeMS,50,FOLSOM BOULEVARD/NATOMA,94121.0
7,7,2007,Caltrans PeMS,50,SHINGLE SPRINGS,61687.0
8,8,2003,Caltrans PeMS,50,WEST PLACERVILLE,50971.0
9,9,2006,Caltrans PeMS,50,"PLACERVILLE, JCT. RTE. 49",47457.0


## Caltrans Traffic Census
- Caltrans Traffic Census Data is from Federal FY 2022  
- Geodatabase pulled from Caltrans Traffic Ops open data  
- https://gisdata-caltrans.opendata.arcgis.com/datasets/d8833219913c44358f2a9a71bda57f76_0/explore?location=41.381207%2C-119.640977%2C6.21

In [11]:
# Keep only the first row for each 'location_description' value
df02 = df02.drop_duplicates(subset=['location_description'])

In [12]:
# create a new column called 'location_order' using the 'location_order' function
df02 = location_order(df02)

# Reset the index
df02 = df02.reset_index(drop=True)

In [13]:
# Create a new column that averages out the back and ahead AADT numbers
def calculate_ct_tc_volume(df):
    """
    Add an 'aadt' column holding the mean of 'ahead_aadt' and 'back_aadt'.

    Parameters:
    df (pd.DataFrame): Data with 'ahead_aadt' and 'back_aadt' columns.

    Returns:
    pd.DataFrame: The same DataFrame object, with 'aadt' written onto it.
                  A row is NaN when either of its two inputs is missing.
    """
    two_sided_total = df['ahead_aadt'] + df['back_aadt']
    df['aadt'] = two_sided_total / 2
    return df

In [14]:
# Calculate the volume and create a column called 'aadt'
# 'aadt' is the average of the 'ahead_aadt' and 'back_aadt' values for each row
df02 = calculate_ct_tc_volume(df02)

# Disabled subset that also kept 'district' and 'county'; the active subset is applied in In [17]
#df02 = df02[['location_order', 'district', 'county', 'location_description', 'aadt']]

In [15]:
# Create a function to add the year, source, and route to the dataframe
def add_year_source_route(df, year, source, route):
    """
    Stamp every row of df with a constant 'year', 'source' and 'route' value.

    Parameters:
    df (pd.DataFrame): Data to label; the columns are written onto this object.
    year: Value stored in the 'year' column.
    source: Value stored in the 'source' column.
    route: Value stored in the 'route' column.

    Returns:
    pd.DataFrame: The same DataFrame object that was passed in.
    """
    # Written in this order so new columns appear as year, source, route
    constants = (('year', year), ('source', source), ('route', route))
    for column_name, constant in constants:
        df[column_name] = constant

    return df

In [16]:
# Pass the year as an int, not a string, so the combined 'year' column does not
# mix text with the numeric years read from the PeMS CSV.
# NOTE(review): assumes the PeMS 'year' column is parsed as integers -- confirm.
df02 = add_year_source_route(df02, 2022, 'Caltrans Traffic Census', '50')

In [17]:
# Create a subset
df02 = df02[['location_order', 'year', 'source', 'route', 'location_description', 'aadt']]

In [18]:
df02

Unnamed: 0,location_order,year,source,route,location_description,aadt
0,0,2022,Caltrans Traffic Census,50,JCT. RTES. 51/99,259000.0
1,1,2022,Caltrans Traffic Census,50,"SACRAMENTO, 59TH STREET",193500.0
2,2,2022,Caltrans Traffic Census,50,JCT. RTE. 16,165000.0
3,3,2022,Caltrans Traffic Census,50,"SACRAMENTO, WATT AVENUE",165000.0
4,4,2022,Caltrans Traffic Census,50,BRADSHAW ROAD,163000.0
5,5,2022,Caltrans Traffic Census,50,ZINFANDEL DRIVE,124000.0
6,6,2022,Caltrans Traffic Census,50,FOLSOM BOULEVARD/NATOMA,98500.0
7,7,2022,Caltrans Traffic Census,50,SHINGLE SPRINGS,58500.0
8,8,2022,Caltrans Traffic Census,50,WEST PLACERVILLE,38250.0
9,9,2022,Caltrans Traffic Census,50,"PLACERVILLE, JCT. RTE. 49",35750.0


## Replica
* Cal-Nev  
* Fall 2022  
* Thursday  
* Primary Mode: "Commercial vehicle (freight)" and "Private auto"  
* 20 OSM links selected - see Study "hwy-50-corridor-fall-2022"

In [19]:
# Collapse the data to one row per link (osmid / networklinkid / direction /
# start point): sum 'trip_count' and count the rows that fed each link
df03 = df03.groupby(
    ['osmid', 'networklinkid', 'direction', 'startlat', 'startlon'],
    as_index=False,
).agg(
    total_trip_count=('trip_count', 'sum'),
    record_count=('osmid', 'count'),
)

# Average trip count per source row for each link
df03['average_trip_count'] = df03['total_trip_count'] / df03['record_count']

In [20]:
# Create a function to identify the location/direction
# enter the dataframe and the field that contains a unique identifier for the record
def location_and_direction(df, field):
    """
    Tag each row with a location name and an East/West direction.

    Parameters:
    df (pd.DataFrame): Data containing the identifier column named by `field`.
    field (str): Column holding the link identifier (the notebook passes
                 'networklinkid').

    Returns:
    pd.DataFrame: The same DataFrame object, with 'location_description' and
                  'ew_direction' columns written onto it. Identifiers that are
                  not in the table get NaN in both columns.
    """
    # One entry per link: identifier -> (location name, direction)
    link_lookup = {
        9844669409414825659: ('JCT. RTES. 51/99', 'West'),
        14636510704667988733: ('JCT. RTES. 51/99', 'East'),
        13484951229203112000: ('SACRAMENTO, 59TH STREET', 'West'),
        18284127690280464801: ('SACRAMENTO, 59TH STREET', 'East'),
        18193674273232905965: ('JCT. RTE. 16', 'West'),
        17865240122736933530: ('JCT. RTE. 16', 'East'),
        2186279012994171824: ('SACRAMENTO, WATT AVENUE', 'West'),
        10420820001865831876: ('SACRAMENTO, WATT AVENUE', 'East'),
        10181297541369812528: ('BRADSHAW ROAD', 'West'),
        13101887543186241341: ('BRADSHAW ROAD', 'East'),
        17995907907794707383: ('ZINFANDEL DRIVE', 'West'),
        4154735909333386911: ('ZINFANDEL DRIVE', 'East'),
        5126772225220176667: ('FOLSOM BOULEVARD/NATOMA', 'West'),
        4907500100653718264: ('FOLSOM BOULEVARD/NATOMA', 'East'),
        7702300317163367080: ('SHINGLE SPRINGS', 'West'),
        13480335378425971014: ('SHINGLE SPRINGS', 'East'),
        14446593532653696059: ('WEST PLACERVILLE', 'West'),
        13722559483310219280: ('WEST PLACERVILLE', 'East'),
        2966729976564361609: ('PLACERVILLE, JCT. RTE. 49', 'West'),
        9029321097834986439: ('PLACERVILLE, JCT. RTE. 49', 'West'),
        1528153040233020522: ('PLACERVILLE, JCT. RTE. 49', 'East'),
        2074798048907422565: ('PLACERVILLE, JCT. RTE. 49', 'East'),
        4692195571451098155: ('NEW TOWN ROAD', 'West'),
        12818323323087048917: ('NEW TOWN ROAD', 'East'),
        10385459626225789582: ('EAST CAMINO ROAD', 'West'),
        3996907033029603365: ('EAST CAMINO ROAD', 'East'),
        9905724479785401242: ('SLY PARK ROAD', 'West'),
        11948843553019158844: ('SLY PARK ROAD', 'East'),
        13151939145806387587: ('ECHO LAKE ROAD', 'West'),
        8898698682539946478: ('ECHO LAKE ROAD', 'East'),
        1843897207028965223: ('SAWMILL ROAD', 'West'),
        802188390492056562: ('SAWMILL ROAD', 'East'),
        7178661235377613795: ('SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD', 'West'),
        12248397220241051219: ('SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD', 'East'),
        3865523139008938138: ('SOUTH LAKE TAHOE, PARK AVENUE', 'West'),
        16575286444410929151: ('SOUTH LAKE TAHOE, PARK AVENUE', 'East'),
    }

    # Split the combined table into one lookup per output column
    name_by_link = {link_id: name for link_id, (name, _) in link_lookup.items()}
    heading_by_link = {link_id: heading for link_id, (_, heading) in link_lookup.items()}

    # Identifier -> location name
    df['location_description'] = df[field].map(name_by_link)

    # Identifier -> East/West direction
    df['ew_direction'] = df[field].map(heading_by_link)

    return df

In [21]:
df03 = location_and_direction(df03, 'networklinkid')

In [22]:
df03 = location_order(df03)

In [23]:
# Pass the year as an int, not a string, so the combined 'year' column does not
# mix text with the numeric years read from the PeMS CSV.
# NOTE(review): assumes the PeMS 'year' column is parsed as integers -- confirm.
df03 = add_year_source_route(df03, 2022, 'Replica', '50')

In [24]:
# Calculate the total Traffic Volume for each location:
# sum 'average_trip_count' over all rows that share a 'location_description'
# and write that total onto each of those rows as 'aadt'
df03['aadt'] = df03.groupby('location_description')['average_trip_count'].transform('sum')

# Every row now carries its location's total, so keep just the first row per location
df03 = df03.drop_duplicates(subset=['location_description'])

# Reset the index
df03 = df03.reset_index(drop=True)


In [25]:
df03 = df03[['location_order', 'year', 'source', 'route', 'location_description', 'aadt']]
df03 = df03.sort_values(by=['location_order'])

df03.reset_index(drop=True, inplace=True)

In [26]:
df03

Unnamed: 0,location_order,year,source,route,location_description,aadt
0,0,2022,Replica,50,JCT. RTES. 51/99,141606.0
1,1,2022,Replica,50,"SACRAMENTO, 59TH STREET",202122.0
2,2,2022,Replica,50,JCT. RTE. 16,168668.0
3,3,2022,Replica,50,"SACRAMENTO, WATT AVENUE",178999.0
4,4,2022,Replica,50,BRADSHAW ROAD,177045.0
5,5,2022,Replica,50,ZINFANDEL DRIVE,151354.0
6,6,2022,Replica,50,FOLSOM BOULEVARD/NATOMA,122565.0
7,7,2022,Replica,50,SHINGLE SPRINGS,60401.0
8,8,2022,Replica,50,WEST PLACERVILLE,40867.0
9,9,2022,Replica,50,"PLACERVILLE, JCT. RTE. 49",44430.0


## StreetLight

In [27]:
# Create a function to identify the location/direction
# enter the dataframe and the field that contains a unique identifier for the record
# field == [zonename]
def location_and_direction_stl(df, field):
    """
    Tag each StreetLight row with a location name and an East/West direction.

    Parameters:
    df (pd.DataFrame): Data containing the identifier column named by `field`.
    field (str): Column holding the zone identifier (the notebook passes
                 'zonename').

    Returns:
    pd.DataFrame: The same DataFrame object, with 'location_description' and
                  'ew_direction' columns written onto it. Identifiers that are
                  not in the table get NaN in both columns.
    """
    # One entry per zone: zone name -> (location name, direction)
    zone_lookup = {
        "El Dorado Freeway / 993877312 / 1": ('JCT. RTES. 51/99', 'West'),
        "El Dorado Freeway / 974578849 / 1": ('JCT. RTES. 51/99', 'East'),
        "El Dorado Freeway / 974240153 / 1": ('SACRAMENTO, 59TH STREET', 'West'),
        "El Dorado Freeway / 967454408 / 1": ('SACRAMENTO, 59TH STREET', 'East'),
        "El Dorado Freeway / 31788844 / 1": ('JCT. RTE. 16', 'West'),
        "El Dorado Freeway / 10519644 / 2": ('JCT. RTE. 16', 'East'),
        "El Dorado Freeway / 974578456 / 1": ('SACRAMENTO, WATT AVENUE', 'West'),
        "El Dorado Freeway / 186082342 / 3": ('SACRAMENTO, WATT AVENUE', 'East'),
        "El Dorado Freeway / 966695343 / 3": ('BRADSHAW ROAD', 'West'),
        "El Dorado Freeway / 972455002 / 3": ('BRADSHAW ROAD', 'East'),
        "El Dorado Freeway / 977786496 / 1": ('ZINFANDEL DRIVE', 'West'),
        "El Dorado Freeway / 967558294 / 3": ('ZINFANDEL DRIVE', 'East'),
        "El Dorado Freeway / 31998935 / 1": ('FOLSOM BOULEVARD/NATOMA', 'West'),
        "El Dorado Freeway / 31998947 / 1": ('FOLSOM BOULEVARD/NATOMA', 'East'),
        "US 50 / 997346450 / 1": ('SHINGLE SPRINGS', 'West'),
        "US 50 / 39563171 / 4": ('SHINGLE SPRINGS', 'East'),
        "US 50 / 950257854 / 1": ('WEST PLACERVILLE', 'West'),
        "US 50 / 39573477 / 4": ('WEST PLACERVILLE', 'East'),
        "US 50 / 824630074 / 1": ('PLACERVILLE, JCT. RTE. 49', 'West'),
        "US 50 / 117518742 / 1": ('PLACERVILLE, JCT. RTE. 49', 'East'),
        "US 50 / 183906398 / 1": ('NEW TOWN ROAD', 'West'),
        "US 50 / 677756452 / 1": ('NEW TOWN ROAD', 'East'),
        "US 50 / 183906403 / 1": ('EAST CAMINO ROAD', 'West'),
        "US 50 / 117004868 / 2": ('EAST CAMINO ROAD', 'East'),
        "US 50 / 138307384 / 1": ('SLY PARK ROAD', 'West'),
        "US 50 / 1011938973 / 1": ('SLY PARK ROAD', 'East'),
        "US 50 / 1013096010 / 1": ('ECHO LAKE ROAD', 'West'),
        "US 50 / 401651190 / 1": ('ECHO LAKE ROAD', 'East'),
        "US 50;CA 89 / 355391728 / 1": ('SAWMILL ROAD', 'West'),
        "US 50;CA 89 / 355391730 / 4": ('SAWMILL ROAD', 'East'),
        "US 50 / 500227285 / 9": ('SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD', 'West'),
        "US 50 / 500227285 / 8": ('SOUTH LAKE TAHOE, RUFUS ALLEN BOULEVARD', 'East'),
        "US 50 / 314177418 / 4": ('SOUTH LAKE TAHOE, PARK AVENUE', 'West'),
        "US 50 / 314177418 / 3": ('SOUTH LAKE TAHOE, PARK AVENUE', 'East'),
    }

    # Split the combined table into one lookup per output column
    name_by_zone = {zone: name for zone, (name, _) in zone_lookup.items()}
    heading_by_zone = {zone: heading for zone, (_, heading) in zone_lookup.items()}

    # Zone name -> location name
    df['location_description'] = df[field].map(name_by_zone)

    # Zone name -> East/West direction
    df['ew_direction'] = df[field].map(heading_by_zone)

    return df

In [28]:
df04 = location_and_direction_stl(df04, 'zonename')

In [29]:
df04 = location_order(df04)

In [30]:
# Pass the year as an int, not a string, so the combined 'year' column does not
# mix text with the numeric years read from the PeMS CSV.
# NOTE(review): assumes the PeMS 'year' column is parsed as integers -- confirm.
df04 = add_year_source_route(df04, 2022, 'StreetLight', '50')

In [31]:
# Calculate the total Traffic Volume for each location:
# sum the StreetLight segment volume over all rows that share a
# 'location_description' and write that total onto each of those rows as 'aadt'
df04['aadt'] = df04.groupby('location_description')['averagedailysegmenttraffic(stlvolume)'].transform('sum')

# Every row now carries its location's total, so keep just the first row per location
df04 = df04.drop_duplicates(subset=['location_description'])

# Reset the index
df04 = df04.reset_index(drop=True)

In [32]:
df04 = df04[['location_order', 'year', 'source', 'route', 'location_description', 'aadt']]
df04 = df04.sort_values(by=['location_order'])

df04.reset_index(drop=True, inplace=True)

In [33]:
df04

Unnamed: 0,location_order,year,source,route,location_description,aadt
0,0,2022,StreetLight,50,JCT. RTES. 51/99,150160
1,1,2022,StreetLight,50,"SACRAMENTO, 59TH STREET",193243
2,2,2022,StreetLight,50,JCT. RTE. 16,162535
3,3,2022,StreetLight,50,"SACRAMENTO, WATT AVENUE",166384
4,4,2022,StreetLight,50,BRADSHAW ROAD,154305
5,5,2022,StreetLight,50,ZINFANDEL DRIVE,130871
6,6,2022,StreetLight,50,FOLSOM BOULEVARD/NATOMA,99134
7,7,2022,StreetLight,50,SHINGLE SPRINGS,48885
8,8,2022,StreetLight,50,WEST PLACERVILLE,31344
9,9,2022,StreetLight,50,"PLACERVILLE, JCT. RTE. 49",33727


## Cross-Platform Comparison

In [34]:
# Create a function to rename the columns so the columns all match
def rename_col(df):
    """
    Return a copy of df whose first six columns carry the standard names.

    The columns are renamed by position: location_order, year, source, route,
    location_description, aadt. Columns after the sixth keep their names.
    An IndexError is raised if df has fewer than six columns.
    """
    standard_names = (
        'location_order',
        'year',
        'source',
        'route',
        'location_description',
        'aadt',
    )

    # Current name at each position -> standard name for that position
    renames = {df.columns[position]: new_name for position, new_name in enumerate(standard_names)}

    return df.rename(columns=renames)

In [35]:
# rename the columns
df01 = rename_col(df01)
df02 = rename_col(df02)
df03 = rename_col(df03)
df04 = rename_col(df04)

In [36]:
def combine_dataframes(df1, df2, df3, df4):
    """
    Stack four DataFrames row-wise, keeping only the columns they all share.

    Parameters:
    df1, df2, df3, df4 (pd.DataFrame): The frames to combine. None of them is modified.

    Returns:
    pd.DataFrame: The rows of df1, df2, df3 and df4 (in that order), restricted to
                  the shared columns in df1's column order, with a fresh 0..n-1 index.
    """
    frames = [df1, df2, df3, df4]

    # Columns present in every frame
    shared = set(df1.columns)
    for frame in frames[1:]:
        shared &= set(frame.columns)

    # Walk df1's columns instead of iterating the set: set iteration order for
    # strings can differ between runs, which made the output column order unstable.
    common_columns = [column for column in df1.columns if column in shared]

    # ignore_index=True already renumbers the result, so no per-frame reset is needed
    return pd.concat([frame[common_columns] for frame in frames], ignore_index=True)

In [37]:
df = combine_dataframes(df01, df02, df03, df04)

In [38]:
df.shape

(68, 6)

In [39]:
# Create a CSV from the data
#df.to_csv('hwy_50_combined_aadt.csv', index=False)

In [40]:
def plot_aadt_by_location(df):
    """
    Build a grouped bar chart of AADT volume, one panel per location, one bar per source.

    Parameters:
    df (pd.DataFrame): Data with the columns 'aadt', 'source', 'location_order'
                       and 'location_description'.

    Returns:
    alt.Chart: The configured Altair chart.
    """
    # Within each panel: one bar per source along x, AADT on y
    source_axis = alt.X('source:N', title='Source', axis=alt.Axis(labelAngle=-45))
    volume_axis = alt.Y('aadt:Q', title='AADT Volume')

    # One panel (column facet) per location, sorted by 'location_order'
    location_panels = alt.Column(
        'location_description:N',
        sort=alt.EncodingSortField(field='location_order', order='ascending'),
        title='Location',
        header=alt.Header(labelAngle=-45),
    )

    bars = alt.Chart(df).mark_bar().encode(
        x=source_axis,
        y=volume_axis,
        color='source:N',  # Color bars by the source
        column=location_panels,
        tooltip=['location_description', 'aadt', 'source'],
    )

    # width applies to each individual panel (location), not to the whole figure
    sized = bars.properties(
        width=100,
        height=400,
        title='AADT Volume by Location and Source',
    )

    return sized.configure_axis(labelFontSize=10).configure_header(labelFontSize=10)

In [41]:
df_chart = plot_aadt_by_location(df)
df_chart