# A workbook looking at the possibility of parsing the [Authorized Highways] column in the all_permits data
- ns

- Originally requested by Stephen Yoon

In [1]:
# import modules
import pandas as pd
import warnings
import gcsfs
import re

In [2]:
# GCS folder prefix (with trailing slash) that each file name is appended to when opened
gcs_path = "gs://calitp-analytics-data/data-analyses/big_data/freight/all_permits/"

In [3]:
# Sample-set workbooks to load; the third '_'-separated token of each name is the year
file_names = [
    "all_permits_2023_sampleset.xlsx",
    "all_permits_2024_sampleset.xlsx",
]

In [4]:
def load_excel_sheets_1(gcs_path, file_names):
    """
    Load the first sheet of each Excel file in GCS and combine them into one DataFrame.

    For every file the column headers are normalised (spaces removed, lowercased),
    a 'year' column is added from the file name, the frame is reduced to a fixed
    set of columns, and rows with a missing 'permitnumber' are dropped.

    Warnings are suppressed only while this function runs; the caller's warning
    filters are restored on exit.

    Parameters:
    gcs_path (str): The Google Cloud Storage path where the files are located. It is
                    concatenated directly with each file name, so it must end with '/'.
    file_names (list): A list of Excel file names in the GCS path. The third
                       '_'-separated token of each name is used as the year
                       (e.g. 'all_permits_2023_sampleset.xlsx' -> '2023').

    Returns:
    pd.DataFrame: A single concatenated DataFrame (fresh 0..n-1 index) with the columns
                  permitnumber, year, permitvalidfrom, permitvalidto, loaddescription,
                  origin, destination and authorizedhighways. 'year' is stored as a string.

    Raises:
    IndexError: If a file name has fewer than three '_'-separated tokens.
    KeyError: If a sheet lacks one of the expected columns after header cleaning.
    ValueError: If file_names is empty (pandas has nothing to concatenate).
    """

    # Create a Google Cloud Storage file system object
    fs = gcsfs.GCSFileSystem()

    # Define the columns to keep
    columns_to_keep = ['permitnumber', 'year', 'permitvalidfrom', 'permitvalidto',
                       'loaddescription', 'origin', 'destination', 'authorizedhighways']

    # List to store all DataFrames
    df_list = []

    # Suppress warnings for the duration of the load only; catch_warnings() restores
    # the previous filters on exit instead of silencing the whole session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Loop through each file in the file list
        for file in file_names:
            # Extract the year from the filename
            year = file.split('_')[2]  # Assuming the year is the third element when split by '_'

            # Open the file and read only the first sheet
            with fs.open(f"{gcs_path}{file}", 'rb') as f:
                df = pd.read_excel(f, sheet_name=0)  # Load only the first sheet

            # Clean headers by removing spaces and making characters lowercase
            df.columns = [col.replace(" ", "").lower() for col in df.columns]

            # Add 'year' column
            df['year'] = year

            # Filter columns and remove rows with NaN in 'permitnumber'
            df = df[columns_to_keep].dropna(subset=['permitnumber'])

            # Append to list
            df_list.append(df)

        # Concatenate all DataFrames into a single DataFrame
        final_df = pd.concat(df_list, ignore_index=True)

    return final_df

# # Parsing function for authorizedhighways
# def parse_routes(route_info):
#     segments = []
    
#     # Split based on "from" and "to"
#     raw_segments = re.split(r'\s*-\s*from\s+|\s*-\s*to\s+', route_info)
    
#     # Loop through each segment and capture details
#     for i, segment in enumerate(raw_segments):
#         if i % 2 == 0:
#             # Route segment starts with 'from'
#             entry_point = segment.strip()
#             entry_type = "from"
#         else:
#             # Route segment ends with 'to'
#             exit_point = segment.strip()
#             entry_type = "to"
        
#         # Extract highways and directions (e.g., 092E, 880N)
#         routes = re.findall(r'\b\d{3}[A-Z]?\b', segment)
        
#         # Append segment data as a dictionary
#         segments.append({
#             'entry_type': entry_type,
#             'point': entry_point if entry_type == "from" else exit_point,
#             'routes': routes
#         })
    
#     return segments

In [5]:
# Load and combine the first sheet of every workbook listed in file_names
df = load_excel_sheets_1(gcs_path, file_names)



In [6]:
# Parsing function to create individual route locations
def parse_routes(route_info):
    """
    Split an authorized-highways string into an ordered list of route locations.

    The text is first cut at every " - from " / " - to " marker (lowercase,
    whitespace around the dash optional), then each resulting section is cut
    at " - ". Every piece is stripped of surrounding whitespace.

    Only "from"/"to" markers preceded by a dash are consumed, so a leading
    "* from" and a trailing "*" stay part of the first and last locations.

    Parameters:
    route_info (str): Raw route description, e.g.
                      "* from MAIN ST - 092E - 880N - OAK RD exit *".

    Returns:
    list: Route locations in their original order. A non-string value
          (None, or the NaN float pandas uses for missing cells) yields an
          empty list; an empty string yields [''].
    """
    # Missing cells arrive as None/NaN when this is applied over a DataFrame
    # column; re.split would raise TypeError on them, so treat them as "no route".
    if not isinstance(route_info, str):
        return []

    # Split the data by "from" and "to" to isolate each route section
    raw_segments = re.split(r'\s*-\s*from\s+|\s*-\s*to\s+', route_info)

    segments = []
    for segment in raw_segments:
        # Each section lists its locations separated by ' - '
        segments.extend(location.strip() for location in segment.split(' - '))

    return segments

# Parse each permit's highway string into a temporary column of location lists
df['route_segments'] = df['authorizedhighways'].map(parse_routes)

# The longest list decides how many route_location_* columns are needed
segment_counts = df['route_segments'].map(len)
max_locations = segment_counts.max()

# Spread the lists across columns; rows with fewer locations are padded with None
for position in range(max_locations):
    column_name = f'route_location_{position}'
    df[column_name] = df['route_segments'].map(
        lambda locations: locations[position] if position < len(locations) else None
    )

# Remove the temporary list column now that it has been expanded
del df['route_segments']

In [7]:
# Apply the parsing function to each row in the authorizedhighways column
#df['parsed_routes'] = df['authorizedhighways'].apply(parse_routes)

In [8]:
# Preview the first rows, including the new route_location_* columns
df.head()

Unnamed: 0,permitnumber,year,permitvalidfrom,permitvalidto,loaddescription,origin,destination,authorizedhighways,route_location_0,route_location_1,...,route_location_15,route_location_16,route_location_17,route_location_18,route_location_19,route_location_20,route_location_21,route_location_22,route_location_23,route_location_24
0,e23-013125,2023,02/15/2023,02/21/2023,75' KELLY BAR,HAYWARD,ANTELOPE,* from CLAWITER RD S/B ON RAMP - 092E - 880N -...,* from CLAWITER RD S/B ON RAMP,092E,...,080E,ANTELOPE RD exit (ANTELOPE RD N/B OFF RAMP) *,,,,,,,,
1,e23-021610,2023,03/20/2023,03/26/2023,UNLADEN 9 AXLE WITH 2 DECK INSERTS,FONTANA,ONTARIO,* from SIERRA AVE W/B ON RAMP - 015S - 060W - ...,* from SIERRA AVE W/B ON RAMP,015S,...,,,,,,,,,,
2,e23-022752,2023,03/22/2023,03/28/2023,M95 TRACKED CONVEYOR,DIXON,FRESNO,* from INDUSTRIAL WAY - 113N - 080W - 680S - 5...,* from INDUSTRIAL WAY,113N,...,,,,,,,,,,
3,e23-036568,2023,05/05/2023,05/11/2023,5 TROWEL MACHINES (END TO END) & MISC LEGAL FR...,ELK GROVE,CA/NV BORDER,* from GRANT LINE RD W/B ON RAMP - 099N - 051N...,* from GRANT LINE RD W/B ON RAMP,099N,...,,,,,,,,,,
4,e23-017248,2023,03/02/2023,03/08/2023,150H GRADER,FAIRFIELD,SARATOGA,* from AIR BASE PKWY N/B ON RAMP - 080W - 680S...,* from AIR BASE PKWY N/B ON RAMP,080W,...,,,,,,,,,,


In [9]:
# Write the expanded table to a CSV in the current working directory, without the index
df.to_csv("justlooking.csv", index=False)