# Oversized Overweight Vehicle Permit Route Parsing Concept Validation  
* Parses the `authorizedhighways` column of the all_permits data for Oversize/Overweight (OSOW) vehicle permits

- Developed by the Caltrans Data and Digital Services Office of Big Data and GeoAnalytics

- Originally requested by Stephen Yoon  
    - Original data provided by Stephen's office

In [1]:
# import modules
import pandas as pd
import warnings
import gcsfs
import re

In [2]:
# pull in the coordinates from the utils docs
from osow_frp_o_d_utils_v3 import origin_intersections, destination_intersections
from shs_intersections_utils import shs_intersections


In [3]:
# the original_mapping is needed to standardize the highway names for the various records 
# Keys are bare route numbers (or a lower-case street alias such as "seaside highway");
# values are the labelled route names ("I-", "SR-" or "US-" prefix plus number).
#
# NOTE(review): this is a dict literal, so when a key is written twice the LAST
# occurrence wins. "110" is listed as both "I-110" and "SR-110", and "210" as both
# "I-210" and "SR-210"; the resulting dict therefore maps "110" -> "SR-110" and
# "210" -> "SR-210". The "15" -> "SR-15" entry is commented out below, which keeps
# "15" -> "I-15". Confirm that the SR results for 110/210 are the intended ones.
original_mapping = {
    "5": "I-5", "10": "I-10", "15": "I-15", "8": "I-8", "40": "I-40", "80": "I-80", "105": "I-105", "110": "I-110",
    "205": "I-205", "210": "I-210", "215": "I-215", "280": "I-280", "380": "I-380", "405": "I-405",
    "505": "I-505", "580": "I-580", "605": "I-605", "680": "I-680", "710": "I-710", "805": "I-805",
    "880": "I-880", "980": "I-980", "1": "SR-1", "2": "SR-2", "3": "SR-3", "4": "SR-4", "7": "SR-7",
    "9": "SR-9", "11": "SR-11", "12": "SR-12", "13": "SR-13", "14": "SR-14", 
    # Left disabled so that "15" keeps resolving to "I-15" (a second "15" key would override it).
    #"15": "SR-15",
    "16": "SR-16", "17": "SR-17", "18": "SR-18", "20": "SR-20", "22": "SR-22", "23": "SR-23",
    "24": "SR-24", "25": "SR-25", "26": "SR-26", "27": "SR-27", "28": "SR-28", "29": "SR-29",
    "32": "SR-32", "33": "SR-33", "34": "SR-34", "35": "SR-35", "36": "SR-36", "37": "SR-37",
    "38": "SR-38", "39": "SR-39", "41": "SR-41", "43": "SR-43", "44": "SR-44", "45": "SR-45",
    "46": "SR-46", "seaside highway": "SR-47", "47": "SR-47", "49": "SR-49", "51": "SR-51", "52": "SR-52", "53": "SR-53",
    "54": "SR-54", "55": "SR-55", "56": "SR-56", "57": "SR-57", "58": "SR-58", "59": "SR-59",
    "60": "SR-60", "61": "SR-61", "62": "SR-62", "63": "SR-63", "65": "SR-65", "66": "SR-66",
    "67": "SR-67", "68": "SR-68", "70": "SR-70", "71": "SR-71", "72": "SR-72", "73": "SR-73",
    "74": "SR-74", "75": "SR-75", "76": "SR-76", "77": "SR-77", "78": "SR-78", "79": "SR-79",
    "82": "SR-82", "83": "SR-83", "84": "SR-84", "85": "SR-85", "86": "SR-86", "87": "SR-87",
    "88": "SR-88", "89": "SR-89", "90": "SR-90", "91": "SR-91", "92": "SR-92", "94": "SR-94",
    "96": "SR-96", "98": "SR-98", "imperial highway": "SR-99", "99": "SR-99", "103": "SR-103", "104": "SR-104", "107": "SR-107",
    "108": "SR-108", "109": "SR-109", "110": "SR-110", "111": "SR-111", "112": "SR-112",
    "113": "SR-113", "114": "SR-114", "115": "SR-115", "116": "SR-116", "118": "SR-118",
    "119": "SR-119", "120": "SR-120", "121": "SR-121", "123": "SR-123", "124": "SR-124",
    "125": "SR-125", "126": "SR-126", "127": "SR-127", "128": "SR-128", "129": "SR-129",
    "130": "SR-130", "131": "SR-131", "132": "SR-132", "133": "SR-133", "134": "SR-134",
    "135": "SR-135", "136": "SR-136", "137": "SR-137", "138": "SR-138", "139": "SR-139",
    "140": "SR-140", "142": "SR-142", "144": "SR-144", "145": "SR-145", "146": "SR-146",
    "147": "SR-147", "149": "SR-149", "150": "SR-150", "151": "SR-151", "152": "SR-152",
    "153": "SR-153", "154": "SR-154", "155": "SR-155", "156": "SR-156", "158": "SR-158",
    "160": "SR-160", "161": "SR-161", "162": "SR-162", "163": "SR-163", "164": "SR-164",
    "165": "SR-165", "166": "SR-166", "167": "SR-167", "168": "SR-168", "169": "SR-169",
    "170": "SR-170", "172": "SR-172", "173": "SR-173", "174": "SR-174", "175": "SR-175",
    "177": "SR-177", "178": "SR-178", "180": "SR-180", "182": "SR-182", "183": "SR-183",
    "184": "SR-184", "185": "SR-185", "186": "SR-186", "187": "SR-187", "188": "SR-188",
    "189": "SR-189", "190": "SR-190", "191": "SR-191", "192": "SR-192", "193": "SR-193",
    "197": "SR-197", "198": "SR-198", "200": "SR-200", "201": "SR-201", "202": "SR-202",
    "203": "SR-203", "204": "SR-204", "207": "SR-207", "210": "SR-210", "211": "SR-211",
    "213": "SR-213", "216": "SR-216", "217": "SR-217", "218": "SR-218", "219": "SR-219",
    "220": "SR-220", "221": "SR-221", "222": "SR-222", "223": "SR-223", "227": "SR-227",
    "229": "SR-229", "232": "SR-232", "233": "SR-233", "236": "SR-236", "237": "SR-237",
    "238": "SR-238", "241": "SR-241", "242": "SR-242", "243": "SR-243", "244": "SR-244",
    "245": "SR-245", "246": "SR-246", "247": "SR-247", "253": "SR-253", "254": "SR-254",
    "255": "SR-255", "259": "SR-259", "260": "SR-260", "261": "SR-261", "262": "SR-262",
    "263": "SR-263", "265": "SR-265", "266": "SR-266", "267": "SR-267", "269": "SR-269",
    "270": "SR-270", "271": "SR-271", "273": "SR-273", "275": "SR-275", "281": "SR-281",
    "282": "SR-282", "283": "SR-283", "284": "SR-284", "299": "SR-299", "330": "SR-330",
    "371": "SR-371", "780": "SR-780", "905": "SR-905", "6": "US-6", "50": "US-50",
    "95": "US-95", "97": "US-97", "101": "US-101", "199": "US-199", "395": "US-395"
}

# Build the lookup used for normalisation: each key is registered as written
# and also left-padded with zeros to widths 2 and 3 (e.g. "5", "05", "005").
# Keys that are already three or more characters long only yield themselves.
road_mapping = {}
for key, value in original_mapping.items():
    padded_variants = (key, key.zfill(2), key.zfill(3))
    road_mapping.update(dict.fromkeys(padded_variants, value))

#print(road_mapping)

In [4]:
# Base GCS prefix under which the permit workbooks named in `file_names` are stored.
gcs_path = "gs://calitp-analytics-data/data-analyses/big_data/freight/all_permits/"

In [5]:
# Excel workbooks to load. load_excel_sheets_1 takes the third "_"-separated
# token of each file name as the value of the 'year' column.
file_names = ["all_permits_2023_sampleset.xlsx",
              "all_permits_2024_sampleset.xlsx"]

In [6]:
# Step 1
def load_excel_sheets_1(gcs_path, file_names):
    """
    Load the first sheet of each Excel file from GCS into one DataFrame.

    For every file the headers are normalised (spaces removed, lower-cased),
    a 'year' column is added from the file name, the frame is reduced to the
    columns needed downstream, and rows without a 'permitnumber' are dropped.

    Parameters:
    gcs_path (str): The Google Cloud Storage prefix where the files are located.
                    A missing trailing "/" is tolerated.
    file_names (list): Excel file names under that prefix. The third
                       "_"-separated token of each name is used as the year
                       (e.g. "all_permits_2023_sampleset.xlsx" -> "2023").

    Returns:
    pd.DataFrame: A single concatenated DataFrame with data from all files, a 'year' column, and
                  records with NaN values in 'permitnumber' removed.

    Raises:
    ValueError: If file_names is empty or a file name has fewer than three
                "_"-separated tokens.
    KeyError: If a workbook lacks one of the required columns.
    """
    # NOTE: these filters are installed process-wide and stay active after the
    # function returns; later notebook cells run with warnings silenced.
    warnings.filterwarnings("ignore")
    warnings.filterwarnings(
        "ignore",
        message="Your application has authenticated using end user credentials from Google Cloud SDK without a quota project.",
        category=UserWarning,
        module="google.auth._default"
    )

    # Fail early with a clear message instead of pd.concat's generic error.
    if not file_names:
        raise ValueError("file_names is empty: at least one Excel file name is required.")

    # Define the columns to keep
    columns_to_keep = ['permitnumber', 'year', 'permitvalidfrom', 'permitvalidto',
                       'loaddescription', 'origin', 'destination', 'authorizedhighways']

    # Make sure prefix and file name are separated by exactly one "/"
    prefix = gcs_path if (not gcs_path or gcs_path.endswith('/')) else gcs_path + '/'

    # Create a Google Cloud Storage file system object
    fs = gcsfs.GCSFileSystem()

    df_list = []
    for file in file_names:
        # The year is the third "_"-separated token; drop any extension glued to it
        name_parts = file.split('_')
        if len(name_parts) < 3:
            raise ValueError(
                f"Cannot derive a year from file name {file!r}: expected at least "
                "three '_'-separated parts."
            )
        year = name_parts[2].split('.')[0]

        # Open the file and read only the first sheet
        with fs.open(f"{prefix}{file}", 'rb') as f:
            df = pd.read_excel(f, sheet_name=0)

        # Clean headers by removing spaces and making characters lowercase
        # (str() guards against non-string headers such as numbers)
        df.columns = [str(col).replace(" ", "").lower() for col in df.columns]

        df['year'] = year

        # Filter columns and remove rows with NaN in 'permitnumber'
        df_list.append(df[columns_to_keep].dropna(subset=['permitnumber']))

    # Concatenate all DataFrames into a single DataFrame
    return pd.concat(df_list, ignore_index=True)






# Parsing function to create individual route locations
# def parse_routes(route_info):
#     segments = []

#     # Split the data by "from", "to", or standalone dash patterns
#     raw_segments = re.split(r'\s*-\s*from\s+|\s*-\s*to\s+|(?<!\s)-\s*', route_info)

#     # Process each segment
#     for segment in raw_segments:
#         # Split by ' - ' or '-'
#         sub_segments = re.split(r'\s*-\s*|\s*-\s*', segment)
        
#         # Clean and add sub-segments
#         segments.extend([sub.strip() for sub in sub_segments if sub.strip()])

#     return segments

def parse_routes(route_info):
    """
    Split an 'authorizedhighways' value into an ordered list of route segments.

    The text is lower-cased, tokens that embed a 1-3 digit number between
    letters are reduced to the number (e.g. "morn10w" -> "10"), and the result
    is split on dashes, also consuming a "from"/"to" keyword that directly
    follows a dash. Empty pieces are discarded.

    Parameters:
    route_info: The raw route description. Anything that is not a string
                (None, NaN, numbers) is treated as "no route".

    Returns:
    list[str]: The stripped, lower-case segments in order of appearance;
               an empty list for non-string or empty input.
    """
    # Missing values arrive as None/NaN; re.split would raise TypeError on them.
    if not isinstance(route_info, str):
        return []

    # Isolate a number that is glued to letters, keeping only the digits
    cleaned = re.sub(r'\b[a-z]*?(\d{1,3})[a-z]*\b', r'\1', route_info.lower())

    # First pass: split on "- from", "- to", or a dash not preceded by whitespace
    raw_segments = re.split(r'\s*-\s*from\s+|\s*-\s*to\s+|(?<!\s)-\s*', cleaned)

    # Second pass: split on any remaining dash (e.g. " - ") and drop blanks
    segments = []
    for segment in raw_segments:
        pieces = (piece.strip() for piece in re.split(r'\s*-\s*', segment))
        segments.extend(piece for piece in pieces if piece)

    return segments






# Custom parsing function
def extract_location(text):
    """
    Pull a street/location name out of a route segment.

    If the text contains "from", the words following "from" are taken and cut
    off right after the first stop keyword (the keyword itself is kept).
    Otherwise the words preceding a stop keyword are returned (the keyword is
    not included).

    Parameters:
    text: A route segment. Non-string values (None, NaN) yield None.

    Returns:
    str or None: The extracted location, or None when nothing matches.
    """
    # Rows without any parsed segment hand in None/NaN; there is nothing to extract.
    if not isinstance(text, str):
        return None

    # Stop keywords pattern
    stop_keywords = r"\b(?:dr|drive|rd|ave|way|pkwy|parkway|skyway|road|avenue|blvd|boulevard|st|street|line|lane|ln|hwy|highway)\b"

    if "from" in text.lower():
        # Text after "from": a back-quoted/single-quoted phrase or a run of words
        match = re.search(r"from\s+(`.*?`|'.*?'|\w+(?:\s+\w+)*)", text, re.IGNORECASE)
        if match:
            # Keep the first stop keyword and remove everything after it
            return re.sub(
                r"(" + stop_keywords + r").*", r"\1", match.group(1), flags=re.IGNORECASE
            ).strip()
        # "from" was present as a substring but not as a usable keyword
        return None

    # No "from": capture the words in front of a stop keyword
    match = re.search(r"(`.*?`|'.*?'|\w+(?:\s+\w+)*)\s+(" + stop_keywords + r")", text, re.IGNORECASE)
    if match:
        return match.group(1).strip()

    return None  # If no match is found


# Function to clean each string 
def clean_route(route):
    """
    Reduce one route-location entry to a highway number or a street name.

    Non-strings are returned untouched. A string that starts with a number
    (optionally preceded by "rte"/"route") yields just the digits. Otherwise
    the text up to and including the first stop keyword that is not followed
    by "exit" is returned, stripped. If neither rule applies the input comes
    back unchanged.
    """
    if isinstance(route, str):
        # Stop keywords regex (including 'exit')
        stop_keywords = r"\b(?:dr|drive|rd|ave|way|pkwy|parkway|skyway|road|avenue|blvd|boulevard|st|street|line|lane|ln|hwy|highway|exit)\b"

        # Leading highway number, with optional "rte"/"route" prefix
        leading_number = re.match(r"(?:rte|route)?\s*(\d+)", route, flags=re.IGNORECASE)
        if leading_number is not None:
            return leading_number.group(1)

        # Shortest prefix ending in a stop keyword that is not followed by "exit"
        street = re.search(rf"(.*?\b{stop_keywords}(?!\s*exit)\b)", route, flags=re.IGNORECASE)
        return route if street is None else street.group(1).strip()

    return route

# A function to parse the ['authorizedhighways'] column to get the route information
def process_route_locations(df, parse_routes, extract_location, clean_route, road_mapping):
    """
    Split 'authorizedhighways' into per-stop columns and build the origin location text.

    The 'authorizedhighways' text is parsed into segments; segment 0 is turned
    into a start location (combined with 'origin' and 'state' into
    'route_location_origin'), and the remaining segments become
    'route_location_1..n' columns that are cleaned and mapped to labelled
    route names via road_mapping.

    Note: the first part of the function modifies the passed-in dataframe in
    place (column assignments, inplace drop, insert); later steps rebind `df`
    to new frames, and that final frame is what is returned.
    
    Parameters:
    df (pandas.DataFrame): The input dataframe to process. Must contain
        'authorizedhighways', 'origin' and 'destination'; must not already
        contain a 'state' column (DataFrame.insert would raise).
    parse_routes (function): Function to parse the 'authorizedhighways' column into route segments.
    extract_location (function): Function to extract location from a route segment.
    clean_route (function): Function to clean individual route location entries.
    road_mapping (dict): Mapping dictionary for road numbers to their corresponding classes.

    Returns:
    pandas.DataFrame: The processed dataframe with updated route locations and columns.
    """

    # Format the authorized highways field so the text is not all capitalized
    df['authorizedhighways'] = df['authorizedhighways'].str.capitalize()

    # Apply the parsing function to create lists of individual route locations
    df['route_segments'] = df['authorizedhighways'].apply(parse_routes)

    # Determine the maximum number of locations to create the necessary columns
    max_locations = df['route_segments'].apply(len).max()

    # Create new columns for each route location based on the maximum number of locations
    # (rows with fewer segments get None in the trailing columns)
    for i in range(max_locations):
        df[f'route_location_{i}'] = df['route_segments'].apply(lambda x: x[i] if i < len(x) else None)

    # Drop the temporary route_segments column
    df.drop(columns=['route_segments'], inplace=True)

    # Add a new column with all values set to "California"
    # NOTE(review): the hard-coded positions 5 and 8 presumably assume the column
    # layout produced by load_excel_sheets_1 - confirm if the input schema changes.
    df.insert(5, "state", "California")  # Index 5 corresponds to the 6th column position

    # Apply title case to the 'origin' and 'destination' columns
    df['origin'] = df['origin'].str.title()
    df['destination'] = df['destination'].str.title()

    # Derive the start location from the first route segment
    df["route_location_start"] = df["route_location_0"].apply(extract_location)

    # Move the route_location_start column to index 8 (the 9th column position)
    df.insert(8, "route_location_start", df.pop("route_location_start"))

    # Drop the [authorizedhighways] column
    #df.drop(columns=['authorizedhighways'], inplace=True)

    # Drop the route_location_0 field (from here on `df` is a new frame, not the caller's)
    df = df.drop(columns=['route_location_0'])

    # Identify target columns excluding "route_location_start"
    route_columns = [col for col in df.columns if col.startswith("route_location_") and col != "route_location_start"]

    # Apply the clean_route cleaning function to the target columns (columns that begin with the words "route_location")
    for col in route_columns:
        df[col] = df[col].apply(clean_route)

    # Iterate through each "route_location_" column to remove the word "exit"
    for col in route_columns:
        df[col] = df[col].apply(lambda x: str(x).replace("exit", "").strip() if isinstance(x, str) else x)

    # Cast to text; missing entries (None) become the literal string "None"
    for col in route_columns:
        df[col] = df[col].astype(str)

    # Update the road numbers to their corresponding road class numbers
    for col in route_columns:
        df[col] = df[col].astype(str).map(road_mapping).fillna(df[col])  # Keep original value if no mapping found

    # Create a new field called 'route_location_origin' that identifies the street and city/state
    df['route_location_origin'] = df['route_location_start'] + " " + df['origin'] + ", " + df['state']

    # Move the new column ('route_location_origin') to index 9 (the 10th column position)
    columns = list(df.columns)
    columns.insert(9, columns.pop(columns.index('route_location_origin')))
    df = df[columns]
    
    
    

    # Remove the helper 'route_location_start' column now that it is folded into 'route_location_origin'
    if 'route_location_start' in df.columns:
        df = df.drop(columns=['route_location_start'])
    
    return df











# A function to create the ['route_intersection_x'] columns
def _normalize_intersection_value(value):
    """Clean one intersection cell: blank out incomplete pairs, strip leading zeros."""
    if value is None:
        return ""
    if not isinstance(value, str):
        return value
    # A pair whose second half is missing (this also covers "None and None")
    if value.endswith(" and None"):
        return ""
    # Purely numeric halves lose their leading zeros, e.g. "007" -> "7"
    return " and ".join(
        part.lstrip("0") if part.isdigit() else part
        for part in value.split(" and ")
    )


def process_route_intersections(df):
    """
    Build 'route_intersection_x' columns from adjacent 'route_location_' columns.

    Every pair of neighbouring columns whose name starts with "route_location_"
    (in dataframe column order) is joined as "<left> and <right>" into
    'route_intersection_0', 'route_intersection_1', ... Pairs whose right-hand
    side is missing become empty strings, and purely numeric halves lose their
    leading zeros. The result is restricted to the core permit columns plus the
    intersection columns.

    The raw intersection columns are added to the passed-in dataframe; the
    cleaned values live only in the returned frame, which is an independent
    copy (so no chained-assignment on a slice).

    Parameters:
    df (pandas.DataFrame): Input dataframe to process. Must contain the core
        columns listed below.

    Returns:
    pandas.DataFrame: A cleaned dataframe with processed route intersections.
    """
    # Identify all columns with "route_location_" prefix
    route_location_cols = [col for col in df.columns if col.startswith("route_location_")]

    # Combine each column with its right-hand neighbour; missing values are
    # rendered as the text "None" by astype(str)
    neighbours = zip(route_location_cols, route_location_cols[1:])
    for position, (left, right) in enumerate(neighbours):
        df[f"route_intersection_{position}"] = (
            df[left].astype(str) + " and " + df[right].astype(str)
        )

    # Everything that starts with "route_intersection_" is cleaned and kept
    intersection_cols = [col for col in df.columns if col.startswith("route_intersection_")]

    core_columns = [
        "permitnumber", "year", "permitvalidfrom", "permitvalidto",
        "loaddescription", "state", "origin", "destination", "authorizedhighways", "route_location_origin"
    ]

    # .copy() makes the subset independent of the input frame
    result = df[core_columns + intersection_cols].copy()

    # Single cleaning pass per column. The former Series.replace(..., None) step
    # was dropped: it was redundant and version-dependent in pandas.
    for col in intersection_cols:
        result[col] = result[col].apply(_normalize_intersection_value)

    return result

# Function to get the last 'route_intersection_x' field
def get_last_intersection(row):
    """
    Return the last non-null 'route_intersection_x' value of a row.

    Parameters:
    row (pandas.Series): One dataframe row, indexed by column name.

    Returns:
    The last non-null value among the row's entries whose label starts with
    'route_intersection_', or None when there are no such entries or all of
    them are null.
    """
    # Read the labels from the row itself rather than from a global dataframe
    intersection_columns = [
        col for col in row.index
        if isinstance(col, str) and col.startswith('route_intersection_')
    ]
    if not intersection_columns:
        return None

    present = row[intersection_columns].dropna()
    # An all-null row would make .iloc[-1] raise IndexError
    return present.iloc[-1] if not present.empty else None

def add_route_location_destination_city(df):
    """
    Add 'route_location_destination_city' = "<last intersection> <destination>, <state>".

    "<last intersection>" is the last non-null value among the row's
    'route_intersection_' columns. Empty strings count as values, so a row
    whose last intersection column is "" yields " <destination>, <state>".
    When there is no such column, or every value in the row is null, the text
    "None" is used.

    Parameters:
    df (pandas.DataFrame): Frame with 'destination' and 'state' columns. It is
        modified in place.

    Returns:
    pandas.DataFrame: The same dataframe with the new column added.
    """
    # Column set is the same for every row, so work it out once
    intersection_columns = [col for col in df.columns if col.startswith('route_intersection_')]

    def get_last_intersection(row):
        if not intersection_columns:
            return None
        present = row[intersection_columns].dropna()
        # Guard against rows where every intersection is null (IndexError otherwise)
        return present.iloc[-1] if not present.empty else None

    # Create the new column
    df['route_location_destination_city'] = df.apply(
        lambda row: f"{get_last_intersection(row)} {row['destination']}, {row['state']}", axis=1
    )

    return df

def create_route_intersection_last(df):
    """
    Create a 'route_intersection_last' column to capture the last non-null,
    non-empty value from the numbered 'route_intersection_<n>' columns.

    Only columns whose suffix is a number take part, so helper columns such as
    'route_intersection_destination' (or a previously created
    'route_intersection_last') no longer break the numeric ordering.

    Parameters:
    df (pandas.DataFrame): The dataframe to process. It is modified in place.

    Returns:
    pandas.DataFrame: The dataframe with the new 'route_intersection_last' column.

    Raises:
    ValueError: If no numbered 'route_intersection_' column exists.
    """
    # Identify the numbered 'route_intersection_<n>' columns
    intersection_columns = [
        col for col in df.columns
        if isinstance(col, str) and re.fullmatch(r'route_intersection_\d+', col)
    ]

    if not intersection_columns:
        raise ValueError("No 'route_intersection_' columns found in the dataframe.")

    # Ensure the columns are processed in numeric order (…_2 before …_10)
    intersection_columns.sort(key=lambda name: int(name.rsplit('_', 1)[-1]))

    def _last_filled(row):
        # Walk the plain values right-to-left; reversed() on the Series itself
        # would rely on deprecated positional indexing.
        for val in reversed(row.tolist()):
            if pd.notnull(val) and val != '':
                return val
        return None

    df['route_intersection_last'] = df[intersection_columns].apply(_last_filled, axis=1)

    return df



def create_route_intersection_destination(df):
    """
    Build 'route_intersection_destination' from the last intersection and the
    destination city text, then remove the two helper columns.

    The new value is 'route_intersection_last' + " " +
    'route_location_destination_city' with runs of whitespace collapsed to a
    single space and the ends trimmed. Non-string results are left as they are.

    Parameters:
    df (pandas.DataFrame): The input dataframe to process (modified in place).

    Returns:
    pandas.DataFrame: The same dataframe with the new column added and the
    helper columns 'route_location_destination_city' and
    'route_intersection_last' dropped.
    """
    def _collapse_whitespace(text):
        if isinstance(text, str):
            return " ".join(text.split())
        return text

    joined = df['route_intersection_last'] + " " + df['route_location_destination_city']
    df['route_intersection_destination'] = joined.apply(_collapse_whitespace)

    # The helper columns are no longer needed once the combined text exists
    helper_columns = ['route_location_destination_city', 'route_intersection_last']
    df.drop(columns=helper_columns, errors='ignore', inplace=True)

    return df



# A function to clean "route_intersection" columns
def clean_route_intersections(df):
    """
    Replace "imperial highway"/"imperial hwy" (any letter case) with "SR-99"
    in every column whose name starts with "route_intersection_".

    The dataframe is modified in place and also returned.
    """
    imperial_pattern = r"(?i)\bimperial (highway|hwy)\b"

    for name in list(df.columns):
        if not name.startswith("route_intersection_"):
            continue
        df[name] = df[name].str.replace(imperial_pattern, "SR-99", regex=True)

    return df








def process_workflow(gcs_path, file_names):
    """
    Run stage one of the pipeline and return the resulting dataframe.

    Steps, in order: load the workbooks from GCS, split the routes into
    location columns, pair them into intersections, add the destination city
    text, pick the last intersection, build the destination intersection and
    finally normalise "imperial highway" mentions in the intersection columns.
    """
    frame = load_excel_sheets_1(gcs_path, file_names)
    frame = process_route_locations(frame, parse_routes, extract_location, clean_route, road_mapping)

    # Each remaining stage takes the dataframe and hands back the next version
    later_stages = (
        process_route_intersections,
        add_route_location_destination_city,
        create_route_intersection_last,
        create_route_intersection_destination,
        clean_route_intersections,
    )
    for stage in later_stages:
        frame = stage(frame)

    return frame




In [7]:
# Run stage one: load the permit workbooks and derive the route intersection columns.
df = process_workflow(gcs_path, file_names)

In [8]:
# Find all columns starting with "route_intersection_"
# (at this point that includes 'route_intersection_destination' created in stage one)
intersection_columns = [col for col in df.columns if col.startswith("route_intersection_")]


def normalize_intersection(intersection_columns):
    """
    Normalizes one intersection string by ordering highway identifiers numerically.

    If both sides of the intersection are highways (I-, SR-, or US-),
    the smaller-numbered highway is put first. The two sides are separated on
    the standalone word "and", so an "and" inside a name (e.g. "Grand Ave")
    does not split the value.

    Args:
        intersection_columns: Despite its name, a single intersection value in
            the format "Location1 and Location2". Non-string values (None, NaN,
            numbers) and empty strings are returned unchanged.

    Returns:
        str: "Location1 and Location2" with both sides stripped (and swapped
        when needed), or the original value if it does not consist of exactly
        two sides.
    """
    if not isinstance(intersection_columns, str) or not intersection_columns:
        return intersection_columns

    # Split on the word "and" only when it is surrounded by whitespace
    parts = [part.strip() for part in re.split(r"\s+and\s+", intersection_columns)]
    if len(parts) != 2:
        return intersection_columns  # Return as-is if not exactly two parts

    pattern = r"^(I-|SR-|US-)(\d+)$"  # Pattern to match highway identifiers

    match1 = re.match(pattern, parts[0])
    match2 = re.match(pattern, parts[1])

    # Swap only when both sides are highways and the first number is larger
    if match1 and match2 and int(match1.group(2)) > int(match2.group(2)):
        parts.reverse()

    return " and ".join(parts)


def process_intersections(df, intersection_columns):
    """
    Apply normalize_intersection to every listed column that exists in df.

    Args:
        df (pd.DataFrame): The input DataFrame (modified in place).
        intersection_columns (list): Names of the columns to normalize; names
            that are not present in df are skipped.

    Returns:
        pd.DataFrame: The same DataFrame with normalized intersections.
    """
    existing = (name for name in intersection_columns if name in df.columns)
    for name in existing:
        df[name] = df[name].apply(normalize_intersection)
    return df


def clean_intersections_1(df):
    """
    Rewrite Arizona-line entries in 'route_intersection_0'.

    A string value that starts with "az line" (any letter case) and ends with
    "I-40" is replaced by "colorado river bridge and I-10, California"; every
    other value is kept. If the column is missing, a message is printed and the
    dataframe is returned unchanged.

    NOTE(review): the match is on I-40 but the replacement text names I-10 -
    confirm this relabelling is intended.

    Args:
        df (pd.DataFrame): The input DataFrame (modified in place).

    Returns:
        pd.DataFrame: The same DataFrame with the cleaned column.
    """
    target = "route_intersection_0"

    if target not in df.columns:
        print("Column 'route_intersection_0' does not exist in the DataFrame.")
        return df

    def _relabel(value):
        is_az_line_i40 = (
            isinstance(value, str)
            and value.lower().startswith("az line")
            and value.endswith("I-40")
        )
        return "colorado river bridge and I-10, California" if is_az_line_i40 else value

    df[target] = df[target].apply(_relabel)
    return df



def update_route_intersection(df):
    """
    Derive 'route_intersection_origin' from 'route_intersection_0'.

    State-line entries ("az line ...", "or line ...", "nv line ...") are
    rewritten to the canonical "<route> and <State> Line" label; everything
    else is copied unchanged. The new column is placed at index 10 of the
    original column order (or at the end when there are fewer columns) and
    'route_intersection_0' is removed.

    The passed-in dataframe is not modified; a new dataframe is returned.

    Parameters:
    df (pandas.DataFrame): Frame containing 'route_intersection_0'.

    Returns:
    pandas.DataFrame: New frame with 'route_intersection_origin' in place of
    'route_intersection_0'.
    """
    # Exact-value replacements
    replacements = {
        "colorado river bridge and I-10, California": "I-10 and Arizona Line"
    }

    # Whole-value regex rules (matched case-insensitively from the start)
    pattern_replacements = [
        (r'^az line.*and I-10$', "I-10 and Arizona Line"),
        (r'^az line.*and I-8$', "I-8 and Arizona Line"),
        (r'^az line.*and SR-62$', "SR-62 and Arizona Line"),
        (r'^or line.*and I-5$', "I-5 and Oregon Line"),
        (r'^or line.*and US-97$', "US-97 and Oregon Line"),
        (r'^or line.*and US-395$', "US-395 and Oregon Line"),
        (r'^or line.*and SR-139$', "SR-139 and Oregon Line"),
        (r'^nv line.*and I-80$', "I-80 and Nevada Line"),
        (r'^nv line.*and SR-15$', "I-15 and Nevada Line"),
        # Route "15" is labelled "I-15" by the road mapping, so the I- spelling
        # must be recognised as well (previously only "SR-15" was).
        (r'^nv line.*and I-15$', "I-15 and Nevada Line"),
        (r'^nv line.*and US-6$', "US-6 and Nevada Line"),
        (r'^nv line.*and US-395$', "US-395 and Nevada Line"),
        (r'^nv line.*and SR-178$', "SR-178 and Nevada Line")
    ]

    # Work on a separate Series so the caller's frame is left untouched
    origin = df['route_intersection_0'].replace(replacements)
    for pattern, replacement in pattern_replacements:
        hits = origin.str.match(pattern, case=False, na=False)
        origin = origin.mask(hits, replacement)

    # Same placement as before: index 10 in the original layout, then the
    # source column is removed.
    cols = [col for col in df.columns if col != 'route_intersection_origin']
    cols.insert(10, 'route_intersection_origin')
    cols.remove('route_intersection_0')

    return df.assign(route_intersection_origin=origin)[cols]


def update_route_intersection_destination(df):
    """
    Canonicalise state-line destinations in 'route_intersection_destination'.

    Any value that contains one of the "<route> and <xx> line" fragments below
    (case-insensitive) is replaced in full by the corresponding
    "<route> and <State> Line" label; other values are kept. A single rule
    table is used so that exact and partial matches give the same label
    ("SR-15 and nv line" now always becomes "I-15 and Nevada Line").

    The dataframe is modified in place; as before, the column ends up as the
    last column.

    Parameters:
    df (pandas.DataFrame): Frame containing 'route_intersection_destination'.

    Returns:
    pandas.DataFrame: The same dataframe with the updated column.
    """
    # (fragment to look for, label that replaces the whole value)
    pattern_replacements = [
        (r'I-40 and az line', "I-40 and Arizona Line"),
        (r'I-10 and az line', "I-10 and Arizona Line"),
        (r'I-8 and az line', "I-8 and Arizona Line"),
        (r'SR-62 and az line', "SR-62 and Arizona Line"),
        (r'I-80 and nv line', "I-80 and Nevada Line"),
        (r'I-5 and or line', "I-5 and Oregon Line"),
        (r'SR-15 and nv line', "I-15 and Nevada Line"),
        (r'I-15 and nv line', "I-15 and Nevada Line")
    ]

    destination = df['route_intersection_destination']
    for pattern, replacement in pattern_replacements:
        hits = destination.str.contains(pattern, case=False, na=False)
        destination = destination.mask(hits, replacement)

    # Remove and re-add so the column keeps its former (last) position
    del df['route_intersection_destination']
    df['route_intersection_destination'] = destination

    return df




In [9]:
def process_workflow_step2(df):
    """
    Run stage two of the pipeline on the stage-one dataframe.

    Steps, in order: normalise every 'route_intersection_' column, rewrite
    Arizona-line entries in 'route_intersection_0', derive
    'route_intersection_origin', and canonicalise
    'route_intersection_destination'.

    Parameters:
    df (pandas.DataFrame): Output of process_workflow.

    Returns:
    pandas.DataFrame: The dataframe after all stage-two steps.
    """
    # Take the column list from the frame being processed instead of relying on
    # a notebook-level global that may be missing or stale.
    intersection_columns = [col for col in df.columns if col.startswith("route_intersection_")]

    df = process_intersections(df, intersection_columns)
    df = clean_intersections_1(df)
    df = update_route_intersection(df)
    df = update_route_intersection_destination(df)

    return df
    
    

In [10]:
# Run stage two: normalise the intersections and derive the origin/destination intersection labels.
df = process_workflow_step2(df)

In [11]:
def add_intersection_coordinates(df, origin_intersections, destination_intersections):
    """
    Look up x/y coordinates for the origin and destination intersections.

    Each list of dictionaries is turned into a name -> (x_coords, y_coords)
    table; entries without the name key are ignored. Intersections that are not
    in the table get None for both coordinates.

    The two origin coordinate columns are added to the passed-in dataframe and
    then moved to list positions 11 and 12 (to the end when the frame has fewer
    columns); the destination coordinate columns are placed directly after
    'route_intersection_destination' in the returned frame.

    Parameters:
    df (pd.DataFrame): Frame with 'route_intersection_origin' and
        'route_intersection_destination'.
    origin_intersections (list of dict): Records with 'origin_intersection',
        'x_coords', 'y_coords'.
    destination_intersections (list of dict): Records with
        'destination_intersection', 'x_coords', 'y_coords'.

    Returns:
    pd.DataFrame: Reordered dataframe including the four coordinate columns.
    """
    no_match = (None, None)

    def _build_lookup(records, key_field):
        table = {}
        for record in records:
            if key_field in record:
                table[record.get(key_field, None)] = (
                    record.get("x_coords", None), record.get("y_coords", None)
                )
        return table

    def _coordinate(names, table, axis):
        return names.map(lambda name: table.get(name, no_match)[axis])

    def _relocate(ordered, column, position):
        ordered.insert(position, ordered.pop(ordered.index(column)))

    origin_lookup = _build_lookup(origin_intersections, "origin_intersection")
    destination_lookup = _build_lookup(destination_intersections, "destination_intersection")

    # Origin coordinates (written onto the incoming frame)
    origin_names = df["route_intersection_origin"]
    df["route_intersection_origin_x_coords"] = _coordinate(origin_names, origin_lookup, 0)
    df["route_intersection_origin_y_coords"] = _coordinate(origin_names, origin_lookup, 1)

    ordered = list(df.columns)
    _relocate(ordered, "route_intersection_origin_x_coords", 11)
    _relocate(ordered, "route_intersection_origin_y_coords", 12)
    df = df[ordered]

    # Destination coordinates
    destination_names = df["route_intersection_destination"]
    df["route_intersection_destination_x_coords"] = _coordinate(destination_names, destination_lookup, 0)
    df["route_intersection_destination_y_coords"] = _coordinate(destination_names, destination_lookup, 1)

    # Put them right behind the destination intersection column
    anchor = df.columns.get_loc("route_intersection_destination") + 1
    ordered = list(df.columns)
    _relocate(ordered, "route_intersection_destination_x_coords", anchor)
    _relocate(ordered, "route_intersection_destination_y_coords", anchor + 1)

    return df[ordered]

In [12]:
# Attach x/y coordinates for the origin and destination intersections using the imported lookup lists.
df = add_intersection_coordinates(df, origin_intersections, destination_intersections)

In [13]:
def add_mid_route_coordinates(df, intersections=None):
    """
    Adds 'x_coords' and 'y_coords' fields next to each mid-route intersection
    and moves all route_intersection-related columns to the end.

    Intersection names are normalized before the lookup: surrounding and
    repeated whitespace is removed, text is upper-cased and " ," becomes ",".
    Missing values stay missing instead of being turned into text.

    Parameters:
    df (pd.DataFrame): Input dataframe with multiple route_intersection_X fields.
        The input is not modified; a new dataframe is returned.
    intersections (list[dict] | None): Optional lookup records carrying the
        "shs_intersection", "x_coords" and "y_coords" keys. When omitted, the
        module-level shs_intersections list is used.

    Returns:
    pd.DataFrame: Updated dataframe with coordinate fields added next to each intersection.
    """
    if intersections is None:
        intersections = shs_intersections

    def clean_name(value):
        # Keep missing values missing; casting them to str would store
        # placeholder text that later null checks mistake for a real name.
        if pd.isna(value):
            return None
        text = " ".join(str(value).split()).upper()
        return text.replace(" ,", ",")

    def to_lookup_key(cleaned):
        # Lookup keys ignore the trailing state name.
        if cleaned is None:
            return None
        return cleaned.replace(", CALIFORNIA", "").strip()

    # Create a lookup dictionary for SHS intersections, normalized the same
    # way as the dataframe values so both sides of the match agree.
    shs_lookup = {}
    for record in intersections:
        if "shs_intersection" not in record:
            continue
        key = to_lookup_key(clean_name(record["shs_intersection"]))
        if key is None:
            continue
        shs_lookup[key] = (record.get("x_coords"), record.get("y_coords"))

    # Identify mid-route intersection columns (excluding origin and destination)
    mid_route_cols = sorted(
        [
            col for col in df.columns
            if isinstance(col, str)
            and col.startswith("route_intersection_")
            and col.split("_")[-1].isdigit()
        ],
        key=lambda name: int(name.split("_")[-1]),  # Sort numerically
    )

    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    not_found = (None, None)
    for col in mid_route_cols:
        df[col] = df[col].map(clean_name)
        keys = df[col].map(to_lookup_key)
        df[f"{col}_x_coords"] = keys.map(lambda key: shs_lookup.get(key, not_found)[0])
        df[f"{col}_y_coords"] = keys.map(lambda key: shs_lookup.get(key, not_found)[1])

    # Define route-related columns to move to the end: origin first, then each
    # mid-route intersection with its coordinates, then the destination
    route_cols = [
        "route_intersection_origin",
        "route_intersection_origin_x_coords",
        "route_intersection_origin_y_coords",
    ]
    for col in mid_route_cols:
        route_cols.extend([col, f"{col}_x_coords", f"{col}_y_coords"])
    route_cols.extend([
        "route_intersection_destination",
        "route_intersection_destination_x_coords",
        "route_intersection_destination_y_coords",
    ])

    # Non-route columns stay at the front; only route columns that exist are kept
    route_col_set = set(route_cols)
    non_route_cols = [col for col in df.columns if col not in route_col_set]
    final_col_order = non_route_cols + [col for col in route_cols if col in df.columns]

    return df[final_col_order]

In [14]:
# Add x/y coordinate columns for every numbered mid-route intersection and
# move the route_intersection columns to the end of the dataframe.
df = add_mid_route_coordinates(df)

In [15]:
#df.to_csv("osow_vehicle_permits_authorizedhighways_cv.csv", index=False)

In [16]:
def update_and_remove_destination_fields(df):
    """
    Update the last available intersection's coordinates with the destination coordinates,
    then remove the destination fields.

    The last available intersection of a row is its highest-numbered
    route_intersection_<n> column holding a non-null, non-blank value. Every
    numbered column present in the dataframe is considered. Rows without both
    destination coordinates are left unchanged.

    Parameters:
        df (pd.DataFrame): DataFrame containing route intersection fields.
            The input is not modified; a new dataframe is returned.

    Returns:
        pd.DataFrame: Updated DataFrame with destination fields removed.
    """
    prefix = 'route_intersection_'
    dest_x_col = 'route_intersection_destination_x_coords'
    dest_y_col = 'route_intersection_destination_y_coords'

    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    if dest_x_col in df.columns and dest_y_col in df.columns:
        # Numbered intersection columns, highest number first
        numbered = sorted(
            (
                (int(col[len(prefix):]), col)
                for col in df.columns
                if isinstance(col, str)
                and col.startswith(prefix)
                and col[len(prefix):].isdecimal()
            ),
            reverse=True,
        )

        # Rows still waiting for their last intersection to be found.
        # Positional boolean arrays keep this correct with a non-unique index.
        pending = (df[dest_x_col].notna() & df[dest_y_col].notna()).to_numpy()

        for _, name_col in numbered:
            if not pending.any():
                break
            names = df[name_col]
            # str() first so a non-string value cannot break the blank check
            has_name = (names.notna() & names.astype(str).str.strip().ne("")).to_numpy()
            target = pending & has_name
            if target.any():
                df.loc[target, f'{name_col}_x_coords'] = df.loc[target, dest_x_col].to_numpy()
                df.loc[target, f'{name_col}_y_coords'] = df.loc[target, dest_y_col].to_numpy()
            # Walking from the highest number down, the first populated column
            # is the last intersection, so these rows are finished.
            pending = pending & ~has_name

    # Drop the destination fields from the dataset
    return df.drop(
        columns=[
            'route_intersection_destination',
            dest_x_col,
            dest_y_col,
        ],
        errors='ignore',
    )

In [17]:
# Copy each row's destination coordinates onto its last populated intersection,
# then drop the destination columns.
df = update_and_remove_destination_fields(df)

In [18]:
#df.to_csv("osow_vehicle_permits_authorizedhighways_cv_v1.csv", index=False)

In [19]:
def clean_and_shift_intersections(df):
    """
    Cleans route intersection fields by:
    1. Removing intersections that lack x_coords and y_coords.
    2. Shifting remaining values left to fill gaps.
    3. Removing columns that are empty after shifting.

    Every route_intersection_<n> column present in the dataframe is processed,
    in numeric order. The dataframe is modified in place and also returned.

    Parameters:
        df (pd.DataFrame): DataFrame containing route intersection fields.

    Returns:
        pd.DataFrame: Cleaned DataFrame.
    """
    prefix = 'route_intersection_'
    numbered = sorted(
        (int(col[len(prefix):]), col)
        for col in df.columns
        if isinstance(col, str)
        and col.startswith(prefix)
        and col[len(prefix):].isdecimal()
    )
    if not numbered:
        return df

    # One (name, x, y) column triple per intersection slot, in numeric order
    slots = [(col, f'{col}_x_coords', f'{col}_y_coords') for _, col in numbered]
    row_count = len(df)

    def column_values(col):
        # A slot whose coordinate column is absent counts as having no coordinates
        return df[col].tolist() if col in df.columns else [None] * row_count

    # per_slot[s][r] is the (name, x, y) entry of slot s in row r.
    # Reading positionally keeps this correct with a non-unique index.
    per_slot = [list(zip(*(column_values(col) for col in slot))) for slot in slots]

    blank = (None, None, None)
    compacted_rows = []
    for row_entries in zip(*per_slot):
        # Keep only intersections that have a name and both coordinates
        kept = [entry for entry in row_entries if all(pd.notna(v) for v in entry)]
        # Pad on the right so the kept entries sit in the leftmost slots
        kept.extend([blank] * (len(slots) - len(kept)))
        compacted_rows.append(kept)

    # Write the shifted values back one whole column at a time
    empty_cols = []
    for slot_pos, slot in enumerate(slots):
        for field_pos, col in enumerate(slot):
            df[col] = [row[slot_pos][field_pos] for row in compacted_rows]
        if df[slot[0]].isna().all():  # Check if entire column is empty
            empty_cols.extend(slot)

    # Drop empty intersection columns
    df.drop(columns=empty_cols, inplace=True)

    return df

In [20]:
# Drop intersections that have no coordinates, shift the remaining ones left,
# and remove intersection columns that end up empty.
df = clean_and_shift_intersections(df)

In [21]:
# Export the cleaned dataframe to CSV; index=False leaves the dataframe index out of the file.
# df.to_csv("osow_vehicle_permits_authorizedhighways_cv_v3.csv", index=False)
df.to_csv("osow_vehicle_permits_authorizedhighways_cv_v4.csv", index=False)

In [22]:
# check the shape of the data as (rows, columns)
# current shape == (2399, 67)
df.shape

(2399, 67)