# Matching the location of each TCR to contracts

We start by reading the cleaned and matched TCRs for 2024, which have already been matched with contract numbers, as well as Karin's matching between bandel and the number of hours of servicefönster.

## Förbindelser and BIS (for mapping Bandelnr <-> Kontrakt)

Load the Excel file containing the dictionary for bandel matching, and keep only relevant columns and bandel.

In [31]:
import pandas as pd

# Path (relative to the notebook's working directory) of the Excel workbook
# used as the bandel / station / förbindelse dictionary.
dictionary_file_path = "Förbindelselinje_2023_alla.xlsx"

# Read the workbook into a DataFrame. No sheet_name is given, so pandas'
# default sheet is used. Later cells read the columns BdlNr, Bandel,
# Plats_sign, Plats, Forbind and Banlangd from this frame.
dictionary_df = pd.read_excel(dictionary_file_path)

In [32]:
# Station rows: total Banlangd per (BdlNr, Bandel, Plats_sign, Plats).
# The 'Forbind' column is added as all-missing so both halves share one schema.
grouped_by_plats = (
    dictionary_df
    .groupby(['BdlNr', 'Bandel', 'Plats_sign', 'Plats'])['Banlangd']
    .sum()
    .reset_index()
    .assign(Forbind=pd.NA)
)

# Link rows: total Banlangd per (BdlNr, Bandel, Forbind).
# The station columns are added as all-missing for the same reason.
grouped_by_forbind = (
    dictionary_df
    .groupby(['BdlNr', 'Bandel', 'Forbind'])['Banlangd']
    .sum()
    .reset_index()
    .assign(Plats_sign=pd.NA, Plats=pd.NA)
)

# Stack station rows on top of link rows.
combined_df = pd.concat([grouped_by_plats, grouped_by_forbind], ignore_index=True, sort=False)

# NOTE: this rebinds `dictionary_df` from the raw workbook to the aggregated
# station/link table; every later cell works on the aggregated version.
dictionary_df = combined_df[['BdlNr', 'Bandel', 'Plats_sign', 'Plats', 'Forbind', 'Banlangd']]

Load the excel file containing BIS information for mapping the bandel number with contract name. The file name is BIS_24_kontrakt_bandel_plats.xlsx and has sheet BIS 2024-01-09 with columns such as Bandel_nummer, UH_kontraktsområde.

In [33]:
# Workbook and sheet holding the BIS bandel -> contract-area information
excel_file_path = "BIS_24_kontrakt_bandel_plats.xlsx"
sheet_name = "BIS 2024-01-09"

bis_df = pd.read_excel(excel_file_path, sheet_name=sheet_name)

# Unique (bandel, contract area) pairs, keeping only bandels that actually
# belong to a contract: the value is present and is not the placeholder
# 'Ingår inte i något kontrakt'.
bandel_to_contract_map = (
    bis_df[['Bandel_nummer', 'UH_kontraktsområde']]
    .drop_duplicates()
    .loc[
        lambda pairs: pairs['UH_kontraktsområde'].notna()
        & pairs['UH_kontraktsområde'].ne('Ingår inte i något kontrakt')
    ]
)

# Plain dict for fast lookups. If a bandel is listed under several contract
# areas, the last pair in the frame is the one kept by to_dict().
bandel_contract_dict = bandel_to_contract_map.set_index('Bandel_nummer')['UH_kontraktsområde'].to_dict()

## Karin's bandelar

In [34]:
# Step 1: Load the Excel file containing service contracts for each bandel.
# NOTE: this rebinds `excel_file_path` and `sheet_name`, which the BIS cell
# above also uses. The commented-out assignments are earlier alternatives.
#excel_file_path = "servicekontrakt_per_bandel_Abdou.xlsx"
excel_file_path = "more_servicekontrakt_per_bandel.xlsx"

#sheet_name = "uppdaterad"
sheet_name = "tid per bandel"

# Read the sheet selected above ("tid per bandel") into a DataFrame
servicekontrakt_df = pd.read_excel(excel_file_path, sheet_name=sheet_name)

In [35]:
# Keep only rows whose third column (selected by position, iloc[:, 2]) is
# not missing. Presumably that column is 'Bandel', which the next cell
# parses — TODO confirm against the sheet layout.
servicekontrakt_df = servicekontrakt_df[servicekontrakt_df.iloc[:, 2].notna()]

In [36]:
# Parse the 'Bandel' column into two new columns 'Bandelnr' and 'Bandelnamn'
def parse_bandel(bandel):
    """Split a 'Bandel' cell into its leading number(s) and the remaining name.

    A leading run of digits, optionally continued by '/'-separated digit
    groups (e.g. ``"451/452 ..."``), is taken as the bandel number(s) and
    returned with every '/' replaced by ', '. Whatever follows is returned
    stripped of surrounding whitespace. If the text does not start with a
    digit, the number part is the empty string.

    Returns:
        pd.Series: ``[bandelnr, bandelnamn]``.
    """
    import re

    number_prefix = re.match(r'^(\d+(?:/\d+)*)', bandel)
    if number_prefix is None:
        return pd.Series(['', bandel.strip()])

    raw_numbers = number_prefix.group(0)
    remainder = bandel[len(raw_numbers):]
    return pd.Series([raw_numbers.replace('/', ', '), remainder.strip()])

# Apply the parsing function row by row. Because parse_bandel returns a
# two-element Series, the result is a two-column frame that is assigned to
# the new columns 'Bandelnr' and 'Bandelnamn'.
servicekontrakt_df[['Bandelnr', 'Bandelnamn']] = servicekontrakt_df['Bandel'].apply(parse_bandel)

In [37]:
# parse the column Tidsperiod (20XX - 20YY) into two new columns 'Start_year' and 'End_year'
def parse_tidsperiod(tidsperiod):
    """Split a 'Tidsperiod' cell such as ``"2020 - 2024"`` into start and end year.

    All whitespace (including non-breaking spaces) is removed before matching,
    and the separator may be a hyphen, en dash or em dash, since spreadsheet
    editors often substitute those silently.

    Args:
        tidsperiod: The raw cell value. Non-string values (e.g. NaN for an
            empty cell) are treated as unparseable instead of raising.

    Returns:
        pd.Series: ``[start_year, end_year]`` as ints, or ``[None, None]``
        when the value is not of the form ``YYYY-YYYY``.
    """
    import re

    # Empty Excel cells arrive as float NaN, which has no string methods.
    if not isinstance(tidsperiod, str):
        return pd.Series([None, None])

    compact = re.sub(r'\s+', '', tidsperiod)
    tidsperiod_match = re.match(r'^(\d{4})[-\u2013\u2014](\d{4})$', compact)
    if tidsperiod_match is None:
        return pd.Series([None, None])

    start_year = int(tidsperiod_match.group(1))
    end_year = int(tidsperiod_match.group(2))
    return pd.Series([start_year, end_year])

# Apply the parsing function row by row; the two-element Series it returns
# becomes the new columns 'Start_year' and 'End_year'. Rows whose period
# could not be parsed get missing values in both columns.
servicekontrakt_df[['Start_year', 'End_year']] = servicekontrakt_df['Tidsperiod'].apply(parse_tidsperiod)

In [38]:
# Keep only contracts that start before 2024 (i.e. remove rows where
# Start_year is 2024 or later). Rows with a missing Start_year are dropped
# as well, because a comparison against a missing value is False.
servicekontrakt_df_T23 = servicekontrakt_df[servicekontrakt_df['Start_year'] < 2024]

Let us add the corresponding distances (lengths) for the identified bandels using the dictionary.

In [39]:
import networkx as nx
import pandas as pd

# Notebook-level state shared by the length helpers below:
#   langd_cache  - memo of calculate_sum_langd results, keyed by the förbind string
#   GLOBAL_GRAPH - station graph, built lazily by initialize_global_graph
langd_cache = {}
GLOBAL_GRAPH = None

# Step 1: Create a mapping from Plats_sign (station short code) to Banlangd.
# NOTE(review): if the same Plats_sign occurs on several bandels, to_dict()
# keeps only the last row's Banlangd — confirm that this is intended.
station_length_lookup = dictionary_df.set_index('Plats_sign')['Banlangd'].to_dict()

### Utility Functions ###

def initialize_global_graph(dictionary_df):
    """Build the shared station graph on first use; later calls do nothing.

    Every row of ``dictionary_df`` with 2 <= BdlNr <= 990 and a non-missing
    'Forbind' value of the form ``"A-B"`` becomes an undirected edge between
    the stripped station codes A and B, weighted by the row's 'Banlangd' in
    the edge attribute ``length``. A later row for the same pair of stations
    overwrites the earlier edge's length.
    """
    global GLOBAL_GRAPH
    if GLOBAL_GRAPH is not None:
        return

    within_range = (dictionary_df['BdlNr'] >= 2) & (dictionary_df['BdlNr'] <= 990)
    bdl_df = dictionary_df[within_range]

    # Undirected, so a link can be traversed in either direction.
    GLOBAL_GRAPH = nx.Graph()
    for forbind, banlangd in zip(bdl_df['Forbind'], bdl_df['Banlangd']):
        if pd.isna(forbind):
            continue  # station rows carry no link
        start, end = forbind.split('-')
        GLOBAL_GRAPH.add_edge(start.strip(), end.strip(), length=banlangd)

def calculate_sum_langd(forbind_list, dictionary_df):
    """Return the total length of the route between the first and last station.

    ``forbind_list`` is a string of '-'-separated station codes (several such
    segments may be joined with ','). Only the first and the last station are
    used: the route between them is the shortest path in ``GLOBAL_GRAPH``
    weighted by the edge attribute ``length``.

    The result is the sum of
      * the lengths of the links along that path,
      * the station lengths (``station_length_lookup``) of every intermediate
        station on the path, and
      * the station length of the first / last station, unless that station
        is written in parentheses, e.g. ``"(A)-B"``.

    Results (including failures) are memoised in ``langd_cache``.

    Args:
        forbind_list: Route description; missing values and blank strings
            give ``None``.
        dictionary_df: Station/link table, used only to build the graph the
            first time it is needed.

    Returns:
        The total length, or ``None`` when the input is empty, an end station
        is not in the graph, or the two stations are not connected.
    """
    # Missing values (None / NaN) and blank strings describe no route.
    if not isinstance(forbind_list, str) or not forbind_list.strip():
        return None

    cache_key = forbind_list
    if cache_key in langd_cache:
        return langd_cache[cache_key]

    if GLOBAL_GRAPH is None:
        initialize_global_graph(dictionary_df)

    # Graph nodes are stored stripped, so strip every station token as well.
    stations = [
        station.strip()
        for forbind in forbind_list.split(',')
        for station in forbind.split('-')
    ]
    first_station = stations[0]
    last_station = stations[-1]

    # A station in parentheses marks an end point whose own length is excluded.
    include_first_station = not (first_station.startswith('(') and first_station.endswith(')'))
    include_last_station = not (last_station.startswith('(') and last_station.endswith(')'))

    first_station_cleaned = first_station.strip('()')
    last_station_cleaned = last_station.strip('()')

    if first_station_cleaned not in GLOBAL_GRAPH or last_station_cleaned not in GLOBAL_GRAPH:
        langd_cache[cache_key] = None
        return None

    try:
        # Use ONE weighted shortest path for both the link lengths and the
        # intermediate stations. Previously the stations came from an
        # unweighted (fewest-hops) path, which can be a different route
        # than the one whose length was measured.
        shortest_path_stations = nx.shortest_path(
            GLOBAL_GRAPH,
            source=first_station_cleaned,
            target=last_station_cleaned,
            weight='length'
        )
    except nx.NetworkXNoPath:
        langd_cache[cache_key] = None
        return None  # No path found

    path_length = sum(
        GLOBAL_GRAPH[start][end]['length']
        for start, end in zip(shortest_path_stations, shortest_path_stations[1:])
    )

    intermediate_stations = shortest_path_stations[1:-1]  # Exclude first and last station
    station_length_sum = sum(station_length_lookup.get(station, 0) for station in intermediate_stations)

    # Add lengths of first and last stations based on inclusion rules
    if include_first_station:
        station_length_sum += station_length_lookup.get(first_station_cleaned, 0)
    if include_last_station:
        station_length_sum += station_length_lookup.get(last_station_cleaned, 0)

    total_length = path_length + station_length_sum
    langd_cache[cache_key] = total_length
    return total_length

In [40]:
# import networkx as nx

# # Global cache for lengths
# langd_cache = {}

# # Step 1: Create a mapping from Plats_sign (full name) to Banlangd
# station_length_lookup = dictionary_df.set_index('Plats_sign')['Banlangd'].to_dict()

# ### Utility Functions ###

# # Function to build a bidirectional graph from the DataFrame
# def build_bidirectional_graph(dictionary_df, bdl_range):
#     # Filter for the given BdlNr range
#     bdl_df = dictionary_df[(dictionary_df['BdlNr'] >= bdl_range[0]) & (dictionary_df['BdlNr'] <= bdl_range[1])]
    
#     G = nx.Graph()  # Undirected graph to simulate bidirectional connections
#     for _, row in bdl_df.iterrows():
#         if pd.notna(row['Forbind']):
#             start, end = row['Forbind'].split('-')
#             length = row['Banlangd']
#             G.add_edge(start.strip(), end.strip(), length=length)  # Add bidirectional edges
#     return G


# def calculate_sum_langd(forbind_list, identified_bdlnr, dictionary_df):
#     if not forbind_list or forbind_list == '':
#         return None
    
#     # Check cache
#     cache_key = (forbind_list, identified_bdlnr)
#     if cache_key in langd_cache:
#         return langd_cache[cache_key]
    
#     # Split and clean the forbind_list
#     forbinds = [f.strip() for f in forbind_list.split(',')]
#     stations = [station for forbind in forbinds for station in forbind.split('-')]
#     first_station = stations[0]
#     last_station = stations[-1]
    
#     # Check if first and last stations are enclosed in parentheses
#     include_first_station = not (first_station.startswith('(') and first_station.endswith(')'))
#     include_last_station = not (last_station.startswith('(') and last_station.endswith(')'))
    
#     # Remove parentheses for lookup in the graph
#     first_station_cleaned = first_station.strip('()')
#     last_station_cleaned = last_station.strip('()')
    
#     # Build the smaller graph first
#     small_range = (identified_bdlnr, identified_bdlnr)
#     graph = build_bidirectional_graph(dictionary_df, small_range)
    
#     # Try pathfinding in the small graph
#     try:
#         if first_station_cleaned in graph and last_station_cleaned in graph:
#             path_length = nx.shortest_path_length(
#                 graph, source=first_station_cleaned, target=last_station_cleaned, weight='length'
#             )
            
#             # Calculate length of intermediate stations
#             shortest_path_stations = nx.shortest_path(graph, source=first_station_cleaned, target=last_station_cleaned)
#             intermediate_stations = shortest_path_stations[1:-1]  # Exclude first and last station
#             station_length_sum = sum(station_length_lookup.get(station, 0) for station in intermediate_stations)

#             # Add lengths of first and last stations based on inclusion rules
#             if include_first_station:
#                 station_length_sum += station_length_lookup.get(first_station_cleaned, 0)
#             if include_last_station:
#                 station_length_sum += station_length_lookup.get(last_station_cleaned, 0)

#             total_length = path_length + station_length_sum
#             langd_cache[cache_key] = total_length
#             return total_length
#     except nx.NetworkXNoPath:
#         pass  # Path not found, fall back to larger graph
    
#     # Build the larger graph if needed
#     large_range = (2,990)#(max(1, identified_bdlnr - 900), min(990, identified_bdlnr + 900))
#     graph = build_bidirectional_graph(dictionary_df, large_range)
    
#     # Check again in the larger graph
#     if first_station_cleaned not in graph or last_station_cleaned not in graph:
#         langd_cache[cache_key] = None
#         return None
    
#     try:
#         path_length = nx.shortest_path_length(
#             graph, source=first_station_cleaned, target=last_station_cleaned, weight='length'
#         )
        
#         # Calculate length of intermediate stations
#         shortest_path_stations = nx.shortest_path(graph, source=first_station_cleaned, target=last_station_cleaned)
#         intermediate_stations = shortest_path_stations[1:-1]  # Exclude first and last station
#         station_length_sum = sum(station_length_lookup.get(station, 0) for station in intermediate_stations)
        
#         # Add lengths of first and last stations based on inclusion rules
#         if include_first_station:
#             station_length_sum += station_length_lookup.get(first_station_cleaned, 0)
#         if include_last_station:
#             station_length_sum += station_length_lookup.get(last_station_cleaned, 0)

#         total_length = path_length + station_length_sum
#         langd_cache[cache_key] = total_length
#         return total_length
#     except nx.NetworkXNoPath:
#         langd_cache[cache_key] = None
#         return None  # No path found

In [41]:
# Mapping from Plats_sign (short code) to Plats (full name). Despite its
# name, the dict is keyed by the short code; convert_bandelnamn_to_codes
# inverts it to look up codes by (lower-cased) full name.
name_to_code_mapping = dictionary_df.set_index('Plats_sign')['Plats'].to_dict()

def convert_bandelnamn_to_codes(bandelnamn):
    """Translate a '-'-separated list of station names into station codes.

    Each name is looked up case-insensitively among the full names in the
    notebook-level ``name_to_code_mapping`` (code -> full name). If the name
    itself is unknown, ``"<name> central"`` and then ``"<name>s central"``
    are tried. Names written in parentheses are looked up without them, and
    the parentheses are put back around the resulting code.

    Returns:
        dict with
          * ``station_details``: one dict per name (``original_name``,
            ``tried_name``, ``station_code``),
          * ``station_codes``: the codes that were found, in order,
          * ``short_path``: those codes joined with '-', or ``None`` if no
            name could be translated.
    """
    # Inverted, lower-cased view of the mapping: full name -> code.
    code_by_lower_name = {
        str(full_name).lower(): str(sign)
        for sign, full_name in name_to_code_mapping.items()
    }

    station_details = []
    station_codes = []

    for part in bandelnamn.split('-'):
        original_name = str(part).strip()
        in_parentheses = original_name.startswith('(') and original_name.endswith(')')
        bare_name = original_name[1:-1] if in_parentheses else original_name

        # Try the name as written first, then the two "central" variants.
        # `tried_name` stays "" unless a variant had to be attempted.
        tried_name = ""
        code = None
        candidates = (bare_name, f"{bare_name} central", f"{bare_name}s central")
        for attempt, candidate in enumerate(candidates):
            if attempt > 0:
                tried_name = candidate
            code = code_by_lower_name.get(candidate.lower())
            if code is not None:
                break

        station_details.append({
            'original_name': original_name,
            'tried_name': tried_name if code and code != bare_name else None,
            'station_code': code
        })

        if code:
            station_codes.append(f"({code})" if in_parentheses else code)

    return {
        'station_details': station_details,
        'station_codes': station_codes,
        'short_path': '-'.join(station_codes) if station_codes else None
    }


# Step 2: Prepare the dataframe with detailed conversion results
def prepare_bandelnamn_conversion(servicekontrakt_df):
    """Return a copy of the frame with the station-code conversion columns added.

    ``convert_bandelnamn_to_codes`` is applied to every 'Bandelnamn' value and
    its result is unpacked into four new columns: 'station_details',
    'original_station_names', 'station_codes' and 'short_path'. The input
    frame is not modified.
    """
    converted = servicekontrakt_df['Bandelnamn'].apply(convert_bandelnamn_to_codes)

    enriched = servicekontrakt_df.copy()
    enriched['station_details'] = converted.apply(lambda result: result['station_details'])
    enriched['original_station_names'] = enriched['station_details'].apply(
        lambda entries: [entry['original_name'] for entry in entries]
    )
    # The remaining two columns are direct copies of fields in the result dict.
    for column in ('station_codes', 'short_path'):
        enriched[column] = converted.apply(lambda result, key=column: result[key])

    return enriched


def _parse_bandel_numbers(raw_bandel):
    """Return every bandel number found in ``raw_bandel`` as a list of ints.

    Accepts a number or a string such as ``"451"`` or ``"451, 452"``; anything
    that cannot be read as a number (None, NaN, '') is skipped.
    """
    tokens = raw_bandel.split(',') if isinstance(raw_bandel, str) else [raw_bandel]
    numbers = []
    for token in tokens:
        try:
            numbers.append(int(float(token)))
        except (TypeError, ValueError, OverflowError):
            continue
    return numbers


def calculate_sum_langd_for_bandelnamn(row, dictionary_df):
    """Return the length covered by one row (a TCR row or a contract row).

    The row kind is detected from its columns:
      * TCR rows use 'identified_BdlNr' and 'förbind_list';
      * contract rows use 'Bandelnr' and 'short_path'.

    A route without '-' is a single station: its length is the sum of
    'Banlangd' over the ``dictionary_df`` rows with that 'Plats_sign' on the
    row's bandel(s). Only the station's own rows are counted; adjacent links
    are not added. Any other route is delegated to ``calculate_sum_langd``.

    Args:
        row: A Series (or mapping) holding the columns described above.
        dictionary_df: Station/link table with 'Plats_sign', 'BdlNr' and
            'Banlangd'.

    Returns:
        The length, or ``None`` when the route is missing, the bandel of a
        single station is unknown, or nothing matches.
    """
    if 'identified_BdlNr' in row and pd.notna(row['identified_BdlNr']):  # TCR row
        raw_bandel = row['identified_BdlNr']
        row_forbind = row['förbind_list']
    elif 'Bandelnr' in row:  # contract row
        raw_bandel = row['Bandelnr']
        row_forbind = row['short_path']
    else:
        # TCR row whose bandel could not be identified: previously this fell
        # into the contract branch and raised KeyError on 'Bandelnr'.
        raw_bandel = None
        row_forbind = row.get('förbind_list')

    # No usable route (None, NaN or empty string)
    if not isinstance(row_forbind, str) or not row_forbind:
        return None

    # Case 2: multiple stations — the bandel number is not needed here, so an
    # unparseable 'Bandelnr' no longer prevents the path calculation.
    if '-' in row_forbind:
        return calculate_sum_langd(row_forbind, dictionary_df)

    # Case 1: single station, restricted to the row's bandel(s). A contract
    # covering several bandels ("451, 452") matches the station on any of them.
    bandel_numbers = _parse_bandel_numbers(raw_bandel)
    if not bandel_numbers:
        return None

    matching_station = dictionary_df[
        (dictionary_df['Plats_sign'] == row_forbind)
        & dictionary_df['BdlNr'].isin(bandel_numbers)
    ]
    if matching_station.empty:
        return None

    # A station can occur in several rows; its length is their total.
    return matching_station['Banlangd'].sum()


# Apply the steps
# 1. First, add the station-code columns ('station_details',
#    'original_station_names', 'station_codes', 'short_path').
#    prepare_bandelnamn_conversion returns a copy, so `servicekontrakt_df_T23`
#    is rebound to an independent frame here.
servicekontrakt_df_T23 = prepare_bandelnamn_conversion(servicekontrakt_df_T23)

# (disabled) remove rows from dictionary_df where BdlNr is 1
#dictionary_df = dictionary_df[dictionary_df['BdlNr'] != 1]

# 2. Then calculate sum_langd row by row from 'short_path' and 'Bandelnr'
servicekontrakt_df_T23['sum_langd'] = servicekontrakt_df_T23.apply(
    lambda row: calculate_sum_langd_for_bandelnamn(row, dictionary_df),
    axis=1
)

We can additionally identify the name of the contract area using bandel and mapping we have from BIS file.

In [42]:
# Step 3: Add 'kontrakt_från_bandel' column by mapping 'Bandel' to the cleaned dictionary
def map_bandel_to_contract(bandel, mapping_dict):
    """Look up the contract area for a bandel number.

    Args:
        bandel: A bandel number, or a string of comma-separated bandel numbers
            such as ``"451, 452"``; only the first number is used.
        mapping_dict: Dict keyed by integer bandel number.

    Returns:
        The mapped contract area, or ``None`` when the bandel is missing,
        cannot be read as a number, or is not in ``mapping_dict``.
    """
    # Extract the first Bandelnr if Bandelnr contains multiple values (e.g., "451, 452")
    if isinstance(bandel, str):
        bandel = bandel.split(',')[0].strip()

    try:
        first_bandel = int(bandel)
    except (TypeError, ValueError, OverflowError):
        # Covers float-like text ("451.0"); empty or missing values give None.
        try:
            first_bandel = int(float(bandel))
        except (TypeError, ValueError, OverflowError):
            return None

    return mapping_dict.get(first_bandel, None)

# Contract area according to the BIS file (bandel_contract_dict), looked up
# from each row's 'Bandelnr' and stored in 'kontrakt_från_bandel'.
servicekontrakt_df_T23['kontrakt_från_bandel'] = servicekontrakt_df_T23['Bandelnr'].apply(
    lambda bandel: map_bandel_to_contract(bandel, bandel_contract_dict)
)

## Matching TCR:s Förbindelser with bandelar

Based on the column Från trafikplats, we can identify the corresponding bandel using dictionary.

In [43]:
# Load the CSV file containing the matched TCRs.
# NOTE(review): the surrounding text says "2024", but the file name refers
# to T23 — confirm which period this file covers.
csv_file_path = "TCR_T23_matched.csv"

# Read the CSV file into a DataFrame
tcr_df = pd.read_csv(csv_file_path)

In [44]:
# Lookup tables from the dictionary: station code -> BdlNr and link -> BdlNr.
# NOTE(review): rows that only describe a Forbind have a missing Plats_sign
# (and vice versa), so each dict also gets a missing-value key — confirm that
# this is harmless for the .map() below.
trafikplats_to_bandel_map = dictionary_df.set_index('Plats_sign')['BdlNr'].to_dict()
forbind_to_bdl_map = dictionary_df.set_index('Forbind')['BdlNr'].to_dict()

# Station codes used in the TCR data that are not in the dictionary are
# replaced by a code that is.
# Tul - Söd cannot be connected because no forbind at Gau
tcr_df['Från trafikplats'] = tcr_df['Från trafikplats'].replace(
    {'Rus': 'Jho', 'Ksc': 'Ks', 'Sta': 'Äs', 'Les': 'Alh'}
)

# Vectorized lookup of the bandel for every 'Från trafikplats'; stations that
# are not in the dictionary get a missing value.
tcr_df['identified_BdlNr'] = tcr_df['Från trafikplats'].map(trafikplats_to_bandel_map)


def fill_missing_bandel(df):
    """Fill missing 'identified_BdlNr' values from neighbouring rows of the same TCR.

    For every row without a bandel, the rows with the same 'TCR-id' whose
    'Platssekvensnummer' is one lower or one higher are inspected, and the
    first of them that has a bandel donates it. Donors are read from the
    values as they were before any filling in that TCR, so a filled value is
    never propagated further. Rows without such a neighbour stay missing.

    Returns:
        A filled copy; the input frame is left untouched.
    """
    filled = df.copy()

    for _, tcr_rows in df.groupby('TCR-id', sort=False):
        lacking_bandel = tcr_rows['identified_BdlNr'].isna().to_numpy()

        for idx in tcr_rows.index[lacking_bandel]:
            current_seq = filled.loc[idx, 'Platssekvensnummer']

            # Rows directly before / after this one in the place sequence
            adjacent = tcr_rows[
                tcr_rows['Platssekvensnummer'].isin([current_seq - 1, current_seq + 1])
            ]
            known_bandel = adjacent['identified_BdlNr'].dropna()
            if not known_bandel.empty:
                filled.loc[idx, 'identified_BdlNr'] = known_bandel.iloc[0]

    return filled

# Fill missing bandel numbers from neighbouring rows of the same TCR.
# fill_missing_bandel works on a copy, so `tcr_df` is rebound to the result.
tcr_df = fill_missing_bandel(tcr_df)

There are some rows where the bandel is not identified because the Från trafikplats is not in the dictionary. For these rows we use the bandel identified in a related row, i.e., a row with the same TCR-id and a neighboring Platssekvensnummer (the Platssekvensnummer of the unidentified row minus or plus 1).

In [45]:
# Group by TCR-id and Starttid
def get_first_last_rows(group):
    """Return the rows with the lowest and the highest 'Platssekvensnummer'.

    The lowest-numbered rows come first, followed by the highest-numbered
    ones. When lowest and highest coincide (e.g. a one-row group), those rows
    appear twice in the result.
    """
    sequence = group['Platssekvensnummer']
    lowest, highest = sequence.min(), sequence.max()
    return pd.concat([group[sequence == lowest], group[sequence == highest]])

# Reduce every (TCR-id, Starttid) group to its first and last place in the
# sequence.
# NOTE(review): the cell output shows a warning pointing at this line —
# presumably pandas' deprecation of apply operating on the grouping columns;
# confirm before upgrading pandas.
filtered_tcr_df = tcr_df.groupby(['TCR-id', 'Starttid'], as_index=False).apply(get_first_last_rows)

# Drop the index produced by groupby/apply and renumber the rows from 0
filtered_tcr_df = filtered_tcr_df.reset_index(drop=True)

  filtered_tcr_df = tcr_df.groupby(['TCR-id', 'Starttid'], as_index=False).apply(get_first_last_rows)


Before calculating the length between two consecutive places/rows, we need to reformat the tcr_df so that each row is combined with the next row (in Platssekvensnummer), if any, until the final row in the sequence. We create a new column called förbind_list which concatenates the Från trafikplats of two consecutive rows, e.g., A-B (where A is the trafikplats of the first row and B that of the second); the next row will have B-C, etc., until the final förbind in the sequence.

In [47]:
# Step 1: Sort the DataFrame by 'TCR-id', 'Starttid' and 'Platssekvensnummer'
# so that consecutive rows of a group are consecutive places in the sequence.
filtered_tcr_df = filtered_tcr_df.sort_values(by=['TCR-id', 'Starttid', 'Platssekvensnummer']).reset_index(drop=True)

# Step 2: For every row, fetch the station and the inclusion flag of the next
# row within the same (TCR-id, Starttid) group. The last row of each group
# has no successor and gets missing values.
filtered_tcr_df['next_trafikplats'] = filtered_tcr_df.groupby(['TCR-id', 'Starttid'])['Från trafikplats'].shift(-1)
filtered_tcr_df['next_Från_inkluderad'] = filtered_tcr_df.groupby(['TCR-id', 'Starttid'])['Från inkluderad'].shift(-1)

# Step 5: Create 'förbind_list' with conditional parentheses
def format_trafikplats(trafikplats, inkluderad):
    """Return the station as is when fully included ('Helt'), else in parentheses."""
    return trafikplats if inkluderad == 'Helt' else f"({trafikplats})"

def create_förbind(row):
    """Build the förbind string linking a row's station to the next one.

    Returns ``None`` for rows without a next station, the bare station code
    when both stations are the same, and otherwise ``"<from>-<to>"``, where
    each side is put in parentheses by ``format_trafikplats`` unless it is
    fully included.
    """
    next_station = row['next_trafikplats']
    if pd.isna(next_station):
        return None

    this_station = row['Från trafikplats']
    if this_station == next_station:
        # Same station on both sides: handled downstream as a single station.
        return f"{this_station}"

    from_tp = format_trafikplats(this_station, row['Från inkluderad'])
    to_tp = format_trafikplats(next_station, row['next_Från_inkluderad'])
    return f"{from_tp}-{to_tp}"

# Build 'förbind_list' row by row. Rows without a next station (the last row
# of each group) get None.
filtered_tcr_df['förbind_list'] = filtered_tcr_df.apply(create_förbind, axis=1)

In [48]:
# Step 6 (disabled): remove the temporary 'next_trafikplats' and
# 'next_Från_inkluderad' columns. They are currently kept in the frame.
#filtered_tcr_df = filtered_tcr_df.drop(columns=['next_trafikplats', 'next_Från_inkluderad'])

# Step 7: Remove the final row in each sequence, i.e. every row for which no
# förbind_list could be built.
filtered_tcr_df = filtered_tcr_df.dropna(subset=['förbind_list']).reset_index(drop=True)

In [49]:
# Continue with an independent copy of the filtered frame.
# NOTE: this rebinds `tcr_df` — from here on it holds one row per
# (TCR-id, Starttid) link rather than the raw rows loaded from the CSV.
tcr_df = filtered_tcr_df.copy()

We need to update the identified_BdlNr given the förbind_list. If the förbind_list is in the dictionary (column Forbind) and the corresponding BdlNr is different from the current identified_BdlNr, then update it. Otherwise leave it as it is.

In [50]:
# # Step 1: Create a mapping from 'Forbind' to 'BdlNr' for quick lookups
# forbind_to_bdl_map = dictionary_df.set_index('Forbind')['BdlNr'].to_dict()

In [51]:
# # Step 2: Update 'identified_BdlNr' based on 'förbind_list'
# def update_bandel(row):
#     # Check if 'förbind_list' exists in the dictionary
#     if row['förbind_list'] in forbind_to_bdl_map:
#         new_bdl_nr = forbind_to_bdl_map[row['förbind_list']]
#         # Update only if the new BdlNr is different
#         if new_bdl_nr != row['identified_BdlNr']:
#             return new_bdl_nr
    
#     # If not found, try the inverted link
#     inverted_link = '-'.join(reversed(row['förbind_list'].split('-')))
#     if inverted_link in forbind_to_bdl_map:
#         new_bdl_nr = forbind_to_bdl_map[inverted_link]
#         # Update only if the new BdlNr is different
#         if new_bdl_nr != row['identified_BdlNr']:
#             return new_bdl_nr
    
#     # If no update is needed or not found, return the current value
#     return row['identified_BdlNr']

# # Apply the function to update 'identified_BdlNr'
# tcr_df['identified_BdlNr'] = tcr_df.apply(update_bandel, axis=1)

Now, once we have identified BdlNR, we can use servicekontrakt_df to add a column with Kontraktsområdesnamn.

In [52]:
# Build the lookup bandel number (float) -> Kontraktsområdesnamn.
# 'Bandelnr' is text produced by parse_bandel: it can hold several numbers
# ("451, 452") or be empty when the Bandel cell had no leading number. A
# plain .astype(float) raises on both, so split the text into one row per
# number and discard anything that is not numeric.
bandel_contracts = servicekontrakt_df_T23[['Bandelnr', 'Kontraktsområdesnamn']].copy()
bandel_contracts['Bandelnr'] = bandel_contracts['Bandelnr'].astype(str).str.split(',')
bandel_contracts = bandel_contracts.explode('Bandelnr', ignore_index=True)
bandel_contracts['Bandelnr'] = pd.to_numeric(
    bandel_contracts['Bandelnr'].str.strip(), errors='coerce'
).astype(float)

# As before, the first contract listed for a bandel is the one that is kept.
contract_map = (
    bandel_contracts
    .dropna(subset=['Bandelnr'])
    .drop_duplicates(subset=['Bandelnr'])
    .set_index('Bandelnr')['Kontraktsområdesnamn']
    .to_dict()
)

# Map with the contract map (keys are floats, so the bandel column must be too)
tcr_df['identified_BdlNr'] = tcr_df['identified_BdlNr'].astype(float)
tcr_df['Kontraktsområdesnamn'] = tcr_df['identified_BdlNr'].map(contract_map)

We can also include a similar column with contract name, this one is based on BIS file.

In [53]:
# Additional mapping using bandel_contract_dict (built from the BIS file):
# contract area per identified bandel, stored next to the contract name that
# comes from the service-contract sheet.
tcr_df['kontrakt_från_bandel'] = tcr_df['identified_BdlNr'].map(bandel_contract_dict)

Now that we have identified BdlNr, we want to get the total length (langd, in meters) for each förbind_list and put it in a column (sum_langd). The idea is to use the order of the förbind_list, look for the corresponding rows in dictionary_df (within the same bandelnr = identified BdlNr) and accumulate the length in column dictionary_df(Banlangd). The entries of förbind_list are normally linked, e.g., A-B, B-C, etc.

In [54]:
# Earlier version, kept for reference: it called calculate_sum_langd
# directly with the bandel number as an extra argument.
# tcr_df['sum_langd'] = tcr_df.apply(
#     lambda row: calculate_sum_langd(
#         row['förbind_list'],
#         row['identified_BdlNr'],
#         dictionary_df
#     ),
#     axis=1
# )

# Create the 'sum_langd' column for tcr_df with the same row-level helper
# that is used for the contract rows.
tcr_df['sum_langd'] = tcr_df.apply(
    lambda row: calculate_sum_langd_for_bandelnamn(row, dictionary_df),
    axis=1
)

## Validations

In [55]:
# Check that every station name was translated: for each row of
# servicekontrakt_df_T23 the list original_station_names must be as long as
# station_codes (names without a code are left out of station_codes).
servicekontrakt_df_T23['original_station_names_len'] = servicekontrakt_df_T23['original_station_names'].apply(lambda x: len(x))
servicekontrakt_df_T23['station_codes_len'] = servicekontrakt_df_T23['station_codes'].apply(lambda x: len(x))
# Print the rows where the two lengths differ; an empty frame means all
# names were translated.
print(servicekontrakt_df_T23[servicekontrakt_df_T23['original_station_names_len'] != servicekontrakt_df_T23['station_codes_len']])

Empty DataFrame
Columns: [Kontraktsområdesnamn, Tidsperiod, Bandel, TPA timmar per år, TPA dagar per år, TPA veckor per år, TPA timmar natt per år, TPA timmar helg per år, EJ TPA timmar per år, EJ TPA dagar per år, EJ TPA veckor per år, EJ TPA timmar natt per år, EJ TPA timmar helg per år, Bandelnr, Bandelnamn, Start_year, End_year, station_details, original_station_names, station_codes, short_path, sum_langd, kontrakt_från_bandel, original_station_names_len, station_codes_len]
Index: []

[0 rows x 25 columns]


In [56]:
# Print the rows of tcr_df for which no length could be computed
# (sum_langd is missing).
print(tcr_df[tcr_df['sum_langd'].isna()])

      TCR-id          Område Klassificering             Starttid  \
559      214            Väst      Medelstor  2023-04-06 02:00:00   
560      218            Väst          Liten  2023-09-10 22:15:00   
561      218            Väst          Liten  2023-09-11 22:15:00   
562      218            Väst          Liten  2023-09-12 22:15:00   
563      218            Väst          Liten  2023-09-13 22:15:00   
...      ...             ...            ...                  ...   
6908     688  Nord/Mellersta          Liten  2023-08-30 03:35:00   
6909     688  Nord/Mellersta          Liten  2023-08-31 03:35:00   
6910     688  Nord/Mellersta          Liten  2023-09-01 03:35:00   
6911     688  Nord/Mellersta          Liten  2023-09-02 03:35:00   
6912     688  Nord/Mellersta          Liten  2023-09-03 03:35:00   

                  Sluttid Från trafikplats Från linjespår Från inkluderad  \
559   2023-04-10 02:00:00             Kogr              E            Helt   
560   2023-09-11 05:15:00    

In [57]:
# find if all bandel numbers in tcr_df and servicekontrakt_df_T23 belong to the same contract, print the bandel numbers that do not belong to the same contract


## Export to Excel files

In [58]:
# Add column 'Total timmar per år' = 'TPA timmar per år' + 'EJ TPA timmar per år'.
# A row where either term is missing gets a missing total.
servicekontrakt_df_T23['Total timmar per år'] = servicekontrakt_df_T23['TPA timmar per år'] + servicekontrakt_df_T23['EJ TPA timmar per år']

In [59]:
# Export the contract table to Excel. Only the columns listed below are
# written, in this order; the helper columns created during matching
# (station_details, station_codes, short_path, ...) are left out.
excel_file_path = "Servicekontrakt_per_bandel_matched_T23.xlsx"
servicekontrakt_df_to_export = servicekontrakt_df_T23[['Kontraktsområdesnamn', 'kontrakt_från_bandel', 'Tidsperiod', 'Bandel', 'TPA timmar per år',
       'TPA dagar per år', 'TPA veckor per år', 'TPA timmar natt per år',
       'TPA timmar helg per år', 'EJ TPA timmar per år', 'EJ TPA dagar per år',
       'EJ TPA veckor per år', 'EJ TPA timmar natt per år',
       'EJ TPA timmar helg per år', 'Total timmar per år', 'Bandelnr',
       'Bandelnamn', 'sum_langd']]
# index=False: do not write the DataFrame index as a column
servicekontrakt_df_to_export.to_excel(excel_file_path, index=False)

In [60]:
# Keep only the columns that go into the exported TCR table, in this order
tcr_df_to_export = tcr_df[['TCR-id', 'Klassificering',
                 'Starttid', 'Sluttid', 'Servicefönster_nya_kategorier',
                 'Relaterade TPÅ:er', 'tid_timmar', 'Relaterad åtgärdsnummer',
                 'förbind_list', 'identified_BdlNr', 'sum_langd', 'Kontraktsområdesnamn', 'kontrakt_från_bandel']]

# Export the DataFrame to an Excel file (without the index column)
excel_file_path = "TCR_T23_matched_bandelar.xlsx"
tcr_df_to_export.to_excel(excel_file_path, index=False)