# Matching trains with traffic lines

The main goal here is to develop a function that identifies the traffic line of a specific delayed train. We need such a function because the passenger ridership estimates are given per line, whereas the delay data is recorded per individual train.

## Importing datasets

First, let us import the dataset for all the traffic lines (used in the ridership estimation data).

In [13]:
# import excel file static_pass_all_2024.xlsx
import pandas as pd

# Load the line-level ridership workbook; sheet_name=0 (the pandas default)
# means only the first sheet is read.
df_line = pd.read_excel(
    '../../data/output_data/static_pass_all_2024.xlsx',
    sheet_name=0,
)

In [14]:
# Keep only the first 9 columns and drop the rest (the ridership figures are not needed here).
# NOTE(review): this comment used to say "the first 3", but the slice keeps 9 columns;
# presumably they include 'Linje', 'från_sign' and 'till_sign', which are used further down — confirm.
df_line = df_line.iloc[:, :9]

Let us now import the train data, more specifically the trains that are affected by delays. The columns of particular interest here are Tågnr and Tåguppdrag.
The goal is to match each of these trains to a specific line number in df_line.

In [15]:
# New data file with all RST trains that have had infrastructure events registered on Södra stambanan (the Southern Main Line)
df_train = pd.read_csv('../../data/train_data_2023/traindata_2023_passenger_SSBevents_012025.csv')

We need to clean up (make this df a bit smaller), e.g., by removing unnecessary data.

In [16]:
# Restrict the train data to rows whose train type ('Tågslag') is 'RST'.
df_train_rst = df_train.loc[df_train['Tågslag'] == 'RST']

In [17]:
# Keep only the columns needed for the matching and renumber the rows from 0.
df_train_rst_clean = df_train_rst[[
    'resa',
    'Tågnr',
    'Tåguppdrag',
    'Plats',
    'Tågslag',
    'Tågsort',
    'Aktivitetskod',
    'Aktivitetskodbeskrivning',
    'PlanDatum',
    'PlanTidpunkt',
    'Datum',
    'Tågläge',
    'StartStation_resa',
    'SlutStation_resa',
    'StartStation_uppdrag',
    'SlutStation_uppdrag',
]].reset_index(drop=True)

In [18]:
# Keep only rows describing a passenger exchange: boarding ('Påstigande av resande'),
# boarding and alighting ('Av- och påstigande av resande') or alighting ('Avstigande av resande').
df_train_rst_clean = df_train_rst_clean.loc[
    df_train_rst_clean['Aktivitetskodbeskrivning'].isin([
        'Påstigande av resande',
        'Av- och påstigande av resande',
        'Avstigande av resande',
    ])
]

In [19]:
# Build the station-name -> station-sign lookup table: drop rows without a
# station name, keep the two relevant columns and remove repeated pairs.
df_plats_sign = (
    pd.read_excel('../../data/useful_data/Förbindelselinje_2023_alla.xlsx')
    .dropna(subset=['Plats'])
    .loc[:, ['Plats', 'Plats_sign']]
    .drop_duplicates()
)

In [20]:
# # save df_plats_sign to a csv file, keep columns 'Plats' and 'Plats_sign'
# df_plats_sign.to_csv('plats_sign_2023_alla.csv', index=False)

In [21]:
import re 

# Define the regex matching function

# Station names (lower-cased) whose counterpart in the lookup table cannot be
# derived by the generic "central" / genitive-s rewrites below.
_STATION_ALIASES = {
    'marieholm': 'Göteborg Marieholm',
    'helsingborg godsbangård': 'Helsingborgs godsbangård',
    'hallsbergs pbg': 'Hallsbergs personbangård',
    'stockholm södra': 'Stockholms Södra',
    'falkenbergs personstation': 'Falkenberg personstation',
    'köpingebro': 'f.d. Köpingebro',
}


def match_station(station, plats_df):
    """Look up the station sign for a station name using spelling variants.

    Several variants of ``station`` are tried in order: the name itself,
    "central" abbreviated to "c", a genitive "s" added or removed before
    "central"/"c", and finally a hand-written alias for a few known names.
    Each variant is compared case-insensitively against ``plats_df['Plats']``
    with ``Series.str.match``, which anchors only at the start of the string,
    so a variant also matches longer names that begin with it.

    Args:
        station: Station name to look up.
        plats_df: DataFrame with the columns 'Plats' (station name) and
            'Plats_sign' (station sign).

    Returns:
        The 'Plats_sign' of the first row matched by the first successful
        variant, or ``None`` when nothing matches or ``station`` is not a
        non-blank string (e.g. NaN).
    """
    # NaN/None cannot be rewritten, and an empty pattern would match every row.
    if not isinstance(station, str) or not station.strip():
        return None

    station_variants = [
        station,
        re.sub(r'central', 'c', station, flags=re.IGNORECASE),
        re.sub(r'(\w+)( central)', r'\1s central', station, flags=re.IGNORECASE),
        re.sub(r'(\w+)( c)', r'\1s c', station, flags=re.IGNORECASE),
        re.sub(r'(\w+)s central', r'\1 central', station, flags=re.IGNORECASE),
        re.sub(r'(\w+)s central', r'\1 c', station, flags=re.IGNORECASE),
    ]

    alias = _STATION_ALIASES.get(station.lower())
    if alias is not None:
        station_variants.append(alias)

    plats_names = plats_df['Plats']
    # Rewrites that do not apply return the name unchanged; dict.fromkeys drops
    # those repeats (keeping order) so the table is not scanned twice for the same text.
    for variant in dict.fromkeys(station_variants):
        hits = plats_df[plats_names.str.match(re.escape(variant), case=False, na=False)]
        if not hits.empty:
            return hits['Plats_sign'].iloc[0]
    return None

# Restrict to passenger-exchange activities before attaching the station signs
df_train_rst_clean = df_train_rst_clean.loc[
    df_train_rst_clean['Aktivitetskodbeskrivning'].isin([
        'Påstigande av resande',
        'Av- och påstigande av resande',
        'Avstigande av resande',
    ])
]

# First attempt: exact match on the station name. The left join keeps every
# train row; stations without an exact match get a missing 'Plats_sign'.
df_train_rst_clean = pd.merge(
    df_train_rst_clean,
    df_plats_sign[['Plats', 'Plats_sign']],
    how='left',
    left_on='Plats',
    right_on='Plats',
)

In [22]:
# Second attempt: stations still lacking a sign after the exact-name merge
unmatched_stations = df_train_rst_clean.loc[
    df_train_rst_clean['Plats_sign'].isna(), 'Plats'
].unique()

# Try the spelling variants of each such station against the same lookup table
for station in unmatched_stations:
    match = match_station(station, df_plats_sign)
    if not match:
        continue
    df_train_rst_clean.loc[df_train_rst_clean['Plats'] == station, 'Plats_sign'] = match

In [23]:
# Load the additional station mapping file and expose its 'Signatur' column
# under the name 'Plats_sign'.
df_plats_sign_pos = pd.read_excel(
    '../../data/useful_data/Plats_sign_pos.xlsx'
).rename(columns={'Signatur': 'Plats_sign'})

In [24]:
# Third attempt: collect the stations that still have no sign, to be matched against the additional file
unmatched_stations = df_train_rst_clean.loc[
    df_train_rst_clean['Plats_sign'].isna(), 'Plats'
].unique()

In [25]:
# Try the spelling variants of each remaining station against the additional mapping
for station in unmatched_stations:
    match = match_station(station, df_plats_sign_pos)
    if not match:
        continue
    df_train_rst_clean.loc[df_train_rst_clean['Plats'] == station, 'Plats_sign'] = match

In [26]:
# Normalise every station sign to upper case
df_train_rst_clean = df_train_rst_clean.assign(
    Plats_sign=df_train_rst_clean['Plats_sign'].str.upper()
)

In [27]:
# Stations that are still unmatched fall back to their full name from 'Plats'
df_train_rst_clean['Plats_sign'] = df_train_rst_clean['Plats_sign'].fillna(
    df_train_rst_clean['Plats']
)

## Extracting stops from train data

Before trying to find the closest line (line number/name) for a given train (resa, i.e., tågnr-uppdrag-datum), let us extract its stops.
First, we append the stopping pattern information to our delayed trains.

In [28]:
# One row per (resa, Tåguppdrag) with the station signs collected into a list,
# stored under the column name 'Stopps'
train_resa_stops = (
    df_train_rst_clean
    .groupby(['resa', 'Tåguppdrag'])['Plats_sign']
    .agg(list)
    .reset_index()
    .rename(columns={'Plats_sign': 'Stopps'})
)

# Drop repeated signs while preserving first-seen order, and store the result
# as a tuple so that it is hashable
train_resa_stops['Stopps'] = train_resa_stops['Stopps'].apply(
    lambda signs: tuple(dict.fromkeys(signs))
)

# Number of distinct stops per trip
train_resa_stops['Plats_len'] = train_resa_stops['Stopps'].apply(len)

In [29]:
# Order the trips so that those with the most stops come first
train_resa_stops = train_resa_stops.sort_values('Plats_len', ascending=False)

# Keep one row per 'Tåguppdrag' (the first one, i.e. a trip with the highest
# 'Plats_len') and only the columns needed for the matching
train_stops_no_duplicates = (
    train_resa_stops
    .drop_duplicates(subset=['Tåguppdrag'], keep='first')
    .loc[:, ['resa', 'Tåguppdrag', 'Stopps']]
)

In [30]:
# For every line ('Linje'), build its stop list: all 'från_sign' values in row
# order followed by the 'till_sign' of the line's last row.
# NOTE(review): this assumes the rows of each line are ordered along the route
# and that consecutive segments are contiguous — confirm against the input file.
# Only the two needed columns are selected before apply(), so the function is
# not run on the grouping column (pandas deprecates applying over it).
line_stops = (
    df_line.groupby('Linje')[['från_sign', 'till_sign']]
    .apply(lambda segments: list(segments['från_sign']) + [segments['till_sign'].iloc[-1]])
    .reset_index()
)

# Rename columns for clarity
line_stops.columns = ['Linje', 'Stopps']

  line_stops = df_line.groupby('Linje').apply(


## Matching delayed trains to traffic lines

We now match delayed trains (subset with unique stop patterns) to the most likely traffic line. The most likely line is chosen as the one with the highest similarity score.

In [31]:
import pandas as pd
from difflib import SequenceMatcher

def calculate_score(train_stops, line_stops, *, endpoint_weight=2, similarity_weight=10):
    """
    Calculate a similarity score between train stops and line stops.

    The score is the sum of:
      * ``endpoint_weight`` if both sequences start with the same stop,
      * ``endpoint_weight`` if both sequences end with the same stop,
      * ``similarity_weight`` times the ``difflib.SequenceMatcher`` ratio
        (between 0 and 1) of the two sequences.

    Args:
        train_stops: Ordered sequence of stops served by the train.
        line_stops: Ordered sequence of stops of the candidate line.
        endpoint_weight: Bonus for a matching first stop and, separately,
            for a matching last stop.
        similarity_weight: Multiplier applied to the sequence similarity ratio.

    Returns:
        The similarity score; ``0.0`` when either sequence is empty, since
        there is then nothing to compare.
    """
    # Guard against indexing into an empty sequence below.
    if len(train_stops) == 0 or len(line_stops) == 0:
        return 0.0

    score = 0

    # Matching terminals are rewarded separately from the overall similarity
    if train_stops[0] == line_stops[0]:
        score += endpoint_weight
    if train_stops[-1] == line_stops[-1]:
        score += endpoint_weight

    # Similarity of the full stop sequences, order-sensitive
    sequence_similarity = SequenceMatcher(None, train_stops, line_stops).ratio()
    score += sequence_similarity * similarity_weight

    return score

def get_inverted_line(line_id):
    """
    Return the ID of the opposite direction of a line.

    A trailing 'R' is removed when present; otherwise an 'R' is appended.
    """
    if line_id.endswith('R'):
        return line_id[:-1]
    return line_id + 'R'

def match_trains_to_lines(train_stops_df, line_stops_df):
    """
    Match trains to lines based on similarity scores, including inverted stops.

    Every train is scored against every line twice: once against the line's
    stops as given and once against them in reverse order. The candidate with
    the highest score wins; on ties the earlier line wins and the normal
    direction is preferred over the inverted one. When the reversed order
    scores higher, the predicted line is the ID from ``get_inverted_line``.

    Args:
        train_stops_df: DataFrame with the columns 'resa', 'Tåguppdrag' and
            'Stopps' (sequence of stops per train).
        line_stops_df: DataFrame with the columns 'Linje' and 'Stopps'
            (sequence of stops per line).

    Returns:
        DataFrame with one row per train and the columns 'resa', 'Tåguppdrag',
        'Predicted_Line', 'Score' and 'Direction'. A train without stops gets
        Predicted_Line None, Score -1 and Direction 'Normal'. The columns are
        present even when there are no trains.
    """
    result_columns = ['resa', 'Tåguppdrag', 'Predicted_Line', 'Score', 'Direction']

    # The lines do not depend on the train, so extract them and their reversed
    # stop order once instead of re-iterating the DataFrame for every train.
    line_candidates = [
        (line_id, stops, stops[::-1])
        for line_id, stops in zip(line_stops_df['Linje'], line_stops_df['Stopps'])
    ]

    matches = []
    for resa, uppdrag, train_stops in zip(
        train_stops_df['resa'], train_stops_df['Tåguppdrag'], train_stops_df['Stopps']
    ):
        best_score = -1
        best_match = None
        best_direction = 'Normal'

        # A train without any stops cannot be scored and keeps the defaults above.
        if len(train_stops) > 0:
            for line_id, stops, reversed_stops in line_candidates:
                normal_score = calculate_score(train_stops, stops)
                inverted_score = calculate_score(train_stops, reversed_stops)

                # Pick the better direction for this line; ties favour normal.
                if inverted_score > normal_score:
                    current_score = inverted_score
                    current_match = get_inverted_line(line_id)
                    current_direction = 'Inverted'
                else:
                    current_score = normal_score
                    current_match = line_id
                    current_direction = 'Normal'

                # Strict comparison keeps the first line among equal scores.
                if current_score > best_score:
                    best_score = current_score
                    best_match = current_match
                    best_direction = current_direction

        matches.append({
            'resa': resa,
            'Tåguppdrag': uppdrag,
            'Predicted_Line': best_match,
            'Score': best_score,
            'Direction': best_direction,
        })

    # Explicit columns so that an empty result still has the expected schema.
    return pd.DataFrame(matches, columns=result_columns)


# Match every train (one representative trip per 'Tåguppdrag') to its most similar line
matching_result = match_trains_to_lines(
    train_stops_df=train_stops_no_duplicates,
    line_stops_df=line_stops,
)

In [32]:
# Attach the stops of the predicted line ('Stopps_line') and the stops of the
# train itself ('Stopps_train') to each match, then drop the helper columns
# introduced by the two joins.
# NOTE(review): inverted predictions carry an ID from get_inverted_line; if that
# ID is not present in line_stops, 'Stopps_line' stays empty for them — confirm.
matching_result_stops = (
    matching_result
    .merge(line_stops, how='left', left_on='Predicted_Line', right_on='Linje')
    .rename(columns={'Stopps': 'Stopps_line'})
    .merge(train_stops_no_duplicates, how='left', left_on='resa', right_on='resa')
    .rename(columns={'Stopps': 'Stopps_train'})
    .drop(columns=['Linje', 'Tåguppdrag_y'])
    .rename(columns={'Tåguppdrag_x': 'Tåguppdrag'})
)

Now, we can construct the final table where all the trains are identified with a specific traffic line.

In [33]:
# Reduce the result to the columns of the final table and write it to Excel
# without the row index
columns_to_keep = ['Tåguppdrag', 'Predicted_Line', 'Score', 'Stopps_line', 'Stopps_train']
matching_result_stops = matching_result_stops.loc[:, columns_to_keep]
matching_result_stops.to_excel(
    '../../data/output_data/matching_result_stops.xlsx',
    index=False,
)