# Include static passengers (T23)

After running the script for matching train data (from T23) with traffic line, a CSV file containing the matching (with matching scores) is produced.

We use that CSV file here to populate the train data (T23) with passenger data.

## Read input data

We read all data from existing files, i.e., static ridership, matching (line - train), and train data.

In [1]:
# import excel file static_pass_all_2024.xlsx
import pandas as pd

# Static ridership data; sheet_name=0 selects the first sheet of the workbook.
df_line = pd.read_excel('../data/output_data/static_pass_all_2024.xlsx', sheet_name=0)

# Matching result (train journey -> predicted line), stored as an Excel workbook.
df_matching = pd.read_excel('../data/output_data/train_data_matched_lines.xlsx', sheet_name=0)

# New data file with all RST trains that have had infrastructure events
# registered on the Southern Main Line (Södra stambanan).
df_train = pd.read_csv(
    filepath_or_buffer='../data/train_data_2023/traindata_2023_passenger_SSBevents_012025.csv'
)

In [2]:
# Show the column index of each input frame, one per line, in load order.
print(df_line.columns, df_matching.columns, df_train.columns, sep='\n')

Index(['Linje', 'först_sign', 'sist_sign', 'först', 'sist', 'från_sign',
       'till_sign', 'från', 'till', 'ombord_pr_nat', 'ombord_tj_nat',
       'ombord_arb_nat', 'ombord_arb_reg', 'ombord_tj_reg', 'ombord_övr_reg',
       'avstigande_pr_nat', 'avstigande_tj_nat', 'avstigande_arb_nat',
       'avstigande_arb_reg', 'avstigande_tj_reg', 'avstigande_övr_reg',
       'påstigande_pr_nat', 'påstigande_tj_nat', 'påstigande_arb_nat',
       'påstigande_arb_reg', 'påstigande_tj_reg', 'påstigande_övr_reg'],
      dtype='object')
Index(['resa', 'Tågnr', 'Tåguppdrag', 'Tågsort', 'Start_uppdrag_sign',
       'Slut_uppdrag_sign', 'Start_resa_sign', 'Slut_resa_sign', 'Stopps',
       'Plats_len', 'Predicted_Line', 'Score', 'Stopps_line'],
      dtype='object')
Index(['PlanDatum', 'PlanTidpunkt', 'Datum', 'Tågnr', 'Tåguppdrag', 'Tågläge',
       'Plats', 'Riktning', 'Orsakskod', 'Nivå1', 'Nivå2', 'Nivå3',
       'Orsakande tågnr', 'HändelseNr', 'Tidsavvikelse',
       'Registrerad merförsening', 

## Preprocessing

We do some preprocessing of the input data.

We start by converting Plats in df_train to signature.

In [3]:
# Keep only the rows whose 'Tågslag' is 'RST'. The explicit .copy() makes
# df_train_rst an independent frame, so the column assignments below write to
# it unambiguously instead of to a slice of df_train (which triggers pandas'
# SettingWithCopyWarning).
df_train_rst = df_train[df_train['Tågslag'] == 'RST'].copy()

# Upper-case every station-name column.
# NOTE(review): presumably needed so the names match the 'Plats' keys of the
# station-signature lookup table used in the next cell -- confirm.
station_name_columns = [
    'Plats',
    'StartStation_resa',
    'SlutStation_resa',
    'StartStation_uppdrag',
    'SlutStation_uppdrag',
]
for station_column in station_name_columns:
    df_train_rst[station_column] = df_train_rst[station_column].str.upper()

In [4]:
# Load the station-name -> signature lookup table (Plats_sign_augmented_v2.csv).
df_plats_sign = pd.read_csv('../data/useful_data/Plats_sign_augmented_v2.csv')

# Series indexed by station name ('Plats') whose values are the signatures ('Plats_sign').
plats_to_sign = df_plats_sign.set_index('Plats')['Plats_sign']

# Copy of df_train_rst with an extra 'Plats_sign' column; station names that are
# absent from the lookup table get NaN.
df_train_SIGN = df_train_rst.assign(Plats_sign=df_train_rst['Plats'].map(plats_to_sign))

# List the distinct station names that could not be translated to a signature.
has_no_signature = df_train_SIGN['Plats_sign'].isna()
print(df_train_SIGN.loc[has_no_signature, 'Plats'].unique())

['RUNSALA' 'DIÖ']


In [5]:
# TODO: manually add the following (station, signature) pairs to df_plats_sign:
# (DIÖ NORRA, DIÖ), (MORA STRAND, MRAS) and (RUNSALA, ?)
# NOTE(review): the signature for RUNSALA is still marked '?' -- to be confirmed.

## Augment with total onboard

In [None]:
import pandas as pd

# ---- Step 1. Attach the predicted line to every train event via 'resa'.
# The join is an inner join (pandas' default, spelled out here), so events whose
# 'resa' has no row in df_matching are dropped at this point.
# Some trains have no boarding or alighting events and are not matched to any
# line; rows whose Predicted_Line is missing are removed by the dropna.
resa_to_line = df_matching[['resa', 'Predicted_Line']]
df_train_rst_aug = (
    df_train_SIGN
    .merge(resa_to_line, how='inner', on='resa')
    .dropna(subset=['Predicted_Line'])
)

# ---- Step 2. Add per-row totals to df_line.
# For each measure (on board, alighting, boarding) sum the six passenger-segment
# columns '<measure>_<segment>' into a new column 'Total_<measure>'.
passenger_segments = ['pr_nat', 'tj_nat', 'arb_nat', 'arb_reg', 'tj_reg', 'övr_reg']
for measure in ('ombord', 'avstigande', 'påstigande'):
    segment_columns = [f'{measure}_{segment}' for segment in passenger_segments]
    df_line[f'Total_{measure}'] = df_line[segment_columns].sum(axis=1)

We have now merged the train data with the predicted line and prepared df_line with the total number of passengers. Next, we add columns for the number of passengers onboard, alighting and boarding to df_train_rst_aug.

For the number of passengers boarding, the idea is to add the corresponding boarding count (given the predicted line and Plats = från) to every row where column Riktning is 'Avgång'. For the number of passengers alighting, we add the corresponding alighting count (given the predicted line and Plats = till) to every row where column Riktning is 'Ankomst'.

In [None]:
def _events_with_line_counts(riktning, station_col, count_col):
    """Return the events with the given Riktning, joined with df_line counts.

    Selects the rows of df_train_rst_aug whose 'Riktning' equals `riktning` and
    left-joins df_line on (Predicted_Line, Plats_sign) == (Linje, `station_col`),
    bringing in `count_col` and 'Total_ombord'. Events without a matching
    df_line row keep NaN in those two columns. The df_line key columns
    ('Linje' and `station_col`) are dropped from the result.
    """
    events = df_train_rst_aug[df_train_rst_aug['Riktning'] == riktning]
    line_counts = df_line[['Linje', station_col, count_col, 'Total_ombord']]
    joined = events.merge(
        line_counts,
        how='left',
        left_on=['Predicted_Line', 'Plats_sign'],
        right_on=['Linje', station_col],
    )
    return joined.drop(columns=['Linje', station_col])


# Boarding events: Riktning == 'Avgång', station matched as link origin (från_sign).
boarding_df = _events_with_line_counts('Avgång', 'från_sign', 'Total_påstigande')

# Alighting events: Riktning == 'Ankomst', station matched as link destination (till_sign).
alighting_df = _events_with_line_counts('Ankomst', 'till_sign', 'Total_avstigande')

# Stack both event types into one frame. Rows with any other Riktning value are
# not part of the result.
df_train_rst_aug_final = pd.concat([boarding_df, alighting_df], ignore_index=True)

There are rows with missing values of Total_ombord. For these, we use the value from the previous row (if any) of the same resa.
Before that, we have to sort the rows by resa, then PlanTidpunkt, and then Riktning.

In [None]:
def _ffill_then_flag(onboard):
    """Forward-fill gaps within one resa; mark values that are still missing with -1."""
    return onboard.ffill().fillna(-1)


# Order the events by resa, then PlanTidpunkt, then Riktning, so that the
# previous row within a resa is the preceding event ('Ankomst' sorts before
# 'Avgång' when the planned time is the same), and renumber the index 0..n-1.
df_train_rst_aug_final = (
    df_train_rst_aug_final
    .sort_values(by=['resa', 'PlanTidpunkt', 'Riktning'])
    .reset_index(drop=True)
)

# Per resa: carry the last known Total_ombord forward; rows with no earlier
# value in the same resa get the sentinel -1.
df_train_rst_aug_final['Total_ombord'] = (
    df_train_rst_aug_final
    .groupby('resa')['Total_ombord']
    .transform(_ffill_then_flag)
)

Rows whose Total_ombord is still missing (marked with -1) are assigned the average Total_ombord over all rows with the same predicted line.

In [None]:
# Work on a frame with a fresh 0..n-1 index.
events = df_train_rst_aug_final.reset_index(drop=True)

# Rows still carrying the -1 sentinel have no known Total_ombord.
is_unfilled = events['Total_ombord'] == -1

# Mean Total_ombord per predicted line, computed only from rows that are not -1.
line_mean_onboard = (
    events[~is_unfilled]
    .groupby('Predicted_Line')['Total_ombord']
    .mean()
)

# Look up the line mean for every row; lines without any usable row give NaN.
per_row_line_mean = events['Predicted_Line'].map(line_mean_onboard)

# Replace the -1 sentinel with the mean of the row's predicted line and leave
# every other value untouched.
df_train_rst_aug_final = events.assign(
    Total_ombord=events['Total_ombord'].mask(is_unfilled, per_row_line_mean)
)

## Export data

In [None]:
# Columns written to the output workbook.
columns_to_keep = ['resa', 'PlanTidpunkt', 'Riktning', 'Plats', 'Plats_sign', 'Total_ombord']

# Select those columns and write them to an Excel file without the row index.
df_train_rst_aug_final_to_export = df_train_rst_aug_final.loc[:, columns_to_keep]
df_train_rst_aug_final_to_export.to_excel(
    excel_writer='../data/output_data/train_data_matched_lines_with_passengers.xlsx',
    index=False,
)