In [1]:
import os
os.chdir(..)

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
import warnings
warnings.filterwarnings(ignore)

In [195]:
# Data is huge, ~15M records per month. Use only necessary columns.
# Load last three month data and make ready a pandas dataframe.

def load_month(path):
    return pd.read_parquet(path, columns=REQ_COLUMNS)

REQ_COLUMNS = [
    'eva',
    'station_name',
    'train_name',
    'delay_in_min',
    'time',
    'is_canceled'
]

data_dir = Path('data')

files = [
    data_dir / "data-2025-10.parquet",
    data_dir / "data-2025-11.parquet",
    data_dir / "data-2025-12.parquet",
]

df = pd.concat([load_month(f) for f in files], ignore_index=True)

In [27]:
# Basic cleaning
# Remove cancelled trains, to focus delay magnitude
# !NOTE: Should we reatin cancelled ones?

df = df[
    (df[is_canceled] == False) &
    (df[delay_in_min].notna())
]

df[time] = pd.to_datetime(df[time])
df[date] = df[time].dt.date

In [28]:
# Create a new column to uniquely identify the route

df[route_id] = (
    df[train_name].astype(str) +  '→'  + df[station_name].astype(str)
)

In [30]:
# Sample data: only include top and busiest routes

route_counts = (
    df.groupby(route_id)
      .size()
      .sort_values(ascending=False)
)

# Keep top N routes
TOP_N_ROUTES = 150

major_routes = route_counts.head(TOP_N_ROUTES).index

df = df[df[route_id].isin(major_routes)]

In [196]:
df

Unnamed: 0,eva,station_name,train_name,delay_in_min,time,is_canceled
0,08002549,Hamburg Hbf,ME RB41,34,2025-10-01 00:00:00,False
1,08000262,München Ost,S 1,7,2025-10-01 00:00:00,False
2,08000207,Köln Hbf,S 11,16,2025-10-01 00:00:00,True
3,08000096,Stuttgart Hbf,RE 8,8,2025-10-01 00:00:00,False
4,08000086,Duisburg Hbf,RE 3,7,2025-10-01 00:00:00,False
...,...,...,...,...,...,...
31396203,08000247,Marktredwitz,RE RE31,0,2025-12-31 23:59:00,False
31396204,08089201,Flughafen BER,S S9,0,2025-12-31 23:59:00,False
31396205,08004148,München-Giesing,S S5,0,2025-12-31 23:59:00,False
31396206,08006346,Wernau (Neckar),S S1,0,2025-12-31 23:59:00,False


In [34]:
daily_route_delays = (
    df.groupby([route_id, date])
      .agg(
          mean_delay=(delay_in_min, mean),
          sd_delay=(delay_in_min, std),
          n_trains=(delay_in_min, size)
      )
      .reset_index()
)


daily_route_delays[date] = pd.to_datetime(daily_route_delays[date])
daily_route_delays[weekday] = daily_route_delays[date].dt.weekday
daily_route_delays[is_weekend] = daily_route_delays[weekday] >= 5

In [35]:
daily_route_delays

Unnamed: 0,route_id,date,mean_delay,sd_delay,n_trains,weekday,is_weekend
0,NWB RS1 → Bremen Hbf,2025-10-01,4.034483,4.383628,116,2,False
1,NWB RS1 → Bremen Hbf,2025-10-02,2.546875,3.111319,64,3,False
2,NWB RS1 → Bremen Hbf,2025-10-03,1.457143,1.578213,35,4,False
3,NWB RS1 → Bremen Hbf,2025-10-04,1.818182,2.805913,44,5,True
4,NWB RS1 → Bremen Hbf,2025-10-05,1.866667,2.431593,30,6,True
...,...,...,...,...,...,...,...
8323,STB 1 → Brebach,2025-12-27,0.000000,0.000000,226,5,True
8324,STB 1 → Brebach,2025-12-28,0.000000,0.000000,147,6,True
8325,STB 1 → Brebach,2025-12-29,0.000000,0.000000,258,0,False
8326,STB 1 → Brebach,2025-12-30,0.000000,0.000000,258,1,False


### Read stations data and try to see if its a match for our original data. 

Data: https://www.kaggle.com/datasets/mexwell/deutsche-bahn-train-stops-germany

In [None]:

db = pd.read_excel("data/db_data.xlsx")
db = db[['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]

In [176]:
def concat(t):
    return "".join([str(i).replace('"', '') for i in t[1]]).split(';')

In [179]:
rows = [concat(i) for i in db.iterrows()]
rows = pd.DataFrame(rows).drop([27, 28], axis=1)

In [180]:
cols = "EVA_NR;DS100;IFOPT;db_name;traffic;operator_name;operator_nr;status;distance;formatted;lat;lon;housenumber;name;street;postcode;district;suburb;county;state;state_code;city;country;country_code;attribution;attribution_license;attribution_url"
cols = cols.split(";")
rows.columns = cols

In [None]:
rows[rows['EVA_NR']=='08002549']

SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (2873181750.py, line 1)

In [204]:
df[df['eva']=='8001944']

Unnamed: 0,eva,station_name,train_name,delay_in_min,time,is_canceled


In [202]:
rows['EVA_NR']

'8002551'

In [142]:
rows

Unnamed: 0,EVA_NR,DS100,IFOPT,db_name,traffic,operator_name,operator_nr,status,distance,formatted,...,suburb,county,state,state_code,city,country,country_code,attribution,attribution_license,attribution_url
0,8002551,"""AELB""","""de:02000:11943""","""Hamburg Elbbr¸cken""","""RV""","""DB Station und Service AG""",,"""neu""",1.8856038526558858,"""Elbbr¸cken Freihafenelbbr¸cke 20457 Hamburg G...",...,"""HafenCity""",,,,"""Hamburg""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright"""
1,8001944,"""TETN""",,"""Eutingen Nord""","""RV""","""DB Station und Service AG""",,"""neu""",5.605502032267216,"""Eutingen Nord K 4715 72184 Gˆttelfingen Germany""",...,,"""Landkreis Freudenstadt""","""Baden-W¸rttemberg""","""BW""","""Gˆttelfingen""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright"""
2,8003074,"""MIA""",,"""Ingolstadt Audi""","""RV""","""DB Station und Service AG""",,"""neu""",12.416024252338556,"""Bahnhof Ingolstadt Audi Weststraﬂe 85057 Ingo...",...,"""Audi AG""","""Pfaffenhofen an der Ilm""","""Bavaria""",,"""Ingolstadt""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright"""
3,8001723,"""HEBA""",,"""Einbeck Otto-Hahn-Straﬂe""","""RV""","""Ilmebahn GmbH""",,"""neu""",70.81814022102819,"""Grimsehlstraﬂe 39 37574 Einbeck Germany""",...,,"""Northeim""","""Lower Saxony""",,"""Einbeck""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright""nan"
4,8004371,"""KRO""",,"""Nˆrvenich-Rommelsheim""","""nur DPN""","""Rurtalbahn GmbH""",,"""neu""",109.13706146313359,"""Gertrudisstraﬂe 4 52388 Nˆrvenich Germany""",...,,"""D¸ren""","""North Rhine-Westphalia""",,"""Nˆrvenich""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright""nan"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,8004467,"""HNDH""","""de:03456:208""","""Nordhorn""","""nur DPN""","""Bentheimer Eisenbahn AG""",0,,1.9495406425544366,"""Frensdorfer Ring 48529 Nordhorn Germany""",...,"""Altendorf""","""Landkreis Grafschaft Bentheim""","""Lower Saxony""",,"""Nordhorn""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright""nan"
5996,8004261,"""HNES""","""de:03456:4851""","""Neuenhaus S¸d""","""nur DPN""","""Bentheimer Eisenbahn AG""",0,,2.2783050813636625,"""Neuenhaus-S¸d 49828 Neuenhaus Germany""",...,,"""Grafschaft Bentheim""","""Lower Saxony""",,"""Neuenhaus""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright""nan"
5997,8004262,"""HNEH""","""de:03456:4781""","""Neuenhaus""","""nur DPN""","""Bentheimer Eisenbahn AG""",0,,1.6489214529360758,"""Am Bahnhof 49828 Neuenhaus Germany""",...,,"""Landkreis Grafschaft Bentheim""","""Lower Saxony""",,"""Neuenhaus""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright""nan"
5998,8000651,"""SAM""","""de:10041:8000651""","""Auersmacher""","""nur DPN""","""DB Station und Service AG""",213,,3.211442632535484,"""Auersmacher Kreisstraﬂe 66271 Kleinblittersdo...",...,"""Auersmacher""","""Regionalverband Saarbr¸cken""","""Saarland""","""SL""","""Kleinblittersdorf""","""Germany""","""de""","""© OpenStreetMap contributors""","""Open Database License""","""https://www.openstreetmap.org/copyright"""


In [None]:
# Another source of station data
d = pd.read_csv('data/worldcities.csv')
d[d['country'] == 'Germany']

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
142,Berlin,Berlin,52.5200,13.4050,Germany,DE,DEU,Berlin,primary,4679500.0,1276451290
287,Stuttgart,Stuttgart,48.7775,9.1800,Germany,DE,DEU,Baden-Württemberg,admin,2787724.0,1276171358
310,Munich,Munich,48.1375,11.5750,Germany,DE,DEU,Bavaria,admin,2606021.0,1276692352
327,Hamburg,Hamburg,53.5500,10.0000,Germany,DE,DEU,Hamburg,admin,2496600.0,1276041799
687,Cologne,Cologne,50.9364,6.9528,Germany,DE,DEU,North Rhine-Westphalia,,1087353.0,1276015998
...,...,...,...,...,...,...,...,...,...,...,...
46429,Kirn,Kirn,49.7881,7.4572,Germany,DE,DEU,Rhineland-Palatinate,,8523.0,1276119778
46431,Barntrup,Barntrup,51.9831,9.1167,Germany,DE,DEU,North Rhine-Westphalia,,8522.0,1276882548
46467,Bad Feilnbach,Bad Feilnbach,47.7833,12.0167,Germany,DE,DEU,Bavaria,,8512.0,1276018241
46515,Ahrensbök,Ahrensbok,54.0167,10.5833,Germany,DE,DEU,Schleswig-Holstein,,8502.0,1276154754
