In [2]:
import pandas as pd
import partridge as ptg

# Location of the source GTFS archive (relative to this notebook)
zip_path = "../../GTFS.zip"

# Open the archive through partridge's geo-feed loader
geo_feed = ptg.load_geo_feed(zip_path)

# Bus lines of interest: 561-570 and 573-575
bus_lines = [*range(561, 571), *range(573, 576)]
print("List of bus lines:", bus_lines)

List of bus lines: [561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 573, 574, 575]


In [3]:
# Restrict the feed to the selected bus lines and cascade that selection
# through the dependent GTFS tables: routes -> trips -> stop_times -> stops.
routes_df = geo_feed.routes
wanted_short_names = [str(line) for line in bus_lines]
# .copy(): nm_routes is assigned into further down, so it must be an
# independent frame rather than a slice of routes_df.
nm_routes = routes_df[routes_df['route_short_name'].astype(str).isin(wanted_short_names)].copy()
print(f"Filtered routes for bus lines {bus_lines}:")

# trips belonging to the selected routes
nm_route_ids = nm_routes['route_id']
trips_df = geo_feed.trips
nm_trips = trips_df[trips_df['route_id'].isin(nm_route_ids)]

# stop times of those trips
stop_times_df = geo_feed.stop_times
nm_stop_times = stop_times_df[stop_times_df['trip_id'].isin(nm_trips['trip_id'])]

# stops referenced by those stop times
# .copy(): parent_station is rewritten below, same reasoning as for nm_routes.
stops_df = geo_feed.stops
nm_stops = stops_df[stops_df['stop_id'].isin(nm_stop_times['stop_id'])].copy()

# fix parent stations outside stadtwerke: blank every parent_station that
# does not point at a stop kept in nm_stops
orphaned_parent = ~nm_stops['parent_station'].isin(nm_stops['stop_id'])
nm_stops.loc[orphaned_parent, 'parent_station'] = ""

# calendars (filtered by service_id when added to gtfs_tables below)
calendar_df = geo_feed.calendar if hasattr(geo_feed, 'calendar') else None
calendar_dates_df = geo_feed.calendar_dates if hasattr(geo_feed, 'calendar_dates') else None

# transfers: keep only those whose both endpoints are stops kept in nm_stops.
# transfer_df stays the unfiltered table so re-running this cell is safe.
transfer_df = geo_feed.transfers
in_scope_stop_ids = nm_stops["stop_id"]
nm_transfers = transfer_df.loc[
    transfer_df['to_stop_id'].isin(in_scope_stop_ids)
    & transfer_df['from_stop_id'].isin(in_scope_stop_ids)
]

# agency: a single hand-written VGN record replaces geo_feed.agency
nm_agency = pd.DataFrame(
    [['VGN', 'VGN', 'http://www.vgn.de', 'Europe/Berlin', 'DE', '+49 (0)911 27075-99']],
    columns=[
        'agency_id',
        'agency_name',
        'agency_url',
        'agency_timezone',
        'agency_lang',
        'agency_phone'
    ]
)

# feed_info: single hand-written record
# NOTE(review): feed dates and contact details are hard-coded placeholders;
# confirm them against the source feed before publishing.
nm_feed_info = pd.DataFrame({
    'feed_publisher_name': ['VGN'],
    'feed_publisher_url': ['http://www.vgn.de'],
    'feed_lang': ['de'],
    'feed_version': ['1.0'],
    'feed_start_date': ['20250810'],
    'feed_end_date': ['20251108'],
    "feed_contact_email": ["email@email.com"],
    'feed_contact_url': ["http://url.com"]
})

# Point every selected route at the hand-written agency and override the
# long name of one specific route.
nm_routes.loc[:, "agency_id"] = "VGN"
nm_routes.loc[nm_routes["route_id"] == "39-565-j25-2", "route_long_name"] = "Neumarkt Bahnhof - Zurück"

# Output file name -> table; consumed by the export cells further down.
gtfs_tables = {
    "trips.csv": nm_trips,
    "stop_times.csv": nm_stop_times,
    "stops.csv": nm_stops,
    "routes.csv": nm_routes,
    "transfers.csv": nm_transfers,
    "agency.csv": nm_agency,
    "feed_info.csv": nm_feed_info
}

# Calendars are limited to the services used by the selected trips.
if calendar_df is not None:
    gtfs_tables["calendar.csv"] = calendar_df[calendar_df['service_id'].isin(nm_trips['service_id'])]
if calendar_dates_df is not None:
    gtfs_tables["calendar_dates.csv"] = calendar_dates_df[calendar_dates_df['service_id'].isin(nm_trips['service_id'])]


Filtered routes for bus lines [561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 573, 574, 575]:


  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


In [4]:
# Display the geometry column of the filtered stops
nm_stops.loc[:, "geometry"]

1911    POINT (11.46522 49.27316)
1912    POINT (11.46522 49.27324)
1914    POINT (11.47749 49.29389)
1916    POINT (11.47284 49.26636)
1917    POINT (11.47309 49.26627)
                  ...            
2169    POINT (11.43514 49.27684)
2170    POINT (11.43542 49.27681)
2171    POINT (11.48979 49.28614)
2172    POINT (11.52801 49.27605)
2173    POINT (11.49016 49.28449)
Name: geometry, Length: 212, dtype: geometry

In [6]:
import os

# Directory to save parquet files
parquet_dir = "neumarkt_parquet"
os.makedirs(parquet_dir, exist_ok=True)

for name, table in gtfs_tables.items():
    # Tables carrying a 'geometry' column are written with plain lat/lon
    # columns in its place.
    if 'geometry' in table.columns:
        points = table['geometry']
        # assign() returns a new frame, so the table held in gtfs_tables is
        # left untouched. Assigning columns onto it directly would modify the
        # shared frame (and trigger SettingWithCopyWarning), changing what the
        # other cells see depending on execution order.
        table = table.assign(
            lat=points.apply(lambda g: g.y if g is not None else None),
            lon=points.apply(lambda g: g.x if g is not None else None),
        ).drop(columns=['geometry'])
    parquet_path = os.path.join(parquet_dir, name.replace('.csv', '.parquet'))
    table.to_parquet(parquet_path, index=False)
    print(f"Saved {name} as {parquet_path}")


Saved trips.csv as neumarkt_parquet/trips.parquet
Saved stop_times.csv as neumarkt_parquet/stop_times.parquet
Saved stops.csv as neumarkt_parquet/stops.parquet
Saved routes.csv as neumarkt_parquet/routes.parquet
Saved transfers.csv as neumarkt_parquet/transfers.parquet
Saved agency.csv as neumarkt_parquet/agency.parquet
Saved feed_info.csv as neumarkt_parquet/feed_info.parquet
Saved calendar.csv as neumarkt_parquet/calendar.parquet
Saved calendar_dates.csv as neumarkt_parquet/calendar_dates.parquet


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [4]:
import os
import formatting
import csv

# Directory to save CSV files instead of a zip
csv_dir = "neumarkt_gtfs"
os.makedirs(csv_dir, exist_ok=True)

# Preferred leading column order for the stops table (field order of the GTFS
# Schedule reference for stops.txt). Columns missing from the frame are
# skipped; columns not listed here are appended after these.
stops_column_order = [
    'stop_id',
    'stop_code',
    'stop_name',
    'tts_stop_name',
    'stop_desc',
    'stop_lat',
    'stop_lon',
    'zone_id',
    'stop_url',
    'location_type',
    'parent_station',
    'stop_timezone',
    'wheelchair_boarding',
    'level_id',
    'platform_code',
]

for fname, df in gtfs_tables.items():
    csv_path = os.path.join(csv_dir, fname)
    # Project-local helper; its behaviour is not visible from this notebook.
    df = formatting.format_df_for_gtfs(df)
    # Sort columns for the stops table. The keys of gtfs_tables end in ".csv",
    # so the comparison must use "stops.csv" for this branch to ever run.
    if fname == "stops.csv":
        # Only keep columns that exist in the DataFrame, preserve order, append any extra columns at the end
        ordered_cols = [col for col in stops_column_order if col in df.columns]
        extra_cols = [col for col in df.columns if col not in ordered_cols]
        df = df[ordered_cols + extra_cols]
    # NOTE(review): QUOTE_NONE without an escapechar makes to_csv raise if a
    # field contains the delimiter; presumably format_df_for_gtfs takes care
    # of quoting - confirm.
    df.to_csv(csv_path, index=False, quoting=csv.QUOTE_NONE, encoding='utf-8')
    print(f"Saved {fname} as {csv_path}")

Saved trips.csv as neumarkt_gtfs/trips.csv
Saved stop_times.csv as neumarkt_gtfs/stop_times.csv
Saved stops.csv as neumarkt_gtfs/stops.csv
Saved routes.csv as neumarkt_gtfs/routes.csv
Saved transfers.csv as neumarkt_gtfs/transfers.csv
Saved agency.csv as neumarkt_gtfs/agency.csv
Saved feed_info.csv as neumarkt_gtfs/feed_info.csv
Saved calendar.csv as neumarkt_gtfs/calendar.csv
Saved calendar_dates.csv as neumarkt_gtfs/calendar_dates.csv


In [None]:
# Function to add a new trip to a route using a reference trip and a time offset
def add_trip_with_offset(reference_trip_id, time_offset_minutes, trips_df, stop_times_df):
    """
    Adds a new trip based on a reference trip, shifting all stop_times by a given offset.

    The new trip gets a freshly generated UUID as trip_id; every other trip
    attribute and every stop_times row is copied from the reference trip.
    The input DataFrames are not modified.

    Time values in 'arrival_time' / 'departure_time' are handled as follows:
      * missing values (NaN/None) are kept as they are;
      * numeric values are treated as seconds and shifted by offset * 60
        (NOTE(review): partridge appears to load times this way - confirm);
      * "H:MM:SS" strings are shifted and re-emitted as "HH:MM:SS"; hours may
        be 24 or more, as GTFS uses for service running past midnight;
      * strings that do not parse as H:MM:SS are kept unchanged (best effort).

    Args:
        reference_trip_id (str): The trip_id to use as a template.
        time_offset_minutes (int): Minutes to add to all times (may be negative).
        trips_df (pd.DataFrame): The trips table to update.
        stop_times_df (pd.DataFrame): The stop_times table to update.
    Returns:
        (trips_df, stop_times_df): Updated DataFrames.
    Raises:
        ValueError: If reference_trip_id is not in trips_df, or if the offset
            would move a time before 00:00:00.
    """
    import numbers
    import uuid

    # Find the reference trip row
    ref_trip = trips_df[trips_df['trip_id'] == reference_trip_id]
    if ref_trip.empty:
        raise ValueError(f"Reference trip_id {reference_trip_id} not found.")

    # Copy the trip row under a new unique trip_id
    new_trip_id = str(uuid.uuid4())
    new_trip = ref_trip.iloc[0].to_dict()
    new_trip['trip_id'] = new_trip_id
    trips_df = pd.concat([trips_df, pd.DataFrame([new_trip])], ignore_index=True)

    offset_seconds = time_offset_minutes * 60

    def _check_not_negative(total_seconds, original):
        # A time before 00:00:00 cannot be represented; fail loudly rather
        # than write a corrupt schedule.
        if total_seconds < 0:
            raise ValueError(
                f"Offset of {time_offset_minutes} minutes moves time {original!r} before 00:00:00."
            )

    def shift_time(t):
        if pd.isna(t):
            return t
        # Numeric times: seconds, shifted arithmetically.
        if isinstance(t, numbers.Real):
            shifted = t + offset_seconds
            _check_not_negative(shifted, t)
            return shifted
        # String times: parsed by hand because strptime("%H") rejects the
        # hours >= 24 that GTFS uses for trips running past midnight.
        try:
            hours, minutes, seconds = (int(part) for part in str(t).strip().split(":"))
        except ValueError:
            return t
        if hours < 0 or not (0 <= minutes < 60) or not (0 <= seconds < 60):
            return t
        total = int(round(hours * 3600 + minutes * 60 + seconds + offset_seconds))
        _check_not_negative(total, t)
        hours, remainder = divmod(total, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02}:{minutes:02}:{seconds:02}"

    # Copy the reference stop_times, re-label them and shift their times
    new_stop_times = stop_times_df[stop_times_df['trip_id'] == reference_trip_id].copy()
    new_stop_times['trip_id'] = new_trip_id
    for col in ['arrival_time', 'departure_time']:
        if col in new_stop_times.columns:
            new_stop_times[col] = new_stop_times[col].apply(shift_time)
    stop_times_df = pd.concat([stop_times_df, new_stop_times], ignore_index=True)
    return trips_df, stop_times_df



Index(['stop_id', 'stop_name', 'location_type', 'parent_station', 'geometry'], dtype='object')


0        POINT (10.14257 49.17557)
1        POINT (10.14247 49.17563)
2        POINT (10.12359 49.12538)
3        POINT (10.12366 49.12534)
4        POINT (10.12152 49.12358)
                   ...            
23336    POINT (10.69042 48.95366)
23337     POINT (11.1686 50.35533)
23338    POINT (11.01247 49.32776)
23339    POINT (11.01229 49.32776)
23340    POINT (11.01928 49.33709)
Name: geometry, Length: 23341, dtype: geometry

In [None]:
# List every bus line (route_short_name) present in the feed.
# The feed already loaded as geo_feed is reused instead of parsing the zip a
# second time. The result gets its own name so the hand-picked bus_lines list
# that drives the filtering cell is not overwritten when cells are re-run.
# Routes without a short name are skipped: NaN cannot be sorted together with
# strings.
all_bus_lines = sorted(geo_feed.routes['route_short_name'].dropna().unique())
print("List of bus lines:")
for line in all_bus_lines:
    print(line)