## Gather Data

In [147]:
import requests, io
import pandas as pd
import zipfile
from gurobipy import Model, GRB, quicksum
from IPython.display import clear_output
from collections import defaultdict
import math

In [None]:


ridership_df = pd.DataFrame()

links = [
    'https://www.bart.gov/sites/default/files/2025-11/Ridership_202510.xlsx',
    'https://www.bart.gov/sites/default/files/2025-10/Ridership_202509.xlsx',
    'https://www.bart.gov/sites/default/files/2025-10/Ridership_202508.xlsx',
    'https://www.bart.gov/sites/default/files/2025-08/Ridership_202507.xlsx',
    'https://www.bart.gov/sites/default/files/2025-07/Ridership_202506.xlsx',
    'https://www.bart.gov/sites/default/files/2025-06/Ridership_202505.xlsx',
    'https://www.bart.gov/sites/default/files/2025-05/Ridership_202504.xlsx',
    'https://www.bart.gov/sites/default/files/2025-05/Ridership_202503.xlsx',
    'https://www.bart.gov/sites/default/files/2025-03/Ridership_202502.xlsx',
    'https://www.bart.gov/sites/default/files/2025-02/Ridership_202501.xlsx'
]

def extract_date_from_link(link):
    """
    Extract substring between 'Ridership_' and '.xlsx'.
    Example: 'Ridership_202510.xlsx' → '202510'
    """
    start = link.find("Ridership_") + len("Ridership_")
    end = link.find(".xlsx")
    return link[start:end]

def set_header_row(df, keyword="Exit Station Two-Letter Code"):
    """
    Find the row containing keyword and use it as the column headers.
    Return the cleaned DataFrame.
    """
    # locate the header row index
    header_idx = df.index[df.apply(lambda row: row.astype(str).str.contains(keyword).any(), axis=1)]
    
    if len(header_idx) == 0:
        # no header row found — return unchanged
        return df
    
    header_idx = header_idx[0]

    # set the header
    new_header = df.iloc[header_idx]
    df = df[header_idx + 1 :]  # drop the header row and all above it
    df.columns = new_header

    return df.reset_index(drop=True)


for link in links:
    response = requests.get(link)

    df = pd.read_excel(
        io.BytesIO(response.content),
        engine='openpyxl',
        header=None,                    
        sheet_name='Average Weekday'
    )

    df = set_header_row(df)

    # Add extracted date
    df["date"] = extract_date_from_link(link)            

    ridership_df = pd.concat([ridership_df, df], ignore_index=True)


In [99]:
# https://www.bart.gov/sites/default/files/docs/station-names.xls

station_map_df = pd.DataFrame({
    "code": [
        "RM","EN","EP","NB","BK","AS","MA","19","12","LM","FV","CL","SL","BF","HY","SH",
        "UC","FM","CN","PH","WC","LF","OR","RR","OW","EM","MT","PL","CC","16","24","GP",
        "BP","DC","CM","CV","ED","NC","WP","SS","SB","SO","MB","WD","OA","WS","AN","PC",
        "ML","BE"
    ],
    "stop_name": [
        "Richmond","El Cerrito Del Norte","El Cerrito Plaza","North Berkeley",
        "Downtown Berkeley","Ashby","MacArthur","19th Street Oakland",
        "12th Street / Oakland City Center","Lake Merritt","Fruitvale","Coliseum - OAC",
        "San Leandro","Bay Fair","Hayward","South Hayward","Union City","Fremont",
        "Concord","Pleasant Hill / Contra Costa Centre","Walnut Creek","Lafayette","Orinda","Rockridge",
        "West Oakland","Embarcadero","Montgomery Street","Powell Street",
        "Civic Center / UN Plaza","16th Street / Mission","24th Street / Mission","Glen Park",
        "Balboa Park","Daly City","Colma","Castro Valley","Dublin / Pleasanton",
        "North Concord / Martinez","Pittsburg / Bay Point","South San Francisco","San Bruno",
        "San Francisco International Airport","Millbrae (Caltrain Transfer Platform)","West Dublin / Pleasanton",
        "Oakland International Airport Station","Warm Springs / South Fremont","Antioch","Pittsburg Center",
        "Milpitas","Berryessa / North San Jose"
    ]
})

In [100]:
GTFS_URL = "https://www.bart.gov/dev/schedules/google_transit.zip"

response = requests.get(GTFS_URL)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Load GTFS components into DataFrames
routes = pd.read_csv(z.open("routes.txt"))
stops = pd.read_csv(z.open("stops.txt"))
trips = pd.read_csv(z.open("trips.txt"), dtype={"route_id": str})
stop_times = pd.read_csv(z.open("stop_times.txt"))
calendar = pd.read_csv(z.open("calendar.txt"))

# Join trips with routes
trip_routes = trips.merge(routes, on="route_id", how="left")

# Join stop_times with stops
times_with_stops = stop_times.merge(stops, on="stop_id", how="left")

# Final joined table
full_schedule = (
    trip_routes
    .merge(times_with_stops, on="trip_id", how="left")
    .sort_values(["route_id", "trip_id", "stop_sequence"])
)

full_schedule['stop_name'] = full_schedule['stop_name'].str.replace("", "", regex=False)

mapped_schedule = full_schedule.merge(station_map_df, left_on="stop_name", right_on="stop_name", how="left")

# mapped_schedule.to_excel("C:/Users/chhri/Downloads/bart_full_schedule.xlsx")

In [150]:
df = ridership_df.rename(columns={'Exit Station Two-Letter Code': 'from_stop'})

# 2. Melt all OD columns into long form
df_long = df.melt(
    id_vars=['date', 'from_stop'],     # keep these as identifiers
    var_name='to_stop',                # new column representing destination
    value_name='ridership'             # ridership value
)

# 3. Drop rows where to_stop is NaN (your last row has a NaN column)
df_long = df_long.dropna(subset=['to_stop'])

# Optional: sort nicely
df_long = df_long.sort_values(['date', 'from_stop', 'to_stop']).reset_index(drop=True)

df_avg = (
    df_long
    .groupby(['from_stop', 'to_stop'], as_index=False)['ridership']
    .mean()
    .rename(columns={'ridership': 'avg_ridership'})
)

# Round up avg ridership to nearest integer and handle missing values
df_avg['avg_ridership'] = df_avg['avg_ridership'].apply(lambda x: int(math.ceil(x)) if pd.notna(x) else 0)

df_avg.head()

Unnamed: 0,from_stop,to_stop,avg_ridership
0,12,12,37
1,12,16,182
2,12,19,23
3,12,24,144
4,12,AN,91


In [152]:
stops_df = mapped_schedule.copy()

def parse_gtfs_time(t):
    """
    Parse time strings like '06:11:00' or '24:05:00' into minutes since midnight.
    GTFS allows hours >= 24, meaning times past midnight but same service day.
    """
    if pd.isna(t):
        return np.nan
    hh, mm, ss = map(int, str(t).split(":"))
    return hh * 60 + mm + ss / 60.0   # minutes since midnight (possibly > 24*60)

def compute_travel_times(stops_df):
    df = stops_df.copy()

    # parse times into numeric minutes
    df["arr_min"] = df["arrival_time"].apply(parse_gtfs_time)
    df["dep_min"] = df["departure_time"].apply(parse_gtfs_time)

    # sort properly
    df = df.sort_values(["route_id", "trip_id", "stop_sequence"])

    arc_rows = []

    for (route_id, trip_id), group in df.groupby(["route_id", "trip_id"]):
        group = group.sort_values("stop_sequence")

        dep_times = group["dep_min"].values
        arr_times = group["arr_min"].values
        seq      = group["stop_sequence"].values
        codes    = group["code"].values
        dep_str  = group["departure_time"].values
        arr_str  = group["arrival_time"].values

        for i in range(len(group) - 1):
            travel = arr_times[i+1] - dep_times[i]   # minutes
            travel_min = int(round(travel))

            arc_rows.append({
                "route_id": route_id,
                "trip_id": trip_id,
                "from_stop": codes[i],
                "to_stop": codes[i+1],
                "from_seq": seq[i],
                "to_seq": seq[i+1],
                "dep_time_str": dep_str[i],
                "arr_time_str": arr_str[i+1],
                "travel_time_min": travel_min
            })
    df = pd.DataFrame(arc_rows)

    arc_rows = df.merge(df_avg, how='left', on=['from_stop','to_stop'])

    return pd.DataFrame(arc_rows)

arc_df = compute_travel_times(stops_df)

def build_nodes(arc_df):
    # each "to" event defines a station–time node
    nodes = arc_df["to_stop"] + "_" + arc_df["arr_time_str"]
    V = list(nodes.unique())
    return V

V = build_nodes(arc_df)

def build_stage_arcs(arc_df):
    A_stage = []
    for row in arc_df.itertuples(index=False):
        arc_name = f"{row.from_stop}_{row.dep_time_str}__{row.to_stop}_{row.arr_time_str}"
        A_stage.append({
            "arc": arc_name,
            "from_node": row.from_stop,
            "depart_time": row.dep_time_str,
            "to_node":   row.to_stop,
            "arrive_time": row.arr_time_str,
            "avg_ridership": row.avg_ridership,
            "travel_time": row.travel_time_min,
            "trip_id":     row.trip_id
        })
    return A_stage

A_stage = build_stage_arcs(arc_df)

# index ridership by origin
ridership_df_idx = ridership_df.set_index("Exit Station Two-Letter Code")

def build_projected_ridership_dict(ridership_df_idx, growth_rate=1.0):
    """
    Returns a dict: projected_lookup[(origin, dest)] = scalar ridership.
    Applies an optional growth factor.
    """
    clean_index = ridership_df_idx.index
    clean_index = clean_index[~clean_index.isin(["Grand Total"])]
    clean_index = clean_index.dropna()

    clean_cols = ridership_df_idx.columns
    clean_cols = clean_cols[~clean_cols.isin(["Grand Total", "date"])]
    clean_cols = clean_cols.dropna()

    projected_lookup = {}

    for origin in clean_index:
        for dest in clean_cols:
            vals = ridership_df_idx.loc[origin, dest]

            if isinstance(vals, pd.Series):
                vals = vals.dropna()
                avg_ridership = vals.mean()
            else:
                avg_ridership = float(vals) if pd.notna(vals) else 0.0

            projected = avg_ridership * growth_rate
            projected_lookup[(origin, dest)] = float(projected)

    return projected_lookup

projected_lookup = build_projected_ridership_dict(ridership_df_idx, growth_rate=1.0)

def compute_demand_for_arcs(A_stage, projected_lookup, stops_df):
    """
    Compute scalar demand d_a for each arc using projected OD demand.
    d_a = sum of riders who board at origin and exit at any downstream stop
          along that trip.
    """
    # Precompute stop order for each trip
    stop_orders = {}
    for trip_id, group in stops_df.groupby("trip_id"):
        group_sorted = group.sort_values("stop_sequence")
        stop_orders[trip_id] = list(group_sorted["code"])

    demand_dict = {}

    for arc in A_stage:
        trip_id = arc["trip_id"]
        origin = arc["from_node"]
        dest   = arc["to_node"]
        stops = stop_orders[trip_id]
        idx_origin = stops.index(origin)
        idx_dest   = stops.index(dest)

        downstream = stops[idx_dest:]   # dest and all following stops

        d_sum = 0.0
        for down_station in downstream:
            if not isinstance(down_station, str):
                continue
            key = (origin, down_station)
            d_sum += projected_lookup.get(key, 0.0)

        demand_dict[arc["arc"]] = float(d_sum)

    return demand_dict

d = compute_demand_for_arcs(A_stage, projected_lookup, stops_df)

# Remove duplicates based on the 'arc' field
seen = set()
A_stage_unique = []

for a in A_stage:
    if a["arc"] not in seen:
        A_stage_unique.append(a)
        seen.add(a["arc"])

# Build node set and incidence lists
nodes = set()
incoming = defaultdict(list)  # incoming[v] = list of arc names that end at v
outgoing = defaultdict(list)  # outgoing[v] = list of arc names that start at v

for arc in A_stage_unique:
    a_name = arc["arc"]
    u = arc["from_node"]
    v = arc["to_node"]

    nodes.add(u)
    nodes.add(v)

    outgoing[u].append(a_name)
    incoming[v].append(a_name)

In [None]:

model = Model("Bart_Fleet_Optimization")

# six cars per train, 54 seats per car.
# t[a] = integer number of train units assigned to arc a
t = model.addVars(
    [arc["avg_ridership"] for arc in A_stage_unique],
    vtype=GRB.INTEGER,
    lb=0,
    name="t"
)

# Minimize the amount of cars
model.setObjective(
    quicksum(arc["avg_ridership"]/54 for arc in A_stage_unique),
    GRB.MINIMIZE,
    lb=1
)

# fleet-style objective (unweighted)
model.setObjective(
    quicksum(t[arc["arc"]] for arc in A_stage_unique),
    GRB.MINIMIZE
)

# Flow conservation constraints
for v in nodes:
    in_arcs = incoming.get(v, [])
    out_arcs = outgoing.get(v, [])

    # Skip boundary nodes (no in or no out)
    if len(in_arcs) == 0 or len(out_arcs) == 0:
        continue

    model.addConstr(
        quicksum(t[a] for a in in_arcs) == quicksum(t[a] for a in out_arcs),
        name=f"flow_{v}"
    )

m_seats_per_unit = 163  # seats per car / unit
n_max_units      = 5    # max units per train
seats_per_train = m_seats_per_unit * n_max_units

for arc in A_stage_unique:
    name = arc["arc"]
    model.addConstr(
        seats_per_train * t[name] >= d[name],
        name=f"demand_{name}"
    )

model.optimize()

solution = {arc["arc"]: t[arc["arc"]].X for arc in A_stage_unique}

Gurobi Optimizer version 12.0.3 build v12.0.3rc0 (win64 - Windows 11.0 (26100.2))

CPU model: 13th Gen Intel(R) Core(TM) i7-13700K, instruction set [SSE2|AVX|AVX2]
Thread count: 16 physical cores, 24 logical processors, using up to 24 threads

Optimize a model with 22651 rows, 14519 columns and 32882 nonzeros
Model fingerprint: 0x8c62ce51
Variable types: 0 continuous, 14519 integer (0 binary)
Coefficient statistics:
  Matrix range     [1e+00, 8e+02]
  Objective range  [1e+00, 1e+00]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+01, 7e+03]
Presolve removed 343 rows and 38 columns
Presolve time: 0.00s

Explored 0 nodes (0 simplex iterations) in 0.01 seconds (0.01 work units)
Thread count was 1 (of 24 available processors)

Solution count 0

Model is infeasible
Best objective -, best bound -, gap -


AttributeError: Unable to retrieve attribute 'X'

In [153]:
A_stage_unique

[{'arc': 'AN_05:48:00__PC_05:55:00',
  'from_node': 'AN',
  'depart_time': '05:48:00',
  'to_node': 'PC',
  'arrive_time': '05:55:00',
  'avg_ridership': 28.0,
  'travel_time': 7,
  'trip_id': 1754788},
 {'arc': 'PC_05:55:00__WP_06:04:00',
  'from_node': 'PC',
  'depart_time': '05:55:00',
  'to_node': 'WP',
  'arrive_time': '06:04:00',
  'avg_ridership': 10.0,
  'travel_time': 9,
  'trip_id': 1754788},
 {'arc': 'WP_06:05:00__NC_06:11:00',
  'from_node': 'WP',
  'depart_time': '06:05:00',
  'to_node': 'NC',
  'arrive_time': '06:11:00',
  'avg_ridership': 21.0,
  'travel_time': 6,
  'trip_id': 1754788},
 {'arc': 'NC_06:11:00__CN_06:15:00',
  'from_node': 'NC',
  'depart_time': '06:11:00',
  'to_node': 'CN',
  'arrive_time': '06:15:00',
  'avg_ridership': 27.0,
  'travel_time': 4,
  'trip_id': 1754788},
 {'arc': 'CN_06:15:00__PH_06:20:00',
  'from_node': 'CN',
  'depart_time': '06:15:00',
  'to_node': 'PH',
  'arrive_time': '06:20:00',
  'avg_ridership': 63.0,
  'travel_time': 5,
  'trip_