In [2]:
using Plots, Gurobi, CSV, DataFrames

In [7]:
"""
    load_repo_data(repo_path::AbstractString)

Recursively walk `repo_path`, find every file whose name ends in `.csv` or `.txt`
(case-insensitive, subfolders included), and parse each one into a `DataFrame`.

Files that cannot be parsed are skipped with a warning that carries the underlying
exception; they do not abort the scan.

# Arguments
- `repo_path::AbstractString`: The local path to the cloned repository.

# Returns
- `Dict{String, DataFrame}`: Maps a unique dataset name to its table. The name is the
  file name without extension. If that name is already taken, the parent folder name is
  prepended (`"<folder>_<name>"`); if that is taken as well, a numeric suffix
  (`_2`, `_3`, ...) is appended so that no previously loaded dataset is overwritten.
  An empty dictionary is returned when `repo_path` is not a directory.
"""
function load_repo_data(repo_path::AbstractString)
    data_store = Dict{String, DataFrame}()

    if !isdir(repo_path)
        @error "Directory not found: $repo_path"
        return data_store
    end

    @info "Recursively scanning directory: $repo_path"

    # CSV options (assume headers exist)
    csv_options = (header=true, stringtype=String)
    files_found = 0

    # walkdir descends into subdirectories (e.g., /data, /src)
    for (root, _, files) in walkdir(repo_path)
        for file in files
            lowered = lowercase(file)
            (endswith(lowered, ".csv") || endswith(lowered, ".txt")) || continue

            files_found += 1
            full_path = joinpath(root, file)
            dataset_name = _unique_dataset_name(data_store, first(splitext(file)), root)

            try
                @info "Parsing: $full_path"
                data_store[dataset_name] = CSV.read(full_path, DataFrame; csv_options...)
            catch err
                # Never swallow a user interrupt (Ctrl-C) as a "malformed file".
                err isa InterruptException && rethrow()
                # Only warn, don't crash, if a file is malformed -- but keep the cause.
                @warn "Skipping $file: Unable to parse as CSV table." exception =
                    (err, catch_backtrace())
            end
        end
    end

    if files_found == 0
        @warn "No CSV or TXT files were found in $repo_path or its subdirectories."
        @info "Top-level contents of $repo_path: $(readdir(repo_path))"
    else
        @info "Successfully loaded $(length(data_store)) datasets."
    end

    return data_store
end

# Return a key for `stem` that is not yet present in `store`: the bare stem if free,
# otherwise "<parent folder>_<stem>", otherwise that with a numeric suffix.
function _unique_dataset_name(store::AbstractDict, stem::AbstractString, root::AbstractString)
    haskey(store, stem) || return String(stem)

    prefixed = "$(basename(root))_$(stem)"
    candidate = prefixed
    suffix = 2
    while haskey(store, candidate)
        candidate = "$(prefixed)_$(suffix)"
        suffix += 1
    end
    return candidate
end
"""
    summarize_data(data::Dict{String, DataFrame})

Print a short report to standard output for every dataset in `data`: its name,
its dimensions (rows × columns), and a comma-separated list of its column names.
Datasets are reported in the dictionary's iteration order.
"""
function summarize_data(data::Dict{String, DataFrame})
    println("\n--- Data Summary ---")
    foreach(pairs(data)) do (label, table)
        n_rows, n_cols = nrow(table), ncol(table)
        column_list = join(names(table), ", ")

        println("Dataset: '$label'")
        println("  Shape: $n_rows rows × $n_cols columns")
        println("  Cols:  $column_list")
        println("--------------------")
    end
end

# 1. Scan from "." -- a relative path, i.e. the directory the notebook is run from
repo_path = "." 

# 2. Load the data (every .csv/.txt under repo_path, keyed by dataset name)

subway_data = load_repo_data(repo_path)

# 3. Print summary
summarize_data(subway_data)



[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mRecursively scanning directory: .
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\MTA_Subway_Hourly_Ridership__Oct_21_2024_Evening.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\MTA_Subway_Hourly_Ridership__Oct_21_2024_Morning.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\MTA_Subway_Stations_20251204.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\agency.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\calendar.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\calendar_dates.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\linecapacity.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\linelength.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\routes.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\shapes.txt
[36m[1m[ [22m[39m[3


--- Data Summary ---
Dataset: 'routes'
  Shape: 29 rows × 10 columns
  Cols:  route_id, agency_id, route_short_name, route_long_name, route_desc, route_type, route_url, route_color, route_text_color, route_sort_order
--------------------
Dataset: 'nodes_with_ridership'
  Shape: 475 rows × 9 columns
  Cols:  node_idx, stop_id, stop_name, stop_lon, stop_lat, station_complex_id, ridership_morning, ridership_evening, net_ridership
--------------------
Dataset: 'stops'
  Shape: 1488 rows × 6 columns
  Cols:  stop_id, stop_name, stop_lat, stop_lon, location_type, parent_station
--------------------
Dataset: 'trips'
  Shape: 20304 rows × 6 columns
  Cols:  route_id, trip_id, service_id, trip_headsign, direction_id, shape_id
--------------------
Dataset: 'MTA_Subway_Aggregated_Ridership_Oct_21_2024_Evening'
  Shape: 424 rows × 2 columns
  Cols:  station_complex_id, ridership
--------------------
Dataset: 'agency'
  Shape: 1 rows × 6 columns
  Cols:  agency_id, agency_name, agency_url, agency_

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\stops.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\transfers.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\datasets\trips.txt
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_graphs\edges_by_route.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_graphs\nodes.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_graphs\nodes_with_ridership.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_graphs\routes.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_graphs\stop_routes.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_turnstile_data\MTA_Subway_Aggregated_Ridership_Oct_21_2024_Evening.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mParsing: .\generated_turnstile_data\MTA_Subway_Aggregated_Ridership_Oct_21_2024_Morning.csv
[36m[1m[ [22m[39m[36m[1mInfo: [22m

In [None]:
using JuMP
using Gurobi

"""
    build_subway_model(V, E_track, E_transfer, L, L_ij; kwargs...) -> model

Builds the Gurobi/JuMP model for the subway optimization problem with:
- track edges E_track
- transfer edges E_transfer
- line set L
- per-edge line sets L_ij

This ONLY builds the optimizer; it does not read any CSVs and does not call
`optimize!`. You must construct the sets and parameter dictionaries before calling it.

Arguments
---------
V::Vector{Station}                # stations (any index type: Int, String, etc.)
E_track::Vector{Tuple{Station,Station}}
E_transfer::Vector{Tuple{Station,Station}}
L::Vector{Line}
L_ij::Dict{Tuple{Station,Station},Vector{Line}}
                                  # track edges missing from L_ij get no flow variables

Keyword parameters (all REQUIRED)
---------------------------------
s::Dict{Station,Float64}                             # s_i, net outflow required at station i
t::Dict{Tuple{Station,Station,Line},Float64}         # t_{ijℓ}, objective cost per unit of x_{ijℓ}
t_tr::Dict{Tuple{Station,Station},Float64}           # t^{tr}_{ij}, objective cost per unit of y_{ij}
C_train::Float64                                     # C_train
Δ::Float64                                           # Δ
τ::Dict{Line,Float64}                                # τ_ℓ, weight of f_ℓ in the T_max limit
energy::Dict{Line,Float64}                           # energy_ℓ
T_max::Float64                                       # T_max, bound on sum_ℓ f_ℓ τ_ℓ
β::Float64                                           # β, overflow penalty weight
γ::Float64                                           # γ, energy weight
λ::Float64                                           # λ ∈ [0,1]

Optional
--------
shared_track_constraint::Bool = true

Model
-----
Variables (all ≥ 0): x[(i,j,ℓ)] flow on track edge (i,j) via line ℓ, y[(i,j)] flow on
transfer edge (i,j), f[ℓ] frequency of line ℓ, overflow[(i,j,ℓ)].

Constraints:
1. flow conservation per station: outflow - inflow == s[i]
2. per-line capacity: x - overflow <= C_train * f[ℓ] * Δ
3. (optional) the same capacity relation summed over all lines sharing an edge
4. sum_ℓ f[ℓ] * τ[ℓ] <= T_max

Objective: minimize
(1 - λ) * (passenger time + β * total overflow) + λ * γ * sum_ℓ energy[ℓ] * f[ℓ].

Throws
------
ArgumentError if λ is outside [0, 1].

Returns
-------
::JuMP.Model (with Gurobi as optimizer)
"""
function build_subway_model(
    V,
    E_track,
    E_transfer,
    L,
    L_ij;
    s,
    t,
    t_tr,
    C_train,
    Δ,
    τ,
    energy,
    T_max,
    β,
    γ,
    λ,
    shared_track_constraint::Bool = true,
)
    # The objective is a convex combination only for λ in [0, 1]; outside that range
    # one of the two weights becomes negative.
    0 <= λ <= 1 || throw(ArgumentError("λ must lie in [0, 1], got $λ"))

    # -------------------------
    # Create model
    # -------------------------
    model = Model(Gurobi.Optimizer)

    # -------------------------
    # Index sets for variables
    # -------------------------
    # Triplets (i,j,ℓ) where ℓ actually serves edge (i,j)
    no_lines = Vector{eltype(L)}()
    track_triplets = Tuple{eltype(V),eltype(V),eltype(L)}[]
    for (i, j) in E_track, ℓ in get(L_ij, (i, j), no_lines)
        push!(track_triplets, (i, j, ℓ))
    end

    # Incidence lists, built once so that flow conservation does not rescan every
    # edge for every station.
    track_out = _group_by_endpoint(track_triplets, 1)
    track_in = _group_by_endpoint(track_triplets, 2)
    transfer_out = _group_by_endpoint(E_transfer, 1)
    transfer_in = _group_by_endpoint(E_transfer, 2)
    no_triplets = eltype(track_triplets)[]
    no_transfers = eltype(E_transfer)[]

    # -------------------------
    # Decision variables
    # -------------------------

    # x_{ijℓ} ≥ 0 : passenger flow on track edge (i,j) via line ℓ
    @variable(model, x[track_triplets] >= 0)

    # y_{ij} ≥ 0 : passenger flow on transfer edge (i,j)
    @variable(model, y[E_transfer] >= 0)

    # f_ℓ ≥ 0 : train frequency on line ℓ
    @variable(model, f[L] >= 0)

    # overflow_{ijℓ} ≥ 0
    @variable(model, overflow[track_triplets] >= 0)

    # -------------------------
    # Constraints
    # -------------------------
    # Each @constraint receives a single comparison expression: JuMP does not accept
    # a multi-statement `begin ... end` block there. Sums inside the macros are
    # well-defined (zero) when a station has no incident edges.

    # (1) Flow conservation at each station i ∈ V:
    #     (outgoing track + outgoing transfer) - (incoming track + incoming transfer)
    @constraint(
        model,
        [i in V],
        sum(x[tr] for tr in get(track_out, i, no_triplets)) +
        sum(y[e] for e in get(transfer_out, i, no_transfers)) -
        sum(x[tr] for tr in get(track_in, i, no_triplets)) -
        sum(y[e] for e in get(transfer_in, i, no_transfers)) == s[i]
    )

    # (2) Capacity constraints on track edges (tr[3] is the line ℓ of the triplet)
    @constraint(
        model,
        [tr in track_triplets],
        x[tr] - overflow[tr] <= C_train * f[tr[3]] * Δ
    )

    # (3) Optional shared physical track constraints
    #     Sum over lines on each (i,j). Edges without any line are skipped instead of
    #     receiving a dummy constraint.
    #     NOTE(review): as formulated this is the sum of constraint (2) over the lines
    #     of an edge and is therefore implied by (2); confirm whether a separate
    #     per-edge capacity was intended.
    if shared_track_constraint
        served_edges = [
            (i, j) for (i, j) in E_track if !isempty(get(L_ij, (i, j), no_lines))
        ]
        @constraint(
            model,
            [e in served_edges],
            sum(x[(e[1], e[2], ℓ)] - overflow[(e[1], e[2], ℓ)] for ℓ in L_ij[e]) <=
            sum(C_train * f[ℓ] * Δ for ℓ in L_ij[e])
        )
    end

    # (4) Fleet / energy limit:
    @constraint(model, sum(f[ℓ] * τ[ℓ] for ℓ in L) <= T_max)

    # -------------------------
    # Objective
    # -------------------------
    # The terms are built with @expression rather than plain `sum`: JuMP's macros
    # build the affine expressions in place and return zero for an empty index set,
    # whereas `sum` over an empty generator throws.

    # Passenger time part:
    # sum_{(i,j)∈E_track, ℓ∈L_ij} t_{ijℓ} x_{ijℓ}
    # + sum_{(i,j)∈E_transfer} t^{tr}_{ij} y_{ij}
    passenger_time_expr = @expression(
        model,
        sum(t[tr] * x[tr] for tr in track_triplets) +
        sum(t_tr[(i, j)] * y[(i, j)] for (i, j) in E_transfer)
    )

    # Overflow penalty: β * sum overflow_{ijℓ}
    overflow_expr = @expression(model, β * sum(overflow[tr] for tr in track_triplets))

    # Energy use: γ * sum energy_ℓ * f_ℓ
    energy_expr = @expression(model, γ * sum(energy[ℓ] * f[ℓ] for ℓ in L))

    @objective(
        model,
        Min,
        (1 - λ) * (passenger_time_expr + overflow_expr) + λ * energy_expr
    )

    return model
end

# Group `items` (tuples) by the station stored at tuple position `pos`, returning a
# dictionary station => vector of the items incident to it.
function _group_by_endpoint(items, pos::Integer)
    groups = Dict{Any,Vector{eltype(items)}}()
    for item in items
        push!(get!(() -> eltype(items)[], groups, item[pos]), item)
    end
    return groups
end
