In [1]:
import numpy as np
import os
import pyarrow
import sys
import json
import math
import mpl_utils
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import polars as pl
import xml.etree.ElementTree as ET

from xopen import xopen
from datetime import time

In [2]:
# Directory holding MATSim's experienced plans (read below as parquet tables).
PLAN_PATH = "/Users/andre/Desktop/Cergy/Python_Scripts/runs/fixed_10pct/matsim"

# Directory where the Metropolis output is stored.
METROPOLIS_OUTPUT = "/Users/andre/Desktop/Cergy/Python_Scripts/runs/fixed_10pct/output"

# MATSim network file.
NETWORK_PATH = "/Users/andre/Desktop/Cergy/MATSim/matsim-berlin/input/v6.4/berlin-v6.4-network.xml.gz"

# Directory the resulting MATSim trips table is written to.
# NOTE(review): this points at the `pt_10pct` run while the inputs above use
# `fixed_10pct` -- confirm that the mismatch is intended.
MATSIM_TRIPS = "/Users/andre/Desktop/Cergy/Python_Scripts/runs/pt_10pct/matsim_trips/"

# MATSim vehicle types file.
VEHICLE_PATH = "/Users/andre/Desktop/Cergy/MATSim/matsim-berlin/input/v6.4/berlin-v6.4-vehicleTypes.xml"


In [3]:
def hhmmss_str_to_seconds_expr(col: str) -> pl.Expr:
    """Build an expression that converts a colon-separated time column to seconds.

    Each value is split on ``":"``; the first three fields are weighted by
    3600, 60 and 1 respectively and summed. Values that are not strings, or
    that contain no ``":"``, become null.

    Args:
        col: Name of the column holding the time strings.

    Returns:
        An ``Int32`` expression aliased ``"<col>_secs"``.
    """
    weights = [3600, 60, 1]

    def _to_seconds(value):
        # Anything that is not a colon-separated string maps to null.
        if not (isinstance(value, str) and ":" in value):
            return None
        fields = map(int, value.split(":"))
        return sum(field * weight for field, weight in zip(fields, weights))

    seconds = pl.col(col).map_elements(_to_seconds, return_dtype=pl.Int32)
    return seconds.alias(f"{col}_secs")

In [4]:
def read_matsim_plans():
    """Read the five MATSim parquet tables located in ``PLAN_PATH``.

    Returns:
        A tuple ``(persons, plans, activities, legs, routes)`` of polars
        DataFrames, in that order.
    """
    file_names = (
        "MATSim_persons.parquet",
        "MATSim_plans.parquet",
        "MATSim_activities.parquet",
        "MATSim_legs.parquet",
        "MATSim_routes.parquet",
    )
    # Read the files in the same order the tuple is returned in.
    tables = [pl.read_parquet(os.path.join(PLAN_PATH, file_name)) for file_name in file_names]
    return tuple(tables)

# Create MATSim plan sequence (act-leg-act…) 

In [None]:
def generate_sequence(activities, legs, routes):
    """Prepare the activity and leg tables used to build the plan sequence.

    Legs are left-joined with their routes (``legs.id`` = ``routes.leg_id``),
    the leg id is exposed again as ``leg_id`` and ``dep_time`` is converted to
    seconds (``dep_time_secs``). Activities lose the ``facility``,
    ``initialEndTime`` and ``orig_duration`` columns and gain
    ``end_time_secs`` and ``max_dur_secs``. ``trav_time`` is left untouched.

    NOTE(review): the interleaving of activities and legs into a single
    act-leg-act sequence is not implemented in this function yet.

    Args:
        activities: MATSim activities table.
        legs: MATSim legs table.
        routes: MATSim routes table, keyed by ``leg_id``.

    Returns:
        A tuple ``(activities, legs)`` with the prepared frames. The function
        previously computed both frames and then discarded them by returning
        ``None``.
    """
    legs_prepared = (
        legs
        .join(routes, how='left', left_on='id', right_on='leg_id')
        .with_columns(
            pl.col('id').alias('leg_id'),
            hhmmss_str_to_seconds_expr("dep_time"),
        )
    )

    activities_prepared = (
        activities
        .drop(["facility", "initialEndTime", "orig_duration"])
        .with_columns(
            hhmmss_str_to_seconds_expr("end_time"),
            hhmmss_str_to_seconds_expr("max_dur"),
        )
    )

    return activities_prepared, legs_prepared

In [6]:
# Load the raw MATSim tables here so the cell works on a fresh kernel
# (Restart & Run All) and can be re-run without stacking the join and the
# column drops on already-transformed frames.
persons, plans, activities, legs, routes = read_matsim_plans()

# Attach the route of each leg, expose the leg id as `leg_id` again and
# convert the departure time to seconds (`dep_time_secs`).
# `trav_time` is deliberately left as it is.
legs = (
    legs
    .join(routes, how='left', left_on='id', right_on='leg_id')
    .with_columns(
        pl.col('id').alias('leg_id'),
        hhmmss_str_to_seconds_expr("dep_time"),
    )
)

# Drop the unused activity columns and convert the activity times to seconds
# (`end_time_secs`, `max_dur_secs`).
activities = (
    activities
    .drop(["facility", "initialEndTime", "orig_duration"])
    .with_columns(
        hhmmss_str_to_seconds_expr("end_time"),
        hhmmss_str_to_seconds_expr("max_dur"),
    )
)

In [7]:
# Even seq IDs for activities
# (assumes `cum_count` starts at 1, so the first activity of a plan gets 0).
activities = activities.with_columns([
    ((pl.cum_count("plan_id").over("plan_id") - 1) * 2).alias('seq_index'),
    pl.lit('activity').alias('element_type'),
    pl.col('type').alias('type_or_mode'),
    # Reuse the seconds already parsed into `max_dur_secs` instead of running
    # the Python-level time parser over `max_dur` a second time.
    pl.col("max_dur_secs").cast(pl.Float64).alias("duration"),
    # An activity happens on a single link, which therefore serves as its
    # route, start link and end link.
    pl.col('link').alias('route'),
    pl.col('link').alias('start_link'),
    pl.col('link').alias('end_link')
])

# Odd seq IDs for legs, so they interleave with the activities of the plan
legs = legs.with_columns([
    ((pl.cum_count("plan_id").over("plan_id") - 1) * 2 + 1).alias('seq_index'),
    pl.lit('leg').alias('element_type'),
    pl.col('mode').alias('type_or_mode'),
    pl.col('trav_time').alias('duration'),
    pl.col('value').alias('route')
])


### Add leg and activity start/end times

In [8]:
# Time columns of the activities, keyed by (plan_id, seq_index).
# The leg-only columns are added as typed nulls so the frame can be stacked
# with the leg counterpart.
activities_secs = activities.select(
    "plan_id",
    ((pl.cum_count("plan_id").over("plan_id") - 1) * 2).alias("seq_index"),
    "end_time_secs",
    "max_dur_secs",
    pl.lit(None).cast(pl.Int32).alias("dep_time_secs"),
    pl.lit(None).cast(pl.Float64).alias("trav_time_secs"),
)

In [9]:
# Time columns of the legs, keyed by (plan_id, seq_index).
# The activity-only columns are added as typed nulls so the frame can be
# stacked with the activity counterpart.
legs_secs = legs.select(
    "plan_id",
    ((pl.cum_count("plan_id").over("plan_id") - 1) * 2 + 1).alias("seq_index"),
    pl.lit(None).cast(pl.Int32).alias("end_time_secs"),
    pl.lit(None).cast(pl.Int32).alias("max_dur_secs"),
    "dep_time_secs",
    pl.col("trav_time").alias('trav_time_secs'),
)

In [10]:
# Stack the activity and leg time columns into one lookup table.
extra_cols = pl.concat([activities_secs, legs_secs], how="vertical")

## Create the trips data frame

In [11]:
# Columns shared by activities and legs in the combined trips table.
clean_cols = [
    "plan_id",
    "seq_index",
    "element_type",
    "type_or_mode",
    "start_link",
    "end_link",
    "route",
    "duration",
]

# Reduce both tables to the shared columns, in the same order.
activities_clean, legs_clean = (frame.select(clean_cols) for frame in (activities, legs))

In [12]:
matsim_trips = (
    # Interleave activities and legs of every plan by their sequence index.
    pl.concat([activities_clean, legs_clean])
    .sort(['plan_id', 'seq_index'])
    # Every leg opens a new trip: the running count of legs within a plan is
    # the trip id, shared by the leg and the elements that follow it up to the
    # next leg.
    .with_columns(
        (pl.col('element_type') == 'leg')
        .cast(pl.Int8)
        .cum_sum()
        .over('plan_id')
        .alias('trip_id')
    )
    # Bring in the time columns collected for activities and legs.
    .join(extra_cols, on=["plan_id", "seq_index"], how="left")
)

# Get MATSim trips

## Record start and end times for activities and legs

In [13]:
# The look-back below is positional, so make the plan/sequence order explicit:
# the row order coming out of a join is not something to rely on.
matsim_trips = matsim_trips.sort(["plan_id", "seq_index"])

# Departure and travel time of the preceding element, looked up within the
# same plan so that values never leak across plan boundaries.
matsim_trips = matsim_trips.with_columns([
    pl.col("dep_time_secs").shift(1).over("plan_id").alias("prev_leg_dep_secs"),
    pl.col("trav_time_secs").shift(1).over("plan_id").alias("prev_leg_trav_secs"),
])

# Activity duration
matsim_trips = matsim_trips.with_columns([
    # Preferred source: the activity's own maximum duration.
    pl.when((pl.col("element_type") == "activity") & pl.col("max_dur_secs").is_not_null())
      .then(pl.col("max_dur_secs"))

    # Otherwise: end time minus the arrival time of the preceding leg
    # (its departure plus its travel time).
    .when((pl.col("element_type") == "activity") &
          pl.col("end_time_secs").is_not_null() &
          pl.col("prev_leg_dep_secs").is_not_null() &
          pl.col("prev_leg_trav_secs").is_not_null())
      .then(pl.col("end_time_secs") - (pl.col("prev_leg_dep_secs") + pl.col("prev_leg_trav_secs")))

    # Legs, and activities without enough information, stay null.
    .otherwise(None)
    .alias("activity_duration_secs")
])

In [14]:
# Collapse the element-specific durations into the single `duration` column:
# activities use their activity duration, legs their travel time.
matsim_trips = matsim_trips.with_columns(
    pl.when(pl.col("element_type") == "activity")
    .then(pl.col("activity_duration_secs"))
    .when(pl.col("element_type") == "leg")
    .then(pl.col("trav_time_secs"))
    .otherwise(None)
    .alias("duration")
)

# Arrival time = departure + duration. `dep_time_secs` is only filled for
# legs, so the value is null for activities.
matsim_trips = matsim_trips.with_columns(
    (pl.col('dep_time_secs') + pl.col('duration')).alias('arrival_time')
)

In [15]:
# Start times
matsim_trips = matsim_trips.with_columns([
    # Legs start when they depart.
    pl.when(pl.col("element_type") == "leg")
      .then(pl.col("dep_time_secs"))

    # Activities with a known end time start `duration` seconds before it.
    .when((pl.col("element_type") == "activity") & pl.col("end_time_secs").is_not_null())
      .then(pl.col("end_time_secs") - pl.col("duration"))

    # Remaining activities start when the preceding leg arrives.
    .when((pl.col("element_type") == "activity") & pl.col("prev_leg_dep_secs").is_not_null())
      .then(pl.col("prev_leg_dep_secs") + pl.col("prev_leg_trav_secs"))

    .otherwise(None)
    .alias("start_time_secs")
])

# End times
matsim_trips = matsim_trips.with_columns([
    pl.when(pl.col("element_type") == "leg")
      .then(pl.col("dep_time_secs") + pl.col("duration"))

    # For activities, recompute the end from start + duration, but keep the
    # end time that was already known when the start or the duration is
    # missing. Without the fallback a known end time would be replaced by null.
    .when(pl.col("element_type") == "activity")
      .then(
          pl.coalesce(
              pl.col("start_time_secs") + pl.col("duration"),
              pl.col("end_time_secs").cast(pl.Float64),
          )
      )

    .otherwise(None)
    .alias("end_time_secs")
])

In [16]:
# Keep only the final variables, in presentation order; the helper columns
# used to derive the times are dropped here.
matsim_trips = matsim_trips.select(
    "plan_id",
    "trip_id",
    "seq_index",
    "element_type",
    "type_or_mode",
    "start_time_secs",
    "end_time_secs",
    "duration",
    "route",
    "start_link",
    "end_link",
)

In [17]:
# Attach the person each plan belongs to (`plans.id` matches `plan_id`).
matsim_trips = matsim_trips.join(
    plans.select('id', 'person_id'),
    how='left',
    left_on='plan_id',
    right_on='id',
)

## Explore activity types to define tours (i.e. Metro agents)

### Save the activities in a separate data frame

In [19]:
# Look for activity types that have an end_time; these act as tour anchors.
# `.unique()` already removes duplicates, so no further de-duplication is needed.
tour_anchor_types = (
    activities.filter(pl.col("end_time").is_not_null())
    .select("type").unique().to_series().to_list()
)

matsim_trips = (
    matsim_trips
    # The running sum below is positional, so make the plan/sequence order
    # explicit: the row order coming out of a join is not something to rely on.
    .sort(["plan_id", "seq_index"])
    # Create a tour flag
    .with_columns([
        pl.col("type_or_mode").is_in(tour_anchor_types)
        .alias("is_tour_anchor")
    ])
    # Create tours: the tour id is the running count of anchors within a plan.
    .with_columns([
        pl.col("is_tour_anchor")
          .cast(pl.Int32)
          .cum_sum()
          .over("plan_id")
          .alias("tour_id")
    ])
)

# MATSim trips

## Summarize per Trips

Create Metropolis `stopping_time` from activity duration

In [21]:
# Activity rows only; their duration is exposed as `stopping_time`.
# All other columns are kept as they are.
stopping_time_df = (
    matsim_trips
    .filter(pl.col("element_type") == "activity")
    .with_columns(stopping_time=pl.col("duration"))
    .sort(['plan_id', 'trip_id'])
)

In [23]:
trip_summary = (
    matsim_trips
    # One row per leg.
    .filter(pl.col("element_type") == "leg")
    # Rename variables
    .rename({
        "start_time_secs": "start_time",
        "end_time_secs": "end_time",
        "type_or_mode": "mode",
    })
    # Travel time per trip
    .with_columns((pl.col("end_time") - pl.col("start_time")).alias("duration"))
    .select(
        "person_id", "plan_id", "tour_id", "trip_id", "seq_index", "mode",
        "start_time", "end_time", "duration", "route", "start_link", "end_link",
    )
    .sort(["plan_id", "trip_id", "tour_id"])
    # Join stopping_time: the activity rows sharing the leg's (plan_id, trip_id).
    # Columns present on both sides come back with a `_right` suffix.
    .join(stopping_time_df, on=["plan_id", "trip_id"], how="left")
)

In [24]:
# First invalid trip of every plan. A trip is invalid when its duration
# exceeds 86400 seconds (one day) or its stopping time is negative.
invalid_starts = (
    trip_summary
    .filter((pl.col("duration") > 86400) | (pl.col("stopping_time") < 0))
    .group_by("plan_id")
    .agg(pl.col("trip_id").min().alias("first_invalid_trip"))
)

# Keep plans without invalid trips entirely; for the others keep only the
# trips that come before the first invalid one.
trips_cleaned = (
    trip_summary
    .join(invalid_starts, on="plan_id", how="left")
    .filter(
        pl.col("first_invalid_trip").is_null()
        | (pl.col("trip_id") < pl.col("first_invalid_trip"))
    )
    # Remove the helper column and the duplicated columns of the activity side.
    # NOTE(review): `seq_index` is dropped while `seq_index_right` is kept --
    # confirm this is intended.
    .drop(
        "first_invalid_trip",
        "duration_right",
        'route_right',
        'start_link_right',
        'end_link_right',
        'person_id_right',
        'tour_id_right',
        'seq_index',
    )
)

In [25]:
# Create the output directory if needed so the write does not fail on a
# missing folder (no effect when it already exists).
os.makedirs(MATSIM_TRIPS, exist_ok=True)

print("Writing files to", MATSIM_TRIPS)
trips_cleaned.write_parquet(os.path.join(MATSIM_TRIPS, "MATSim_trips.parquet"))

Writing files to /Users/andre/Desktop/Cergy/Python_Scripts/runs/pt_10pct/matsim_trips/
