In [1]:
import os
import gzip
import xml.etree.ElementTree as ET
import pandas as pd

In [5]:
def extract_data_from_xml(file_path):
    """Collect the 'vehicle leaves traffic' events from a gzipped XML file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the gzip-compressed XML events file.

    Returns
    -------
    list of dict
        One dict per matching ``<event>`` element, in document order, with the
        keys ``"time"``, ``"link"`` and ``"vehicle"``. Values are the raw
        attribute strings, or ``None`` when an attribute is absent.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    xml.etree.ElementTree.ParseError
        If the decompressed content is not well-formed XML.
    """
    # Binary mode on purpose: the XML parser then decodes the bytes using the
    # encoding declared in the document itself. Text mode ('rt') would decode
    # with the platform's default encoding first, which can fail or corrupt
    # non-ASCII attribute values.
    with gzip.open(file_path, 'rb') as file:
        root = ET.parse(file).getroot()

    return [
        {
            "time": event.get("time"),
            "link": event.get("link"),
            "vehicle": event.get("vehicle"),
        }
        for event in root.findall(".//event[@type='vehicle leaves traffic']")
    ]

In [11]:
def process_directory(directory):
    """Parse every ``*.xml.gz`` file found under ``directory``.

    The directory tree is walked recursively. Each matching file name is
    split on ``'-'``: the first piece is taken as the scenario and the
    leading digits of the last piece as the seed, so
    ``2-12-141_trucks.events.xml.gz`` yields seed ``"141"`` and scenario
    ``"2"``.

    Parameters
    ----------
    directory : str or path-like
        Root folder to search.

    Returns
    -------
    dict
        Maps ``(seed, scenario)`` string tuples to whatever
        ``extract_data_from_xml`` returns for that file. If several files
        share the same pair, the one visited last wins.
    """
    all_events = {}
    for root_dir, _sub_dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".xml.gz"):
                continue
            print(file)  # progress trace: one line per file being parsed
            file_path = os.path.join(root_dir, file)

            last_piece = file.split('-')[-1]
            # Use the whole run of leading digits as the seed rather than a
            # fixed three characters, so seeds of any length are read in full
            # (a two-digit seed used to come out as e.g. "42_").
            digit_count = len(last_piece) - len(last_piece.lstrip("0123456789"))
            # No leading digits at all: keep the former three-character prefix.
            seed = last_piece[:digit_count] if digit_count else last_piece[:3]
            scenario = file.split('-')[0]
            all_events[(seed, scenario)] = extract_data_from_xml(file_path)
    return all_events

In [8]:
# Load the arrival-times table. The next cell reads its "Seed", "Scenario" and
# "Incident Link" columns, adds per-truck columns, and writes the result back
# to this same CSV path.
df = pd.read_csv("arrival_times.csv")

In [17]:
def _ordinal(position):
    """Return the English ordinal label for a positive integer (1 -> '1st')."""
    # 11, 12 and 13 (and 111, 212, ...) take 'th' despite ending in 1, 2 or 3.
    if 11 <= position % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
    return f"{position}{suffix}"


all_events = process_directory("truck_events")

# For every parsed events file, find the rows of `df` with the same seed and
# scenario and record each event that occurred on the row's incident link in a
# numbered pair of columns (arrival time + vehicle id), creating the columns
# the first time they are needed.
for (seed, scenario), events in all_events.items():
    # Keys are strings taken from the file name; the CSV columns are compared
    # as integers, so a non-numeric seed/scenario raises ValueError here.
    same_run = (df["Seed"] == int(seed)) & (df["Scenario"] == int(scenario))
    for index, row in df[same_run].iterrows():
        incident_link = row["Incident Link"]
        # Link ids parsed from the XML are strings, hence the str() on the CSV value.
        # NOTE(review): if pandas reads "Incident Link" as float, str() gives
        # e.g. "12.0" and nothing will match - confirm the column's dtype.
        matching_events = [event for event in events if event["link"] == str(incident_link)]

        for i, event in enumerate(matching_events):
            # Column names follow the event's position: 1st, 2nd, 3rd, 4th,
            # 5th, ... with no upper limit on the number of trucks.
            rank = _ordinal(i + 1)
            col_time = f"{rank} Truck Arrival [HH:MM:SS]"
            col_vehicle = f"{rank} vehicle_id"

            # Create whichever column of the pair is missing. They are checked
            # separately so a table that already has only one of them neither
            # raises on a duplicate insert nor ends up with the vehicle column
            # away from its time column.
            if col_time not in df.columns:
                df[col_time] = ''
            if col_vehicle not in df.columns:
                # Place vehicle_id immediately to the right of its time column.
                df.insert(df.columns.get_loc(col_time) + 1, col_vehicle, '')

            # The raw "time" attribute is stored as-is here; the HH:MM:SS
            # formatting promised by the header is applied by the later cell
            # that rewrites the "Truck Arrival" columns.
            df.at[index, col_time] = event["time"]
            df.at[index, col_vehicle] = event["vehicle"]

df.to_csv("arrival_times.csv", index=False)


2-12-141_trucks.events.xml.gz
3-12-141_trucks.events.xml.gz
2-18-167_trucks.events.xml.gz
3-18-167_trucks.events.xml.gz
2-19-227_trucks.events.xml.gz
3-19-227_trucks.events.xml.gz
2-12-340_trucks.events.xml.gz
3-12-340_trucks.events.xml.gz
2-7-398_trucks.events.xml.gz
3-7-398_trucks.events.xml.gz
2-11-418_trucks.events.xml.gz
3-11-418_trucks.events.xml.gz
2-21-472_trucks.events.xml.gz
3-21-472_trucks.events.xml.gz
2-18-499_trucks.events.xml.gz
3-18-499_trucks.events.xml.gz
2-10-518_trucks.events.xml.gz
3-10-518_trucks.events.xml.gz
2-20-584_trucks.events.xml.gz
3-20-584_trucks.events.xml.gz
2-19-637_trucks.events.xml.gz
3-19-637_trucks.events.xml.gz
2-4-723_trucks.events.xml.gz
3-4-723_trucks.events.xml.gz
2-11-790_trucks.events.xml.gz
3-11-790_trucks.events.xml.gz
2-11-847_trucks.events.xml.gz
3-11-847_trucks.events.xml.gz
2-19-879_trucks.events.xml.gz
3-19-879_trucks.events.xml.gz
2-5-886_trucks.events.xml.gz
3-5-886_trucks.events.xml.gz
2-7-907_trucks.events.xml.gz
3-7-907_trucks.events.xml.gz

In [18]:
def seconds_to_hms(seconds):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Each component is truncated to an integer, so fractional seconds are
    dropped. The hour field is at least two digits wide and grows as needed
    for durations of 100 hours or more.
    """
    whole_hours, within_hour = divmod(seconds, 3600)
    whole_minutes = within_hour // 60
    leftover_seconds = seconds % 60
    components = (whole_hours, whole_minutes, leftover_seconds)
    return ":".join(f"{int(part):02}" for part in components)

def _to_hms_if_seconds(value):
    """Format ``value`` as HH:MM:SS when it is a number of seconds.

    Missing values are returned unchanged, and so are strings that are not
    plain numbers (for example cells that already hold ``HH:MM:SS`` text).
    This makes it safe to run the conversion more than once on the same file.
    """
    if pd.isnull(value):
        return value
    if isinstance(value, str):
        try:
            value = float(value)
        except ValueError:
            # Not a number of seconds - leave the text exactly as it is.
            return value
    return seconds_to_hms(value)


# Load the CSV file into a DataFrame
df = pd.read_csv("arrival_times.csv")

# Find all columns that contain "Truck Arrival" in their name
arrival_columns = [col for col in df.columns if "Truck Arrival" in col]

# Convert the seconds to HH:MM:SS format. Values that are already formatted
# are skipped, so re-running this cell on its own output no longer raises a
# TypeError from applying arithmetic to "HH:MM:SS" strings.
for col in arrival_columns:
    df[col] = df[col].apply(_to_hms_if_seconds)

# Save the DataFrame back to the CSV
df.to_csv("arrival_times.csv", index=False)