In [None]:
import glob
import os
import re
from datetime import date

import numpy as np
import pandas as pd


In [None]:

def clean_ems(ems_path):
    """
    Clean the EMS dispatch dataset.

    Steps, in order:
      1. Read every column as text.
      2. Parse the known date/time columns (unparseable values become NaT).
      3. Extract ``lon`` / ``lat`` from the WKT-style ``case_location``
         column (``POINT (<lon> <lat>)``).
      4. Keep only rows whose ``Unit Type`` mentions MEDIC / AMBUL / ALS /
         RESCUE (case-insensitive) and whose ``Call Type`` contains
         "Medical Incident". Each filter is skipped if its column is absent.
      5. Drop rows lacking a dispatch time or a coordinate.
      6. Keep a fixed subset of columns and add a 0-based ``event_id``.

    Parameters
    ----------
    ems_path : str, path-like or file-like
        Source of the raw EMS CSV; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per retained record, re-indexed from 0, with the selected
        source columns that exist in the file plus ``lat``, ``lon`` and
        ``event_id``.

    Raises
    ------
    KeyError
        If ``Dispatch DtTm`` or ``case_location`` is missing from the file.
        Both are needed to build a usable event, so this fails up front
        with a message naming the missing column(s).
    """
    ems = pd.read_csv(ems_path, dtype=str)

    required = ["Dispatch DtTm", "case_location"]
    missing = [c for c in required if c not in ems.columns]
    if missing:
        raise KeyError(f"EMS file is missing required column(s): {missing}")

    dt_cols = ["Dispatch DtTm", "Response DtTm", "On Scene DtTm",
               "Received DtTm", "Entry DtTm", "Call Date"]
    for col in dt_cols:
        if col in ems.columns:
            ems[col] = pd.to_datetime(ems[col], errors="coerce")

    # One vectorised pass over the location text. The pattern tolerates
    # leading whitespace and any amount of whitespace between the two
    # numbers; rows that do not match come back as NaN and are removed by
    # the dropna further down.
    coords = ems["case_location"].str.extract(
        r"^\s*POINT[^(]*\(\s*(?P<lon>[^\s()]+)\s+(?P<lat>[^\s()]+)"
    )
    ems["lon"] = pd.to_numeric(coords["lon"], errors="coerce")
    ems["lat"] = pd.to_numeric(coords["lat"], errors="coerce")

    if "Unit Type" in ems.columns:
        is_medical_unit = ems["Unit Type"].str.upper().str.contains(
            "MEDIC|AMBUL|ALS|RESCUE", na=False
        )
        ems = ems[is_medical_unit]

    if "Call Type" in ems.columns:
        is_medical_call = ems["Call Type"].str.contains(
            "Medical Incident", na=False
        )
        ems = ems[is_medical_call]

    ems = ems.dropna(subset=["Dispatch DtTm", "lat", "lon"])

    wanted = [
        "Call Number",
        "Incident Number",
        "Call Type",
        "Call Type Group",
        "Unit Type",
        "Dispatch DtTm",
        "Response DtTm",
        "On Scene DtTm",
        "Priority",
        "Original Priority",
        "City",
        "Address",
        "lat",
        "lon",
    ]
    present = [c for c in wanted if c in ems.columns]

    ems_events = ems[present].copy().reset_index(drop=True)
    # event_id mirrors the fresh 0..n-1 index so it survives later merges.
    ems_events["event_id"] = ems_events.index

    return ems_events


In [None]:
# Clean the raw EMS dispatch log and persist the result next to the notebook.
EMS_RAW_CSV = "../data/1_full_dataset/ems_logs.csv"
EMS_CLEAN_CSV = 'cleaned_emergency_logs.csv'

ems_events = clean_ems(EMS_RAW_CSV)
# NOTE(review): to_csv keeps its default index=True, so the 0..n-1 index is
# written as an unnamed first column alongside event_id. Left unchanged in
# case a downstream reader relies on that layout - confirm before changing.
ems_events.to_csv(path_or_buf=EMS_CLEAN_CSV)


In [None]:


def clean_pems_metadata(meta_path, sep=","):
    """
    Clean a PeMS station metadata file.

    Expected input columns (any subset beyond the coordinates is tolerated):
    'ID', 'Fwy', 'Dir', 'District', 'Latitude', 'Longitude', 'Type',
    'Lanes', 'Name', 'City'.

    Processing:
      * 'Fwy' -> 'Freeway' and 'Dir' -> 'Freeway Direction', unless the
        target name already exists.
      * 'Latitude', 'Longitude', 'Lanes', 'District' are converted to
        numbers (unparseable values become NaN).
      * Rows without both coordinates are dropped.
      * If present, 'District' must equal 4 and 'Type' must equal "ML"
        (mainline).

    Parameters
    ----------
    meta_path : str, path-like or file-like
        Source of the metadata table; handed to ``pd.read_csv``.
    sep : str, default ","
        Field delimiter forwarded to ``pd.read_csv``. The default keeps the
        previous comma-separated behaviour.
        NOTE(review): the notebook feeds this function a ``.txt`` export;
        if that file is tab-delimited the caller must pass a tab here -
        confirm against the raw file.

    Returns
    -------
    pandas.DataFrame
        Filtered stations, re-indexed from 0, restricted to whichever of
        ID, Freeway, Freeway Direction, District, Latitude, Longitude,
        Type, Lanes, Name, City exist in the input (in that order).

    Raises
    ------
    KeyError
        If 'Latitude' or 'Longitude' is missing - a station without
        coordinates cannot be used, so this fails with an explicit message.
    """
    meta = pd.read_csv(meta_path, sep=sep, dtype=str)

    # Only these two source names differ from the standard names; rename
    # them unless the standard name is already taken.
    aliases = {"Fwy": "Freeway", "Dir": "Freeway Direction"}
    renames = {
        raw: std
        for raw, std in aliases.items()
        if raw in meta.columns and std not in meta.columns
    }
    meta = meta.rename(columns=renames)

    missing = [c for c in ("Latitude", "Longitude") if c not in meta.columns]
    if missing:
        raise KeyError(
            f"PeMS metadata is missing required column(s): {missing}; "
            f"columns found: {list(meta.columns)}"
        )

    for col in ["Latitude", "Longitude", "Lanes", "District"]:
        if col in meta.columns:
            meta[col] = pd.to_numeric(meta[col], errors="coerce")

    meta = meta.dropna(subset=["Latitude", "Longitude"])

    if "District" in meta.columns:
        meta = meta[meta["District"] == 4]

    if "Type" in meta.columns:
        meta = meta[meta["Type"] == "ML"]

    # Final columns to keep, in output order.
    wanted = [
        "ID",
        "Freeway",
        "Freeway Direction",
        "District",
        "Latitude",
        "Longitude",
        "Type",
        "Lanes",
        "Name",
        "City",
    ]
    keep = [c for c in wanted if c in meta.columns]

    return meta[keep].reset_index(drop=True)


In [None]:
# Clean the PeMS station metadata export and store it with the other
# cleaned datasets.
PEMS_META_RAW = '../data/1_raw_data/d04_text_meta_2025_01_15.txt'
PEMS_META_CLEAN = '../data/3_clean_dataset/cleaned_station_metadata.csv'

meta_data = clean_pems_metadata(PEMS_META_RAW)
# to_csv keeps its default index=True, so the 0..n-1 index is written as an
# unnamed first column.
meta_data.to_csv(path_or_buf=PEMS_META_CLEAN)

In [None]:


# Names given, by position, to the leading columns of a station_5min file.
BASE_COLS = [
    "Timestamp",
    "Station",
    "District",
    "Freeway",
    "Direction of Travel",
    "Lane Type",
    "Station Length",
    "Samples",
    "% Observed",
    "Total Flow",
    "Avg Occupancy",
    "Avg Speed",
]

def clean_pems_day_txt(fp):
    """
    Clean ONE PeMS station_5min daily txt file.

    - Reads the file as comma-separated text WITHOUT a header row. Column
      names are assigned by position, so letting pandas treat the first
      line as a header would silently discard the first record and name
      the remaining columns after that record's values. If a file does
      carry a header line, its "Timestamp" cell fails to parse and the row
      is removed by the dropna below.
    - Names the leading columns after BASE_COLS; any further columns get
      stable positional names ``col_<index>`` so every day's frame has the
      same labels.
    - Parses Timestamp (unparseable values become NaT).
    - Converts the key numeric columns (unparseable values become NaN).
    - Adds ``file_date`` taken from the trailing ``_YYYY_MM_DD`` of the
      file name.
    - Drops rows without a Timestamp (and without a Station, when that
      column exists), then drops exact duplicate rows.

    Parameters
    ----------
    fp : str
        Path to the daily file; its base name must end in
        ``_<year>_<month>_<day>`` before the extension.

    Returns
    -------
    pandas.DataFrame
        The cleaned records for that day (original row index preserved).

    Raises
    ------
    ValueError
        If a date cannot be read from the file name.
    """
    df = pd.read_csv(fp, sep=",", dtype=str, header=None)

    # Positional naming: known names first, then col_<i> for the remainder.
    n_cols = df.shape[1]
    n_named = min(len(BASE_COLS), n_cols)
    df.columns = BASE_COLS[:n_named] + [f"col_{i}" for i in range(n_named, n_cols)]

    # Parse timestamp
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

    num_cols = ["Station Length", "Samples", "% Observed", "Total Flow",
                "Avg Occupancy", "Avg Speed"]
    for col in num_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # The last three underscore-separated tokens of the name are Y, M, D.
    stem, _ext = os.path.splitext(os.path.basename(fp))
    try:
        year, month, day = (int(token) for token in stem.split("_")[-3:])
    except ValueError as exc:
        raise ValueError(
            f"cannot read a _YYYY_MM_DD date from file name {stem!r}"
        ) from exc
    df["file_date"] = date(year, month, day)

    key_cols = ["Timestamp", "Station"] if "Station" in df.columns else ["Timestamp"]
    df = df.dropna(subset=key_cols)

    df = df.drop_duplicates()

    return df


In [None]:
# Clean every daily 5-minute station file found under DATA_DIR.
# `glob` and `os` come from the notebook's top import cell.
DATA_DIR = "../data/1_raw_data"
txt_files = sorted(glob.glob(DATA_DIR + "/d04_text_station_5min_2025_*.txt"))

if not txt_files:
    # An empty match would otherwise look like a clean run with zero errors.
    print("no files matched in:", DATA_DIR)

pems_days_clean = []  # one cleaned DataFrame per file that parsed
bad_files = []        # (path, repr(error)) for each file that failed

for fp in txt_files:
    fname = os.path.basename(fp)
    try:
        day_df = clean_pems_day_txt(fp)
    except Exception as e:
        # Best-effort batch: record the failure and continue with the
        # remaining days instead of aborting the whole run.
        bad_files.append((fp, repr(e)))
        print("skip:", fname, "->", repr(e))
    else:
        pems_days_clean.append(day_df)
        print("cleaned:", fname, "rows:", len(day_df))

print("bad files:", len(bad_files))
