# RT metadata migration

This notebook was used to migrate some old RT metadata. Our general pattern is to identify all the files and then iterate over them, manually mapping each file's old metadata (in dictionary form) onto the new metadata class (`GTFSRTFeedExtract` in this case), which is then used to set the new metadata.

In [None]:
# Run this cell first, then restart the kernel before continuing;
# pendulum is imported by the cells below.
%pip install pendulum

In [None]:
from calitp_data.storage import get_fs

# Shared filesystem handle used by later cells (fs.stat, fs.getxattr, fs.ls).
fs = get_fs()

In [None]:
# configuration
# Bucket whose RT files are listed and whose metadata is read by the cells below.
RT_BUCKET = "test-calitp-gtfs-rt-raw"

In [None]:
import gzip
import os
from functools import lru_cache
from typing import Dict

import pendulum
from calitp_data_infra.storage import (
    AirtableGTFSDataExtract,
    AirtableGTFSDataRecord,
    get_latest_file,
)

# os.environ["CALITP_BUCKET__AIRTABLE"] =


@lru_cache
def get_airtable_gtfs_records_for_day(
    dt: pendulum.Date,
) -> Dict[str, AirtableGTFSDataRecord]:
    """Load the latest Airtable GTFS dataset extract for one day, keyed by record id.

    The extract file is located with ``get_latest_file`` (restricted to the
    ``dt`` partition), gunzipped, and parsed as one JSON object per line.

    Results are memoized per ``dt`` by ``lru_cache``; the returned dict is the
    cached object itself, so callers should treat it as read-only.

    Args:
        dt: The day whose extract should be loaded.

    Returns:
        Mapping of ``record.id`` to the parsed ``AirtableGTFSDataRecord``.
    """
    # Imported locally because the notebook-level ``import json`` lives in a
    # later cell; without this the function raises NameError on a fresh kernel.
    import json

    file = get_latest_file(
        # AirtableGTFSDataExtract.bucket,
        "gs://test-calitp-airtable",
        AirtableGTFSDataExtract.table,
        prefix_partitions={
            "dt": dt,
        },
        partition_types={
            "ts": pendulum.DateTime,
        },
    )

    with get_fs().open(file.name, "rb") as f:
        content = gzip.decompress(f.read())

    records = [
        AirtableGTFSDataRecord(**json.loads(row))
        for row in content.decode().splitlines()
        if row.strip()  # tolerate blank lines in the JSONL payload
    ]

    return {record.id: record for record in records}


def schedule_record_ids_for_validation_to_actual_url(dt, record_ids):
    """Resolve a schedule record id list to the ``uri`` of the referenced record.

    Args:
        dt: Day used to pick which Airtable extract the id is looked up in.
        record_ids: Sequence of record ids; may be ``None`` or empty.

    Returns:
        ``None`` when ``record_ids`` is empty/``None``; otherwise the ``uri``
        attribute of the single referenced record.

    Raises:
        RuntimeError: If more than one record id is supplied.
        KeyError: If the id is not present in that day's records.
    """
    if not record_ids:
        return None
    if len(record_ids) > 1:
        # Only a single id can be mapped to a single URL.
        raise RuntimeError(
            f"expected at most one schedule record id for {dt}, "
            f"got {len(record_ids)}: {record_ids}"
        )
    records = get_airtable_gtfs_records_for_day(dt)
    return records[record_ids[0]].uri


# Smoke check: load the Airtable records for one day and show how many there are.
len(get_airtable_gtfs_records_for_day(pendulum.Date(2022, 8, 15)))

In [None]:
from google.cloud import storage

# tqdm is used in this cell but the notebook only imports it in a later cell;
# import it here so the cell runs on a fresh kernel.
from tqdm.auto import tqdm

client = storage.Client(project="cal-itp-data-infra")
# List every object under one sample day of vehicle positions.
blobs = list(
    tqdm(
        client.list_blobs(
            RT_BUCKET, prefix="vehicle_positions/dt=2022-08-20/", delimiter=None
        )
    )
)
len(blobs), blobs[0]

In [None]:
# Inspect the metadata attribute of the first listed blob.
blobs[0].metadata

In [None]:
# Get the list of RT files to update
# from tqdm.notebook import tqdm
from tqdm.auto import tqdm


def get_files_type_day(typ, dt, leave_pbar=True):
    """List the paths of the RT files stored for one feed type and day.

    Args:
        typ: Top-level prefix inside ``RT_BUCKET`` (e.g. ``"vehicle_positions"``).
        dt: Day string used in the ``dt=`` partition (e.g. ``"2022-08-12"``).
        leave_pbar: Forwarded to tqdm's ``leave`` option.

    Returns:
        A list of ``gs://<bucket>/<object name>`` strings for every listed
        object that ``fs.stat`` does not report as a directory.
    """
    # proto_files = fs.expand_path(f'gs://{RT_BUCKET}/{typ}/dt={dt}', recursive=True)
    blobs = client.list_blobs(RT_BUCKET, prefix=f"{typ}/dt={dt}/", delimiter=None)
    # list_blobs yields Blob objects, but fs.stat here (and fs.getxattr in
    # get_metadata) take path strings, so convert before filtering.
    paths = (f"gs://{RT_BUCKET}/{blob.name}" for blob in blobs)
    return [
        path
        for path in tqdm(
            paths, desc="Filtering out directories", leave=leave_pbar
        )
        if fs.stat(path)["type"] != "directory"
    ]


import json

# Attribute name under which each file's metadata JSON is stored; it is read
# back with fs.getxattr in get_metadata.
PARTITIONED_ARTIFACT_METADATA_KEY = "PARTITIONED_ARTIFACT_METADATA"


def get_metadata(files, leave_pbar=True):
    """Fetch and parse the partitioned-artifact metadata attribute of each file.

    Args:
        files: Iterable of files to look up via ``fs.getxattr``.
        leave_pbar: Forwarded to tqdm's ``leave`` option.

    Returns:
        A ``(found, absent)`` tuple: ``found`` is a list of
        ``(file, parsed_metadata)`` pairs, ``absent`` lists the files for
        which the lookup raised ``KeyError``.
    """
    found, absent = [], []
    for path in tqdm(files, desc="Getting metadata", leave=leave_pbar):
        try:
            raw = fs.getxattr(path, PARTITIONED_ARTIFACT_METADATA_KEY)
            parsed = json.loads(raw)
        except KeyError:
            absent.append(path)
        else:
            found.append((path, parsed))
    return found, absent

In [None]:
# Load one sample type/day. The listing helper defined in this notebook is
# get_files_type_day; the previous name, handle_type_day, is not defined anywhere.
files = get_files_type_day(typ="vehicle_positions", dt="2022-08-12")
metadatas, missing = get_metadata(files)
len(files), len(metadatas), len(missing)

In [None]:
# Entries whose old metadata has no "config" key, shown next to the files that
# had no metadata attribute at all.
files_without_config = list(
    filter(lambda entry: "config" not in entry[1], metadatas)
)
files_without_config, missing

In [None]:
# Quick look at the loaded metadata: the entry with the largest "ts" value, and
# the distinct auth_query_param configs (JSON-encoded so they can go in a set).
(
    sorted(metadatas, key=lambda pair: pair[1]["ts"], reverse=True)[0],
    {
        json.dumps(old_meta["config"]["auth_query_param"])
        for _, old_meta in metadatas
    },
    # set(json.dumps(metadata["config"]["auth_headers"]) for (file, metadata) in metadatas),
)

In [None]:
import pendulum
from calitp_data_infra.storage import (
    PARTITIONED_ARTIFACT_METADATA_KEY,
    GTFSDownloadConfig,
    GTFSRTFeedExtract,
)


def update_metadata(filepath, meta, write=False):
    """Build the new-style ``GTFSRTFeedExtract`` for one file from its old metadata.

    Args:
        filepath: Path of the file being migrated. Only relevant to the
            write-back step, which is currently disabled.
        meta: Old metadata dict. Must contain ``"config"`` and ``"ts"``; every
            remaining key is forwarded to ``GTFSRTFeedExtract`` as a keyword
            argument. The dict is shallow-copied, so the caller's dict keeps
            its keys.
        write: Intended to persist the new metadata. The write call is
            commented out, so nothing is written even when this is ``True``.

    Returns:
        The constructed ``GTFSRTFeedExtract``, so a dry run can be inspected.

    Raises:
        KeyError: If ``meta`` or its config lacks a required key.
        AssertionError: If the config is empty.
        RuntimeError: Propagated from
            ``schedule_record_ids_for_validation_to_actual_url`` when more
            than one schedule record id is configured.
    """
    meta = meta.copy()
    config = meta.pop("config")
    ts = pendulum.parse(meta.pop("ts"), exact=True)
    assert config
    uri = config["uri"]
    # The old metadata did not record auth headers, so they are derived from
    # the feed host. NOTE(review): the values look like secret *names* rather
    # than literal keys — confirm.
    if "goswift.ly" in uri:
        headers = {"authorization": "SWIFTLY_AUTHORIZATION_KEY_CALITP"}
    elif "west-hollywood" in uri:
        headers = {"x-umo-iq-api-key": "WEHO_RT_KEY"}
    else:
        headers = {}
    schedule_url = schedule_record_ids_for_validation_to_actual_url(
        dt=ts.date(), record_ids=config["schedule_to_use_for_rt_validation"]
    )
    extract = GTFSRTFeedExtract(
        ts=ts,
        config=GTFSDownloadConfig(
            name=config.get("name"),
            url=uri,
            feed_type=config["data"],
            # Previously hard-coded to None, which discarded the URL resolved above.
            schedule_url_for_validation=schedule_url,
            auth_query_params=config["auth_query_param"],
            auth_headers=headers,
        ),
        **meta,
    )
    if write:
        # Write-back is intentionally disabled.
        # NOTE(review): the commented call below also lacks the target path
        # (filepath) — confirm the setxattr signature before enabling it.
        pass
        # fs.setxattr(**{PARTITIONED_ARTIFACT_METADATA_KEY: extract.json()})
    return extract


# Dry run (write=False): run update_metadata over every (file, metadata) pair
# in metadatas.
for filepath, meta in tqdm(metadatas, desc="Updating metadatas"):
    update_metadata(filepath, meta, write=False)

In [None]:
# Dry-run guard: when True, stop after the first type/day that is processed
# instead of walking the whole bucket. This replaces the unconditional
# `raise StopIteration`, which made the cell always end in an error.
STOP_AFTER_FIRST_DAY = True

typ_pbar = tqdm(["service_alerts", "vehicle_positions", "trip_updates"])
stopped_early = False
for typ in typ_pbar:
    typ_pbar.set_description(typ)
    raw_dts_pbar = tqdm(
        [
            # rstrip guards against listing entries that end with a slash
            fpath.rstrip("/").split("/")[-1]
            for fpath in fs.ls(f"gs://{RT_BUCKET}/{typ}/")
        ],
        leave=False,
    )
    for dt in raw_dts_pbar:
        raw_dts_pbar.set_description(dt)
        # Listing entries look like "dt=<day>"; the helper wants just the day.
        _, dt_str = dt.split("=")
        # Use the loop's own type/day rather than a hard-coded sample.
        files = get_files_type_day(typ=typ, dt=dt_str, leave_pbar=False)
        metadatas, missing = get_metadata(files, leave_pbar=False)
        for filepath, meta in tqdm(metadatas, desc="Updating metadatas", leave=False):
            update_metadata(filepath, meta, write=False)
        if STOP_AFTER_FIRST_DAY:
            stopped_early = True
            break
    if stopped_early:
        break