In [None]:
# Run this cell first, then restart the kernel before running anything below.
# calitp is pinned to the 2022.9.13a0 pre-release because the notebook needs the version of
# the GTFSDownloadConfig class that has an optional config.extracted_at.
%pip install calitp=="2022.9.13a0"
# NOTE(review): pendulum is installed without a version pin -- consider pinning it so the
# notebook stays reproducible.
%pip install pendulum

# Classes & config
The classes below come from https://github.com/cal-itp/data-infra/blob/airtable-extracted-ts/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py

In [None]:
# configuration
import os

# Bucket names used throughout the notebook: the original raw-schedule bucket and its "-v2" sibling.
OLD_SCHEDULE_RAW_BUCKET = "test-calitp-gtfs-schedule-raw"
SCHEDULE_RAW_BUCKET = "test-calitp-gtfs-schedule-raw-v2"
JSONL_EXTENSION = ".jsonl"

# Export the v2 bucket before calitp.storage is imported below.
# NOTE(review): presumably calitp reads this variable at import time -- keep this ordering.
os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"] = SCHEDULE_RAW_BUCKET

from calitp.storage import get_fs

# Shared filesystem handle used by every later cell.
fs = get_fs()

In [None]:
import pendulum
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract, ProcessingOutcome, PartitionedGCSArtifact
from typing import Optional, ClassVar, List
from pydantic import validator

class GTFSDownloadOutcome(ProcessingOutcome):
    """A ProcessingOutcome that pairs a GTFSDownloadConfig with an optional GTFSScheduleFeedExtract."""
    # the download configuration this outcome belongs to
    config: GTFSDownloadConfig
    # the extract tied to this outcome; declared Optional, so it may be None
    extract: Optional[GTFSScheduleFeedExtract]

class DownloadFeedsResult(PartitionedGCSArtifact):
    """Partitioned artifact holding the outcomes of one download run.

    Stored in OLD_SCHEDULE_RAW_BUCKET under the "download_schedule_feed_results"
    table and partitioned by "dt" and "ts"; "dt" is derived from "ts".
    """
    bucket: ClassVar[str] = OLD_SCHEDULE_RAW_BUCKET
    table: ClassVar[str] = "download_schedule_feed_results"
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]

    @validator("filename", allow_reuse=True)
    def is_jsonl(cls, v):
        """Reject any filename that does not end with JSONL_EXTENSION."""
        has_jsonl_suffix = v.endswith(JSONL_EXTENSION)
        assert has_jsonl_suffix
        return v

    @property
    def dt(self) -> pendulum.Date:
        """Date portion of ``ts``."""
        timestamp = self.ts
        return timestamp.date()

    @property
    def successes(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose ``success`` flag is truthy, in their original order."""
        return list(filter(lambda item: item.success, self.outcomes))

    @property
    def failures(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose ``success`` flag is falsy, in their original order."""
        failed = []
        for item in self.outcomes:
            if not item.success:
                failed.append(item)
        return failed

    # TODO: I dislike having to exclude the records here
    #   I need to figure out the best way to have a single type represent the "metadata" of
    #   the content as well as the content itself
    def save(self, fs):
        """Write the outcomes as newline-delimited JSON via ``save_content``.

        ``outcomes`` is passed in ``exclude`` because the records are the content
        being written (see the TODO above).
        """
        serialized_lines = [item.json() for item in self.outcomes]
        payload = "\n".join(serialized_lines).encode()
        self.save_content(fs=fs, content=payload, exclude={"outcomes"})

# Schedule

In [None]:
# raw data
from tqdm.notebook import tqdm


def _list_files(prefix):
    """Return every non-directory path found recursively under ``prefix``."""
    expanded = fs.expand_path(prefix, recursive=True)
    return [entry for entry in expanded if fs.stat(entry)["type"] != "directory"]


def _with_components(paths):
    """Pair each path with its "/"-separated components: (path, part0, part1, ...)."""
    return [(entry, *entry.split("/")) for entry in paths]


# Download-result files first, then the schedule data files, both from the old bucket.
results_files = _list_files(f'gs://{OLD_SCHEDULE_RAW_BUCKET}/download_schedule_feed_results/')
data_files = _list_files(f'gs://{OLD_SCHEDULE_RAW_BUCKET}/schedule/')

results_paths = _with_components(results_files)
data_paths = _with_components(data_files)

In [None]:
# TODO: use the actual calitp type -- requires pip installing newest calitp version 
# construct an extract with the new config type from the existing metadata

# make a dict mapping ts+airtable record to b64url -- use this to link extract with outcome to populate config.url

# schedule outcomes files 

import base64
import pendulum
import json
from datetime import datetime
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract

# Each entry is (old path, new path, new config, new extract): where the file lives
# today, where it should be moved to, and its rebuilt metadata objects.
moves = []

# Maps the airtable record id (old_config["id"]) to the rebuilt (config, extract) pair.
# NOTE(review): keyed by id only, so if one record id shows up under several ts
# partitions the last one processed wins -- confirm that is what is wanted.
airtable_record_mapper = {}

# new results objects
results_to_save = []

for og_path, bucket, table, dt, base64url, ts, filename in data_paths:
    # The old extract metadata is stored as JSON in the PARTITIONED_ARTIFACT_METADATA
    # attribute of the object itself.
    old_extract = json.loads(
        fs.getxattr(path=f"gs://{og_path}", attr="PARTITIONED_ARTIFACT_METADATA")
    )

    # Pull the old-style config out so the remaining keys can be splatted into the new extract.
    old_config = old_extract.pop("config")

    # Auth settings are only carried over when the old config names a parameter/header.
    query_param_name = old_config.get("authorization_url_parameter_name")
    header_name = old_config.get("authorization_header_parameter_name")

    new_config = GTFSDownloadConfig(
        name=old_config.get("name"),
        auth_query_params=(
            {query_param_name: old_config.get("url_secret_key_name")} if query_param_name else {}
        ),
        auth_headers=(
            {header_name: old_config.get("header_secret_key_name")} if header_name else {}
        ),
        feed_type=old_config.get("data"),
        # The feed URL is recovered by decoding the "base64_url=..." path component.
        url=base64.urlsafe_b64decode(base64url.replace("base64_url=", "")).decode(),
        schedule_url_for_validation=None,
    )

    new_extract = GTFSScheduleFeedExtract(config=new_config, **old_extract)

    # The destination path comes from the rebuilt extract. The previous code referenced
    # `new_entity`, a name that is never defined, and raised NameError on the first file.
    moves.append((og_path, new_extract.path, new_config, new_extract))
    airtable_record_mapper[old_config["id"]] = (new_config, new_extract)
    
        



In [None]:
# Move every file from its old location to its new one.
# Entries in `moves` carry extra metadata after the two paths, so the trailing
# items are swallowed with `*_`; unpacking exactly two names from a longer tuple
# raises ValueError.
for og_path, new_path, *_ in tqdm(moves):
    fs.mv(og_path, new_path)