In [None]:
# Install pendulum into the kernel's environment in case it is missing; this may be unnecessary now.
# %pip (rather than !pip) targets the environment the running kernel uses.
%pip install pendulum

# Classes & config
The classes below come from https://github.com/cal-itp/data-infra/blob/airtable-extracted-ts/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py

This notebook was used to backfill data in the v2 pipeline. It applies a new version of the metadata (a download config rather than an Airtable record) and swaps the order of the base64url and ts partitions. We had changed the download job to use the same ts for all files in a job, and decided to use dt/ts/base64url as the partition order.

In [None]:
# configuration / setup
import os
# Point the raw GTFS schedule bucket variable at the v2 test bucket.
# NOTE(review): presumably this has to be set before calitp.storage is imported below
# so the library picks it up — confirm before reordering these lines.
os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"] = "test-calitp-gtfs-schedule-raw-v2"
# Bucket holding the old-layout data that this notebook reads from.
OLD_SCHEDULE_RAW_BUCKET = "test-calitp-gtfs-schedule-raw"
# Required filename suffix; checked by the DownloadFeedsResult.is_jsonl validator.
JSONL_EXTENSION = ".jsonl"

from calitp.storage import get_fs
# Filesystem handle shared by the cells below.
fs = get_fs()

In [None]:
import pendulum
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract, ProcessingOutcome, PartitionedGCSArtifact
from typing import Optional, ClassVar, List
from pydantic import validator

class GTFSDownloadOutcome(ProcessingOutcome):
    """Outcome of one feed download attempt, expressed with the new download config."""
    # Download configuration the attempt was made with.
    config: GTFSDownloadConfig
    # Extract metadata; the backfill loop in this notebook only supplies it for successful rows.
    extract: Optional[GTFSScheduleFeedExtract]

class DownloadFeedsResult(PartitionedGCSArtifact):
    """Metadata (``ts``, ``end``) plus the per-feed download outcomes of one results file.

    Lives in the ``download_schedule_feed_results`` table and is partitioned
    by ``dt`` and then ``ts``.
    """

    bucket: ClassVar[str] = "test-calitp-gtfs-schedule-raw-v2"
    table: ClassVar[str] = "download_schedule_feed_results"
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]

    @validator("ts")
    def coerce_ts(cls, value):
        """Turn the incoming ``ts`` value into a pendulum instance."""
        coerced = pendulum.instance(value)
        return coerced

    @validator("end")
    def coerce_end(cls, value):
        """Turn the incoming ``end`` value into a pendulum instance."""
        coerced = pendulum.instance(value)
        return coerced

    @validator("filename", allow_reuse=True)
    def is_jsonl(cls, value):
        """Reject filenames that do not end with ``JSONL_EXTENSION``."""
        assert value.endswith(JSONL_EXTENSION)
        return value

    @property
    def dt(self) -> pendulum.Date:
        """Date partition value, taken from ``ts``."""
        timestamp = self.ts
        return timestamp.date()

    @property
    def successes(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose ``success`` flag is truthy, in their original order."""
        succeeded = []
        for candidate in self.outcomes:
            if candidate.success:
                succeeded.append(candidate)
        return succeeded

    @property
    def failures(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose ``success`` flag is falsy, in their original order."""
        failed = []
        for candidate in self.outcomes:
            if not candidate.success:
                failed.append(candidate)
        return failed

    # TODO: I dislike having to exclude the records here
    #   I need to figure out the best way to have a single type represent the "metadata" of
    #   the content as well as the content itself
    def save(self, fs):
        """Write the outcomes as newline-delimited JSON, excluding them from the saved metadata."""
        serialized_rows = [outcome.json() for outcome in self.outcomes]
        payload = "\n".join(serialized_rows).encode()
        self.save_content(fs=fs, content=payload, exclude={"outcomes"})
# Sanity check: rebuild a DownloadFeedsResult (with no outcomes) from the metadata
# stored on one existing results file, to confirm the class accepts that metadata.
# json is imported here because this cell runs before the later cell that imports it;
# without it a fresh kernel raises NameError.
import json

d = json.loads(fs.getxattr(path=f"gs://{OLD_SCHEDULE_RAW_BUCKET}/download_schedule_feed_results/dt=2022-09-01/ts=2022-09-01T00:00:26.548709+00:00/results.jsonl", attr="PARTITIONED_ARTIFACT_METADATA"))
DownloadFeedsResult(outcomes=[], **d)

# Schedule

In [None]:
# raw data
from tqdm.notebook import tqdm
def _expand_to_files(prefix):
    """Recursively expand ``prefix`` and keep only entries whose stat type is not a directory."""
    expanded = fs.expand_path(prefix, recursive=True)
    return [entry for entry in expanded if fs.stat(entry)["type"] != "directory"]


# Old-layout results files and raw schedule files, listed in that order.
results_files = _expand_to_files(f'gs://{OLD_SCHEDULE_RAW_BUCKET}/download_schedule_feed_results/')
data_files = _expand_to_files(f'gs://{OLD_SCHEDULE_RAW_BUCKET}/schedule/')

# Pair each results path with its "/"-separated components for unpacking later.
results_paths = [tuple([entry, *entry.split("/")]) for entry in results_files]
results_paths[0]

In [None]:
import base64
import pendulum
import json
from datetime import datetime
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract

def _build_download_config(old_config: dict) -> GTFSDownloadConfig:
    """Translate an old-style config dict into a GTFSDownloadConfig.

    The auth mappings are only populated when the corresponding parameter-name
    key is present and truthy; otherwise they are empty dicts. Raises KeyError
    if ``old_config`` has no "uri" key, plus whatever GTFSDownloadConfig raises
    on invalid input.
    """
    query_param_name = old_config.get("authorization_url_parameter_name")
    header_name = old_config.get("authorization_header_parameter_name")
    return GTFSDownloadConfig(
        name=old_config.get("name"),
        auth_query_params={query_param_name: old_config.get("url_secret_key_name")} if query_param_name else {},
        auth_headers={header_name: old_config.get("header_secret_key_name")} if header_name else {},
        feed_type=old_config.get("data"),
        url=old_config["uri"],
        schedule_url_for_validation=None,
    )


# record raw files to be moved: (old path, new path, new extract)
moves = []

# record results objects to be saved
results_to_save = []

# invalid records, keyed by the dt of the results file they came from
drops = {}

pbar = tqdm(results_paths)
for og_path, bucket, table, dt, ts, filename in pbar:
    pbar.set_description(f"processing {dt}")

    # checks: the path must be a results file with parseable dt / ts partitions
    assert table == "download_schedule_feed_results"
    pdt = pendulum.parse(dt.replace("dt=", ""), exact=True)
    assert isinstance(pdt, pendulum.Date)
    pts = pendulum.parse(ts.replace("ts=", ""), exact=True)
    assert isinstance(pts, pendulum.DateTime)

    with fs.open(og_path) as f:
        content = f.read()

    # decode once; reused for the row loop and for the final count check
    rows = content.decode().splitlines()

    new_outcomes = []
    new_drops = []

    content_pbar = tqdm(rows, leave=False)
    # load outcomes rows just as json, converting exception strings to exceptions
    for row in content_pbar:
        content_json = json.loads(row)
        content_json["exception"] = Exception(content_json["exception"]) if content_json["exception"] else None

        # if success, we have a file
        if content_json["success"]:
            extract_pts = pendulum.parse(content_json["extract"]["ts"], exact=True)
            assert isinstance(extract_pts, pendulum.DateTime)

            assert content_json["extract"]["config"]["uri"] == content_json["airtable_record"]["uri"], f'extract uri {content_json["extract"]["config"]["uri"]} differs from airtable record uri: {content_json["airtable_record"]["uri"]}'

            # split the old extract into its config and the remaining extract fields
            old_config = content_json["extract"].pop("config")
            old_extract = content_json["extract"]
            base64url = base64.urlsafe_b64encode(old_config["uri"].encode()).decode()
            old_extract_path = f'gs://{OLD_SCHEDULE_RAW_BUCKET}/schedule/dt={extract_pts.to_date_string()}/base64_url={base64url}/ts={extract_pts.to_iso8601_string()}/{old_extract["filename"]}'

            assert fs.exists(old_extract_path), f"error: {old_extract_path} does not exist; mismatch between outcomes and actual files"

            # errors here propagate on purpose: a successful row must convert cleanly
            new_config = _build_download_config(old_config)
            new_extract = GTFSScheduleFeedExtract(config=new_config, **old_extract)
            new_outcome = GTFSDownloadOutcome(
                success=content_json["success"],
                exception=content_json["exception"],
                config=new_config,
                extract=new_extract,
            )

            new_outcomes.append(new_outcome)
            moves.append((old_extract_path, f'gs://{new_extract.path}', new_extract))

        else:
            old_config = content_json["airtable_record"]

            # failed downloads may carry configs that no longer validate;
            # those rows are recorded as drops instead of aborting the backfill
            try:
                new_config = _build_download_config(old_config)
                new_outcome = GTFSDownloadOutcome(
                    success=content_json["success"],
                    exception=content_json["exception"],
                    config=new_config,
                )
            except Exception:
                new_drops.append(content_json)
            else:
                new_outcomes.append(new_outcome)

    # every input row must end up as either an outcome or a drop
    len_outcomes = len(new_outcomes)
    len_drops = len(new_drops)
    len_content = len(rows)
    assert len_outcomes + len_drops == len_content, f"got {len_outcomes} outcomes and {len_drops} drops from {len_content} input records"
    new_results = DownloadFeedsResult(outcomes=new_outcomes, **json.loads(fs.getxattr(path=f"gs://{og_path}", attr="PARTITIONED_ARTIFACT_METADATA")))
    if len_drops:
        # accumulate rather than assign: several results files can share one dt,
        # and plain assignment would discard the earlier file's drops
        drops.setdefault(pdt, []).extend(new_drops)
    results_to_save.append(new_results)

# every raw data file should be accounted for by exactly one planned move
# (the original compared len_data_files with itself, so it could never fail)
len_moves = len(moves)
len_data_files = len(data_files)
assert len_moves == len_data_files, f"got {len_moves} from {len_data_files}"

In [None]:
# Peek at the first two planned moves; each entry is (old path, new path, new extract).
moves[0:2]

In [None]:
# Spot-check the first rebuilt results object: its ts and how many outcomes it holds.
results_to_save[0].ts, len(results_to_save[0].outcomes)

In [None]:
# Totals: number of planned file moves and number of results objects to save.
len(moves), len(results_to_save)

In [None]:
# Copy each raw extract to its new path and attach the new extract metadata.
# The copy is kept behind an explicit switch, off by default, so re-running the
# notebook cannot repeat it by accident. The original loop body was only comments,
# which is a syntax error, so the cell could not run at all.
RUN_MOVES = False

for og_path, new_path, new_extract in tqdm(moves):
    if not RUN_MOVES:
        continue
    fs.cp(og_path, new_path)
    fs.setxattrs(path=new_path, PARTITIONED_ARTIFACT_METADATA=new_extract.json())


In [None]:
# Persist every rebuilt results object through its own save() method.
for feeds_result in tqdm(results_to_save):
    feeds_result.save(fs)