In [5]:
# do this first and restart kernel
# need version of the GTFSDownloadConfig class that has optional config.extracted_at
%pip install calitp=="2022.9.13a0"
# NOTE(review): pendulum is installed without a version pin, unlike calitp above --
#   consider pinning it so a fresh kernel reproduces the same environment
%pip install pendulum

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
from calitp.storage import get_fs
# shared filesystem handle; the cells below use it to list, stat, open, read xattrs from and move objects
fs = get_fs()

# Classes & config
The classes below are copied from https://github.com/cal-itp/data-infra/blob/airtable-extracted-ts/airflow/dags/download_gtfs_schedule_v2/download_schedule_feeds.py

In [7]:
# configuration
# bucket the existing results files and schedule extracts are read from
OLD_SCHEDULE_RAW_BUCKET = "test-calitp-gtfs-schedule-raw"
# bucket used when building the destination path of each move
SCHEDULE_RAW_BUCKET = "test-calitp-gtfs-schedule-raw-v2"
# suffix that DownloadFeedsResult filenames are validated against
JSONL_EXTENSION = ".jsonl"

In [8]:
import pendulum
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract, ProcessingOutcome, PartitionedGCSArtifact
from typing import Optional, ClassVar, List
from pydantic import validator

class GTFSDownloadOutcome(ProcessingOutcome):
    """Outcome of one feed download attempt: the config used and, optionally, the extract.

    `extract` is Optional, so an outcome can be built without one.
    NOTE(review): presumably failed downloads carry no extract -- confirm against
    the code that produces these rows.
    """

    # download configuration this outcome refers to
    config: GTFSDownloadConfig
    # extract metadata; may be absent
    extract: Optional[GTFSScheduleFeedExtract]

class DownloadFeedsResult(PartitionedGCSArtifact):
    """Results artifact holding the outcomes of one batch of feed downloads.

    The class-level settings point at the old raw bucket and the
    "download_schedule_feed_results" table, with "dt" and "ts" as partition names.
    NOTE(review): how PartitionedGCSArtifact turns these into an object path is
    not visible in this notebook -- see calitp.storage.
    """

    bucket: ClassVar[str] = OLD_SCHEDULE_RAW_BUCKET
    table: ClassVar[str] = "download_schedule_feed_results"
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]

    @validator("filename", allow_reuse=True)
    def is_jsonl(cls, v):
        """Reject filenames that do not end with JSONL_EXTENSION.

        Raises:
            ValueError: if the filename has a different suffix (pydantic reports
                this as a validation error on the model).
        """
        # Raise explicitly rather than `assert`: asserts are stripped when Python
        # runs with -O, which would silently disable this check.
        if not v.endswith(JSONL_EXTENSION):
            raise ValueError(f"filename must end with {JSONL_EXTENSION}; got {v!r}")
        return v

    @property
    def dt(self) -> pendulum.Date:
        """Date part of `ts`; matches the "dt" entry in `partition_names`."""
        return self.ts.date()

    @property
    def successes(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose `success` flag is truthy."""
        return [outcome for outcome in self.outcomes if outcome.success]

    @property
    def failures(self) -> List[GTFSDownloadOutcome]:
        """Outcomes whose `success` flag is falsy."""
        return [outcome for outcome in self.outcomes if not outcome.success]

    # TODO: I dislike having to exclude the records here
    #   I need to figure out the best way to have a single type represent the "metadata" of
    #   the content as well as the content itself
    def save(self, fs):
        """Serialize the outcomes as newline-delimited JSON and hand them to `save_content`.

        Args:
            fs: filesystem object forwarded to `save_content`.
        """
        self.save_content(
            fs=fs,
            # one JSON document per outcome, one per line
            content="\n".join(o.json() for o in self.outcomes).encode(),
            # the outcomes are the content itself, so they are excluded here (see TODO above)
            exclude={"outcomes"},
        )

# Schedule

In [9]:
# raw data
from tqdm.notebook import tqdm

# Walk everything under the results table, then keep only real objects:
# entries that stat as a directory are dropped.
results_files = [
    candidate
    for candidate in fs.expand_path(
        f'gs://{OLD_SCHEDULE_RAW_BUCKET}/download_schedule_feed_results/',
        recursive=True,
    )
    if fs.stat(candidate)["type"] != "directory"
]
len(results_files)

27

In [17]:
# TODO: use the actual calitp type -- requires pip installing newest calitp version
# construct an extract with the new config type from the existing metadata

# make a dict mapping ts+airtable record to b64url -- use this to link extract with outcome to populate config.url

# schedule outcomes files

import base64
import pendulum
import json
from datetime import datetime
from calitp.storage import GTFSDownloadConfig, GTFSScheduleFeedExtract

# extract metadata keys copied over unchanged when converting an old-format row
SHARED_EXTRACT_KEYS = ["filename", "ts", "response_code", "response_headers"]


def _convert_old_config(old_config):
    """Build a new-style download config dict from an old-style config dict.

    Args:
        old_config: dict of old-format config fields; None is treated as empty.

    Returns:
        dict with the keys extracted_at, name, auth_query_params, auth_headers,
        feed_type, url and schedule_url_for_validation.
    """
    old_config = old_config or {}

    query_param_name = old_config.get("authorization_url_parameter_name")
    header_name = old_config.get("authorization_header_parameter_name")

    return {
        # None rather than "": the pinned calitp version makes extracted_at optional,
        # and an empty string is not a datetime
        "extracted_at": None,
        "name": old_config.get("name"),
        "auth_query_params": (
            {query_param_name: old_config.get("url_secret_key_name")} if query_param_name else {}
        ),
        "auth_headers": (
            {header_name: old_config.get("header_secret_key_name")} if header_name else {}
        ),
        # what to do with these
        # or should new_config["url"] = reverse engineered from b64 url?
        # yes use b64
        "feed_type": old_config.get("data"),
        "url": old_config.get("uri"),
        "schedule_url_for_validation": None,
    }


# old path, new path, new metadata
moves = []

# just a list of results objects
results_to_save = []

for results_file in tqdm(results_files):
    with fs.open(results_file) as f:
        content = f.read()

    # load outcomes rows just as json, converting exception strings to exceptions since this is required in both versions
    outcomes_json = []
    for row in content.decode().splitlines():
        if not row.strip():
            # tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
            continue
        content_json = json.loads(row)
        if content_json.get("exception"):
            content_json["exception"] = Exception(content_json["exception"])
        outcomes_json.append(content_json)

    outcomes = []

    # attempt to load with current version of outcomes
    # this will only work for a few days in September that use the new format
    # will not work in prod
    try:
        for outcome_json in outcomes_json:
            outcomes.append(GTFSDownloadOutcome(**outcome_json))
        input_results = DownloadFeedsResult(
            outcomes=outcomes,
            **json.loads(fs.getxattr(path=f"gs://{results_file}", attr="PARTITIONED_ARTIFACT_METADATA")),
        )
        results_to_save.append(input_results)

        for outcome in tqdm(outcomes):
            if not outcome.success:
                continue

            extract = outcome.extract
            ts_string = extract.ts.to_iso8601_string()
            dt_string = extract.ts.to_date_string()
            base64_url = base64.urlsafe_b64encode(outcome.config.url.encode()).decode()

            # the extract may sit under either partition order in the old bucket
            old_partitions_old_bucket_path = f"gs://{OLD_SCHEDULE_RAW_BUCKET}/schedule/dt={dt_string}/base64_url={base64_url}/ts={ts_string}/{extract.filename}"
            new_partitions_old_bucket_path = f"gs://{OLD_SCHEDULE_RAW_BUCKET}/schedule/dt={dt_string}/ts={ts_string}/base64_url={base64_url}/{extract.filename}"
            new_path = f"gs://{SCHEDULE_RAW_BUCKET}/schedule/dt={dt_string}/ts={ts_string}/base64_url={base64_url}/{extract.filename}"
            if fs.exists(old_partitions_old_bucket_path):
                moves.append((old_partitions_old_bucket_path, new_path, extract))
            if fs.exists(new_partitions_old_bucket_path):
                moves.append((new_partitions_old_bucket_path, new_path, extract))
    except Exception as e:
        # Report why the new-format load failed. `outcome` is deliberately not printed:
        # it is unbound when the failure happens before the loop over outcomes starts.
        print(results_file)
        print(e)

        # this is the loop we will use in prod, where nothing has the new config stuff
        converted_outcomes = []
        try:
            for outcome_json in outcomes_json:
                # handle config specific stuff
                new_config = _convert_old_config(outcome_json.get("config"))

                # overall metadata
                # NOTE(review): assumes an old-format row keeps its extract metadata under
                #   "extract" -- confirm against a real old-format results file
                old_metadata = outcome_json.get("extract")
                if old_metadata:
                    new_metadata = {key: old_metadata.get(key) for key in SHARED_EXTRACT_KEYS}
                    new_metadata["config"] = new_config
                else:
                    new_metadata = None

                new_outcome = GTFSDownloadOutcome(
                    **{**outcome_json, "config": new_config, "extract": new_metadata}
                )
                converted_outcomes.append(new_outcome)
            # TODO: build a DownloadFeedsResult and the moves from converted_outcomes
            #   once the old-format layout above has been confirmed
        except Exception as conversion_error:
            print(f"could not convert old-format outcomes in {results_file}: {conversion_error!r}")
        



  0%|          | 0/211 [00:00<?, ?it/s]

In [18]:
# inspect the planned moves; each entry is an (old path, new path, metadata) tuple
moves

[('gs://test-calitp-gtfs-schedule-raw/schedule/dt=2022-09-13/ts=2022-09-13T18:56:49.046139+00:00/base64_url=aHR0cDovL2V0YS5nZXRidXMub3JnL3J0dC9wdWJsaWMvdXRpbGl0eS9ndGZzLmFzcHg=/google_transit.zip',
  'gs://test-calitp-gtfs-schedule-raw-v2/schedule/dt=2022-09-13/ts=2022-09-13T18:56:49.046139+00:00/base64_url=aHR0cDovL2V0YS5nZXRidXMub3JnL3J0dC9wdWJsaWMvdXRpbGl0eS9ndGZzLmFzcHg=/google_transit.zip',
  GTFSScheduleFeedExtract(filename='google_transit.zip', ts=DateTime(2022, 9, 13, 18, 56, 49, 46139, tzinfo=Timezone('+00:00')), config=GTFSDownloadConfig(extracted_at=DateTime(2022, 9, 13, 18, 43, 1, 701540, tzinfo=Timezone('+00:00')), name='GET Schedule', url=HttpUrl('http://eta.getbus.org/rtt/public/utility/gtfs.aspx', scheme='http', host='eta.getbus.org', tld='org', host_type='domain', port='80', path='/rtt/public/utility/gtfs.aspx'), feed_type=<GTFSFeedType.schedule: 'schedule'>, schedule_url_for_validation=None, auth_query_params={}, auth_headers={}), response_code=200, response_headers={

In [None]:
# peek at the first outcome's extract; `input_results` is whatever the loading loop
# above last assigned, so this depends on that cell having been run
input_results.outcomes[0].extract

In [None]:
# For every existing object path, build the destination path (dt / ts / base64url order)
# and a metadata dict using the new config layout, then queue the move.
# NOTE(review): `paths` and `moves` are not defined in this cell -- it relies on earlier
#   cells having been run, and re-running it appends to `moves` again.
for og_path, bucket, table, dt, base64url, ts, filename in paths:
    pdt = pendulum.parse(dt.replace("dt=", ""), exact=True)
    assert isinstance(pdt, pendulum.Date)
    if "ts" in ts:
        pts = pendulum.parse(ts.replace("ts=", ""), exact=True)
        assert isinstance(pts, pendulum.DateTime)
    elif "time" in ts:
        # older partitions only carry a time of day; combine it with the dt partition
        # to rebuild a full timestamp and rename the partition to ts=
        ptime = pendulum.parse(ts.replace("time=", ""), exact=True)
        pts = pendulum.instance(datetime.combine(pdt, ptime))
        ts = f"ts={pts.to_iso8601_string()}"
        assert isinstance(pts, pendulum.DateTime)
    new_path = "/".join([bucket, table, dt, ts, base64url, filename])
    try:
        old_metadata = json.loads(fs.getxattr(og_path, "PARTITIONED_ARTIFACT_METADATA"))

        # handle config specific stuff
        old_config = old_metadata.get("config")
        new_config = {}
        # left null: the calitp version pinned at the top of the notebook makes
        # config.extracted_at optional
        new_config["extracted_at"] = None
        new_config["name"] = old_config.get("name")

        # read the same (lowercase) keys that the guards check; the previous mixed-case
        # lookups never matched and produced {None: None}
        if old_config.get("authorization_url_parameter_name"):
            new_config["auth_query_params"] = {old_config.get("authorization_url_parameter_name"): old_config.get("url_secret_key_name")}
        else:
            new_config["auth_query_params"] = {}
        if old_config.get("authorization_header_parameter_name"):
            new_config["auth_headers"] = {old_config.get("authorization_header_parameter_name"): old_config.get("header_secret_key_name")}
        else:
            new_config["auth_headers"] = {}
        # what to do with these
        # or should new_config["url"] = reverse engineered from b64 url?
        # yes use b64
        # NOTE(review): the outcomes-loading cell writes the old "uri" under "url";
        #   this cell still emits "uri" and "pipeline_url" -- decide which is wanted
        new_config["feed_type"] = old_config.get("data")
        new_config["uri"] = old_config.get("uri")
        new_config["pipeline_url"] = old_config.get("pipeline_url")
        new_config["schedule_url_for_validation"] = None

        #overall metadata
        new_metadata = {}
        for shared_key in ["filename", "ts", "response_code", "response_headers"]:
            new_metadata[shared_key] = old_metadata.get(shared_key)

        new_metadata["config"] = new_config

    except KeyError:
        # a KeyError here means no metadata is recorded for this move; queue it with None
        new_metadata = None
    moves.append((og_path, new_path, new_metadata))
moves[:5]

In [None]:
# Each entry in `moves` is a 3-tuple (old path, new path, metadata); unpacking into only
# two names raised "too many values to unpack". Only the paths are needed for the move.
# NOTE(review): the metadata element is not written anywhere by this loop.
for og_path, new_path, _new_metadata in tqdm(moves):
    fs.mv(og_path, new_path)