# Real historical backfill aka agencies.yml v1 to post-GTFSDownloadConfig v2

In [None]:
import os

# Point calitp at the test v2 raw-schedule bucket. The variable is set before
# calitp.storage is imported -- presumably the library reads it at import time
# (TODO confirm), so keep this ordering.
os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"] = "gs://test-calitp-gtfs-schedule-raw-v2"
from calitp.storage import get_fs
from google.cloud import storage

# Filesystem handle used by the later cells to list and open gs:// paths.
fs = get_fs()
# Blob-metadata key under which the serialized extract/result JSON is stored
# (read back via blob.metadata[...] in the backfill function).
PARTITIONED_ARTIFACT_METADATA_KEY = "PARTITIONED_ARTIFACT_METADATA"
client = storage.Client(project="cal-itp-data-infra")
# Notebook display of both handles.
fs, client

In [None]:
# Collect the v1 schedule folders whose timestamp falls inside the backfill window.
# pendulum is imported here so this cell does not rely on a later cell having
# already been executed.
import pendulum

# exact=True is passed so the bounds can be compared against ts.date() below.
first_date = pendulum.parse("2021-04-15", exact=True)
first_date_v2 = pendulum.parse("2022-07-07", exact=True)
dirs = []
for d in fs.ls("gs://gtfs-data-test/schedule/"):
    if "T00" not in d:
        # skip some old ones that don't have midnight execution times, we probably shouldn't trust them?
        continue
    # The last path component is parsed as the folder's timestamp.
    ts = pendulum.parse(d.split("/")[-1])
    # Both ends of the window are inclusive.
    if first_date <= ts.date() <= first_date_v2:
        dirs.append(d)
# Notebook display: how many folders were selected, plus the first and last one.
len(dirs), dirs[0], dirs[-1]

In [None]:
import base64
from typing import ClassVar, List, Optional

import pendulum
from calitp.storage import (
    GTFSDownloadConfig,
    GTFSFeedType,
    GTFSScheduleFeedExtract,
    PartitionedGCSArtifact,
    ProcessingOutcome,
)
from google.cloud import storage
from pydantic import HttpUrl, parse_obj_as
from tqdm.auto import tqdm


class GTFSDownloadOutcome(ProcessingOutcome):
    """Outcome of one feed download: the config used and, when present, the extract."""

    # Download configuration this outcome refers to.
    config: GTFSDownloadConfig
    # Extract produced by the download; None when no file is available
    # (failed download, or blob missing during the backfill).
    extract: Optional[GTFSScheduleFeedExtract]


class DownloadFeedsResult(PartitionedGCSArtifact):
    """Result artifact for one download run: ``ts``/``end`` timestamps plus per-feed outcomes."""

    # Destination bucket, table and partition names for the artifact.
    # NOTE(review): how these are turned into a path is defined by
    # PartitionedGCSArtifact, which is not visible here.
    bucket: ClassVar[str] = "gs://test-calitp-gtfs-schedule-raw-v2"
    table: ClassVar[str] = "download_schedule_feed_results"
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]

    @property
    def dt(self) -> pendulum.Date:
        """Return the date portion of ``ts`` (matches the "dt" entry in ``partition_names``)."""
        return self.ts.date()

In [None]:
# Peek at the first selected folder's status.csv to see what each row contains.
# csv is imported here so this cell does not rely on a later cell having
# already been executed.
import csv

with fs.open(f"gs://{dirs[0]}/status.csv", "r") as f:
    reader = csv.DictReader(f)
    rows = list(reader)
# Notebook display of the first row (a dict keyed by the CSV header).
rows[0]

In [None]:
import csv
import re
from pydantic import ValidationError

# Matches a `name=value` query parameter (word characters only) that directly
# follows a `?`, plus an optional trailing `&`; handle_one_folder removes the
# match from each feed URL with re.sub.
# NOTE(review): because of the lookbehind, parameters that follow an `&` are
# not matched and stay in the URL.
jinja_pattern = r"(?<=\?)(?:\w+)=\w+&?"


def handle_one_folder(folder):
    """Build download outcomes for one v1 schedule folder from its ``status.csv``.

    Each CSV row is turned into a ``GTFSDownloadConfig``; rows whose ``status``
    is not ``"success"`` additionally yield a failed ``GTFSDownloadOutcome``.
    Rows with a ``"success"`` status currently produce no outcome.

    Args:
        folder: Folder path without the ``gs://`` scheme, containing ``status.csv``.

    Returns:
        A ``(outcomes, zips)`` tuple. ``zips`` is never populated by this
        function and is always returned empty.

    Raises:
        AssertionError: If the cleaned URL is empty or still contains
            ``token`` / ``api_key``.
        ValidationError: If the cleaned URL is not a valid ``HttpUrl`` and is
            not the known-bad ``http://.232`` entry.
    """
    with fs.open(f"gs://{folder}/status.csv", "r") as status_file:
        status_rows = list(csv.DictReader(status_file))

    failed_outcomes = []
    archives = []

    for row in status_rows:
        # Drop the query parameter matched by jinja_pattern before validating.
        cleaned_url = re.sub(jinja_pattern, "", row["gtfs_schedule_url"])

        # Guard against writing credentials into the new configs.
        assert cleaned_url and "token" not in cleaned_url and "api_key" not in cleaned_url

        try:
            http_url = parse_obj_as(HttpUrl, cleaned_url)
        except ValidationError:
            # Only the known-bad "http://.232..." entry may be skipped silently.
            if not cleaned_url.startswith("http://.232"):
                raise
            continue

        download_config = GTFSDownloadConfig(
            extracted_at=None,
            name=row["agency_name"],  # TODO: CHANGE ME PLEASE
            url=http_url,
            feed_type=GTFSFeedType.schedule,
            schedule_url_for_validation=None,
            auth_query_params={},
            auth_headers={},
        )

        if row["status"] == "success":
            # Nothing is recorded for successful rows yet.
            continue

        failure = GTFSDownloadOutcome(
            success=False,
            exception=Exception(row["status"]),
            config=download_config,
            extract=None,
        )
        failed_outcomes.append(failure)

    return failed_outcomes, archives


# Run handle_one_folder over every selected folder with a progress bar.
# The returned (outcomes, zips) tuples are discarded, so this pass only
# surfaces exceptions (failed assertions, unexpected invalid URLs).
for folder in tqdm(dirs):
    handle_one_folder(folder)

In [None]:
def handle_one_results_blob(results_blob, dry_run=True):
    """Convert one old-layout v2 results blob into a ``DownloadFeedsResult``.

    The blob is read as one JSON document per line. For every old outcome a
    new ``GTFSDownloadConfig`` is built from its Airtable record; outcomes
    that have an extract whose blob still exists also get a new
    ``GTFSScheduleFeedExtract`` and are queued for copying.

    Args:
        results_blob: Blob object exposing ``bucket``, ``name`` and
            ``metadata`` (as used below) for the old results file.
        dry_run: Currently has no effect; nothing is written by this function
            either way.

    Returns:
        A ``(new_result, to_copy)`` tuple where ``to_copy`` is a list of
        ``(source gs:// URI, new extract)`` pairs.

    Raises:
        AssertionError: If an old blob's stored metadata does not equal the
            serialized old extract, or if the outcome/copy counts are
            inconsistent.
    """
    # Imported locally: no cell shown in this notebook imports json.
    import json

    to_copy = []
    new_outcomes = []
    with fs.open(f"gs://{results_blob.bucket.name}/{results_blob.name}") as f:
        # One JSON document per line.
        old_outcomes = [json.loads(line) for line in f]
    for result in tqdm(old_outcomes, desc=results_blob.name):
        old_extract = result["extract"]
        old_airtable_record = result["airtable_record"]

        # we always need to be able to get a download config, even if there's no "real" file underlying it
        new_config = GTFSDownloadConfig(
            extracted_at=None,
            name=old_airtable_record["name"],
            url=parse_obj_as(HttpUrl, old_airtable_record["pipeline_url"]),
            feed_type=GTFSFeedType.schedule,
            schedule_url_for_validation=None,
            auth_query_params={},
            auth_headers={},
        )

        # if we had a failure, there is no extract to copy
        if not old_extract:
            new_outcomes.append(
                GTFSDownloadOutcome(
                    success=result["success"],
                    exception=Exception(result["exception"]),
                    config=new_config,
                    extract=None,
                )
            )
            continue

        # if we were successful, we should have a blob stored under the old key layout
        ts = pendulum.parse(old_extract["ts"])
        dt = ts.date()
        base64_url = base64.urlsafe_b64encode(
            old_airtable_record["pipeline_url"].encode()
        ).decode()

        # the old v2 files have url then ts, but we will be swapping them
        old_blob_key = (
            f"schedule/dt={dt.to_date_string()}/base64_url={base64_url}"
            f"/ts={ts.to_iso8601_string()}/{old_extract['filename']}"
        )
        old_blob = results_blob.bucket.get_blob(old_blob_key)
        if old_blob is None:
            # The outcome is kept (so counts still line up) but marked failed.
            new_outcomes.append(
                GTFSDownloadOutcome(
                    success=False,
                    exception=Exception("blob missing during backfill operation"),
                    config=new_config,
                    extract=None,
                )
            )
            continue

        # Sanity check: the metadata stored on the blob must be exactly the
        # extract recorded in the results file.
        assert old_blob.metadata[PARTITIONED_ARTIFACT_METADATA_KEY] == json.dumps(
            old_extract
        )

        new_extract = GTFSScheduleFeedExtract(
            filename=old_extract["filename"],
            config=new_config,
            response_code=old_extract["response_code"],
            response_headers=old_extract["response_headers"],
            ts=old_extract["ts"],
        )

        to_copy.append((f"gs://{old_blob.bucket.name}/{old_blob.name}", new_extract))
        new_outcomes.append(
            GTFSDownloadOutcome(
                success=result["success"],
                exception=result["exception"],
                config=new_config,
                extract=new_extract,
            )
        )

    # only copy results once successful
    old_metadata = json.loads(results_blob.metadata[PARTITIONED_ARTIFACT_METADATA_KEY])
    new_result = DownloadFeedsResult(
        ts=pendulum.parse(old_metadata["ts"]),
        end=pendulum.parse(old_metadata["end"]),
        outcomes=new_outcomes,
        filename=old_metadata["filename"],
    )
    # Every old outcome must map to exactly one new outcome, and every
    # successful new outcome must have a file queued for copying.
    assert len(old_outcomes) == len(new_outcomes)
    assert len(to_copy) == sum(1 for outcome in new_result.outcomes if outcome.success)
    if not dry_run:
        # Placeholder: no write is performed here yet.
        pass
    return new_result, to_copy


# Convert every old results blob, accumulating the new result artifacts and the
# (source URI, new extract) pairs that would need to be copied.
# NOTE(review): old_v2_outcomes is not defined in the cells shown here -- it
# must be created elsewhere before this runs.
results_to_save = []
to_copies = []

for results_blob in tqdm(old_v2_outcomes):
    new_result, to_copy = handle_one_results_blob(results_blob=results_blob)
    results_to_save.append(new_result)
    to_copies.extend(to_copy)
# Notebook display of how many results and copies were collected.
len(results_to_save), len(to_copies)

# Dry-run pass over the pending copies: check the source and destination
# prefixes and build the arguments for the copy and metadata calls. The calls
# themselves are left commented out, so nothing is written.
for src, dst in tqdm(to_copies):
    assert src.startswith("gs://test-calitp-gtfs-schedule-raw/schedule/")
    assert dst.path.startswith("gs://test-calitp-gtfs-schedule-raw-v2/schedule/")
    cp_args = (src, dst.path)
    # The new extract's JSON would be attached under the partitioned-artifact metadata key.
    setxattr_kwargs = {"path": dst.path, PARTITIONED_ARTIFACT_METADATA_KEY: dst.json()}
    # print(cp_args)
    # print(setxattr_kwargs)
    # fs.cp(*cp_args)
    # fs.setxattr(**setxattr_kwargs)
    # break

# Dry-run pass over the new result artifacts: check that each one targets the
# v2 results table and print its path. The save call is left commented out.
for result in results_to_save:
    assert result.path.startswith(
        "gs://test-calitp-gtfs-schedule-raw-v2/download_schedule_feed_results/"
    )
    print(result.path)
    # result.save(fs)