# Real historical backfill: agencies.yml (v1) to post-GTFSDownloadConfig (v2)

In [None]:
import os

from tqdm.auto import tqdm

# Point the raw schedule bucket at the v2 test bucket. This is assigned BEFORE
# the calitp.storage import below; presumably calitp reads the variable at
# import time, so keep this ordering -- TODO confirm.
os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"] = "gs://test-calitp-gtfs-schedule-raw-v2"
from calitp.storage import get_fs
from google.cloud import storage

# Filesystem handle shared by the cells below.
fs = get_fs()
PARTITIONED_ARTIFACT_METADATA_KEY = "PARTITIONED_ARTIFACT_METADATA"
client = storage.Client(project="cal-itp-data-infra")
# Bare expression so the notebook displays both handles.
fs, client

In [None]:
import pendulum

# Backfill window is half-open: first_date <= folder date < first_date_v2.
first_date = pendulum.parse("2021-04-15", exact=True)
first_date_v2 = pendulum.parse("2022-09-15", exact=True)
folders = []
for listing in fs.ls("gs://gtfs-schedule-backfill-test/schedule/"):
    # Only consider folders whose name contains "T00"; some old ones don't
    # have midnight execution times and we probably shouldn't trust them.
    if "T00" in listing:
        # The last path component is the execution timestamp.
        folder_ts = pendulum.parse(listing.split("/")[-1])
        if first_date <= folder_ts.date() < first_date_v2:
            folders.append(listing)
# Notebook display: how many folders were kept, plus the first and last.
len(folders), folders[0], folders[-1]

In [None]:
import base64
from typing import ClassVar, List, Optional

import pendulum
from calitp.storage import (
    GTFSDownloadConfig,
    GTFSFeedType,
    GTFSScheduleFeedExtract,
    PartitionedGCSArtifact,
    ProcessingOutcome,
)
from google.cloud import storage
from pydantic import HttpUrl, parse_obj_as


class GTFSDownloadOutcome(ProcessingOutcome):
    """Outcome of handling one feed: the download config plus its extract, if any."""

    # Download configuration the outcome refers to.
    config: GTFSDownloadConfig
    # The extract produced for this config; None is allowed (zip_one_feed passes
    # extract=None for feeds whose recorded status is not "success").
    extract: Optional[GTFSScheduleFeedExtract]


class DownloadFeedsResult(PartitionedGCSArtifact):
    """Summary artifact for one download run, stored as a JSONL of outcomes.

    The artifact is written to the ``download_schedule_feed_results`` table of
    the raw schedule bucket and is partitioned by ``dt`` and ``ts``.
    """

    # Bucket is read from the environment once, when the class is defined.
    bucket: ClassVar[str] = os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"]
    table: ClassVar[str] = "download_schedule_feed_results"
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]
    backfilled: bool = False

    @property
    def dt(self) -> pendulum.Date:
        """Date partition value, derived from ``ts``."""
        return self.ts.date()

    def save(self, fs):
        """Write the outcomes as newline-delimited JSON and return ``self``.

        ``outcomes`` is passed to ``save_content`` as an excluded field, since
        the outcomes themselves are the saved content.
        """
        jsonl_lines = [outcome.json() for outcome in self.outcomes]
        payload = "\n".join(jsonl_lines).encode()
        self.save_content(fs=fs, content=payload, exclude={"outcomes"})
        return self

In [None]:
import csv
import io
import json
import re
from collections import Counter
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from pprint import pprint
from zipfile import ZipFile

from pydantic import ValidationError, parse_obj_as

# Matches one `key=value` query parameter (word characters only, plus an
# optional trailing `&`) that sits immediately after a `?`. Because of the
# lookbehind, only the first parameter of a query string can match.
jinja_pattern = r"(?<=\?)(?:\w+)=\w+&?"


class SkipUrl(Exception):
    """Signals that a feed URL should be skipped rather than treated as an error."""


def zip_one_feed(folder, feed, zip_one_feed_fs=None, dry_run=True, pbar=None):
    """Rebuild a single feed's zip file from its stored files and describe the outcome.

    Args:
        folder: Path of one execution's folder; the feed's files are looked up
            under ``{folder}/{itp_id}_{url_number}``.
        feed: One row of the folder's status CSV. The keys read here are
            ``itp_id``, ``url_number``, ``gtfs_schedule_url``, ``agency_name``
            and ``status``.
        zip_one_feed_fs: Filesystem used for reading (and saving, when not a
            dry run). Falls back to ``get_fs()`` when falsy.
        dry_run: When True the reconstructed zip is built but not saved.
        pbar: Unused; accepted so callers can pass a progress bar.

    Returns:
        A ``(outcome, extract, zipfile_bytes)`` tuple. ``extract`` and
        ``zipfile_bytes`` are None when the row's status is not ``"success"``.

    Raises:
        AssertionError: If the cleaned URL is empty or still contains
            ``token``/``api_key``, or the extract path has an unexpected shape.
        SkipUrl: If URL validation fails for a URL starting with ``http://.232``.
        ValidationError: If URL validation fails for any other URL.
        RuntimeError: If no files were found for a feed marked successful.
    """
    zip_one_feed_fs = zip_one_feed_fs or get_fs()
    feed_key = f"{feed['itp_id']}_{feed['url_number']}"
    # Drop the leading query parameter (if any) and a dangling "?".
    url = re.sub(jinja_pattern, "", feed["gtfs_schedule_url"]).rstrip("?")

    # Guard against writing anything that looks like a credential into the config.
    assert url and "token" not in url and "api_key" not in url

    try:
        validated_url = parse_obj_as(HttpUrl, url)
    except ValidationError:
        if url.startswith("http://.232"):
            raise SkipUrl
        raise

    config = GTFSDownloadConfig(
        extracted_at=None,
        name=feed["agency_name"],  # TODO: CHANGE ME PLEASE
        url=validated_url,
        feed_type=GTFSFeedType.schedule,
        schedule_url_for_validation=None,
        auth_query_params={},
        auth_headers={},
        computed=True,
    )

    if feed["status"] != "success":
        # Nothing to zip: record the failure with the original status as the exception.
        return (
            GTFSDownloadOutcome(
                success=False,
                exception=Exception(feed["status"]),
                config=config,
                extract=None,
            ),
            None,
            None,
        )

    to_walk = f"{folder}/{feed_key}"
    files_to_timestamps = {}

    bytesio = io.BytesIO()
    with ZipFile(bytesio, "w") as zipf:
        for current_dir, _sub_dirs, files in zip_one_feed_fs.walk(to_walk):
            if current_dir.endswith("processed"):
                continue

            for name in files:
                path = f"gs://{current_dir}/{name}"
                if path.endswith("validation.json"):
                    continue
                # Each file's customTime, normalised to whole seconds in UTC.
                files_to_timestamps[path] = (
                    pendulum.parse(zip_one_feed_fs.stat(path)["customTime"], exact=True)
                    .in_tz("Etc/UTC")
                    .replace(microsecond=0)
                )
                zipf.writestr(os.path.basename(path), zip_one_feed_fs.cat(path))

    if not files_to_timestamps:
        # Carry the diagnostics in the exception so they are not lost among
        # interleaved output when this runs on a worker thread.
        raise RuntimeError(
            f"no files found for feed {feed_key} under {to_walk}: "
            f"{list(zip_one_feed_fs.walk(to_walk))}"
        )
    zipfile_bytes = bytesio.getvalue()

    first_ts = min(files_to_timestamps.values())
    last_ts = max(files_to_timestamps.values())

    if (last_ts - first_ts).total_seconds() > 600:
        print("got weirdly long extract: ", (last_ts - first_ts))

    # The earliest file timestamp stands in for the extract time.
    extract = GTFSScheduleFeedExtract(
        ts=first_ts,
        config=config,
        response_code=200,  # this is somewhat assumed
        filename="reconstructed.zip",
        reconstructed=True,
    )

    assert "+00:00/base64_url" in extract.path
    if not dry_run:
        extract.save_content(fs=zip_one_feed_fs, content=zipfile_bytes)

    outcome = GTFSDownloadOutcome(
        success=True,
        exception=None,
        config=config,
        extract=extract,
    )

    return outcome, extract, zipfile_bytes


def handle_one_folder(
    folder, handle_one_folder_fs=None, pool=None, dry_run=True, top_pbar=None, i=None
):
    """Reconstruct every feed listed in one folder's status.csv and save a result summary.

    Args:
        folder: Path of one execution's folder; ``gs://{folder}/status.csv`` is read.
        handle_one_folder_fs: Filesystem to use; falls back to ``get_fs()`` when falsy.
        pool: Optional executor. Feeds are processed on it when it has more than
            one worker; otherwise (including ``None``) they run sequentially.
        dry_run: When True nothing is saved, neither the per-feed extracts nor
            the result summary.
        top_pbar: Optional outer progress bar used to write the summary line;
            the line is printed when it is None.
        i: Index of this folder, used only in progress and summary text.

    Returns:
        None.

    Raises:
        ValueError: If no feed in the folder produced an extract.
        AssertionError: If outcome counts or generated paths look wrong.
    """
    if dry_run:
        print("DRY RUN")
    fs = handle_one_folder_fs if handle_one_folder_fs else get_fs()

    with fs.open(f"gs://{folder}/status.csv", "r") as f:
        rows = list(csv.DictReader(f))

    # Rows sharing a URL are collapsed; the last row for a URL wins.
    deduplicated = {feed["gtfs_schedule_url"]: feed for feed in rows}

    outcomes_extracts_bytes = []
    skipped = 0

    if pool is not None and pool._max_workers > 1:
        # The context manager closes the bar even if a feed raises.
        with tqdm(total=len(deduplicated), desc=f"{i} {folder}", leave=False) as pbar:
            # The caller-supplied filesystem is passed through unchanged; when
            # it is None each task obtains its own via get_fs().
            futures = {
                pool.submit(
                    zip_one_feed,
                    folder=folder,
                    feed=feed,
                    dry_run=dry_run,
                    pbar=pbar,
                    zip_one_feed_fs=handle_one_folder_fs,
                ): feed
                for feed in deduplicated.values()
            }

            for future in as_completed(futures):
                feed = futures[future]
                pbar.update()
                try:
                    outcomes_extracts_bytes.append(future.result())
                except SkipUrl:
                    skipped += 1
                except Exception:
                    # Show which feed failed before propagating.
                    print(feed)
                    raise
    else:
        for feed in deduplicated.values():
            try:
                outcomes_extracts_bytes.append(
                    zip_one_feed(
                        folder=folder,
                        feed=feed,
                        zip_one_feed_fs=fs,
                        dry_run=dry_run,
                    )
                )
            except SkipUrl:
                print(f"skipped {folder} {feed}")
                skipped += 1

    outcomes = [outcome for outcome, _, _ in outcomes_extracts_bytes]
    assert len(deduplicated) == (len(outcomes) + skipped)

    extract_timestamps = [
        outcome.extract.ts for outcome in outcomes if outcome.extract
    ]
    if not extract_timestamps:
        # min()/max() would raise ValueError anyway; say which folder is at fault.
        raise ValueError(f"no successful extracts found in {folder}")

    result = DownloadFeedsResult(
        ts=min(extract_timestamps),
        end=max(extract_timestamps),
        outcomes=outcomes,
        filename="results.jsonl",
        backfilled=True,
    )

    # Sanity-check the generated paths before anything is written.
    assert result.path.startswith(
        f'{os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"]}/download_schedule_feed_results'
    ) and result.path.endswith("+00:00/results.jsonl")
    assert all(
        extract.path.startswith(
            f'{os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"]}/schedule'
        )
        and "+00:00/base64_url" in extract.path
        for _, extract, _ in outcomes_extracts_bytes
        if extract
    )

    if not dry_run:
        result.save(fs)

    summary = (
        f"i:{i} rows:{len(rows)} dedup:{len(deduplicated)} outs:{len(outcomes)} "
        f"skip:{skipped} result: {result.path}"
    )
    if top_pbar is not None:
        top_pbar.write(summary)
    else:
        print(summary)


# result, outcomes_extracts_bytes = handle_one_folder("gtfs-schedule-backfill-test/schedule/2021-04-17T00:00:00+00:00", threads=12)
# result.path, extracts

In [None]:
# Walk the selected folders one at a time, sharing a single thread pool and
# filesystem handle across all of them.
with ThreadPoolExecutor(max_workers=32) as pool:
    folders_pbar = tqdm(folders)
    shared_kwargs = dict(
        handle_one_folder_fs=fs,
        pool=pool,
        dry_run=False,
        top_pbar=folders_pbar,
    )
    for i, folder in enumerate(folders_pbar):
        try:
            handle_one_folder(folder=folder, i=i, **shared_kwargs)
        except FileNotFoundError:
            # Report the folder and move on to the next one.
            print(f"unable to find status in {folder}")