# Real historical backfill aka agencies.yml v1 to post-GTFSDownloadConfig v2

In [None]:
import os
from tqdm.auto import tqdm

# Point the raw-schedule bucket setting at the *test* v2 bucket. This is done
# before calitp.storage is imported — presumably because calitp reads the
# variable at import time; TODO confirm.
os.environ["CALITP_BUCKET__GTFS_SCHEDULE_RAW"] = "gs://test-calitp-gtfs-schedule-raw-v2"
from calitp.storage import get_fs
from google.cloud import storage

# Notebook-wide filesystem handle used by the exploratory cells.
fs = get_fs()
# NOTE(review): not referenced anywhere else in the visible notebook;
# presumably the object-metadata key for partitioned artifacts — confirm.
PARTITIONED_ARTIFACT_METADATA_KEY = "PARTITIONED_ARTIFACT_METADATA"
client = storage.Client(project="cal-itp-data-infra")
# Trailing bare expression: shows both handles as the cell output.
fs, client

In [None]:
import pendulum
# Inclusive date window of v1 runs to backfill.
first_date = pendulum.parse("2021-04-15", exact=True)
first_date_v2 = pendulum.parse("2022-07-07", exact=True)

folders = []
for d in fs.ls("gs://gtfs-schedule-backfill-test/schedule/"):
    # Only keep runs with a midnight execution time; the older ones without it
    # are skipped because we probably shouldn't trust them.
    if "T00" in d:
        # The last path component is the run's timestamp.
        ts = pendulum.parse(d.split("/")[-1])
        if first_date <= ts.date() <= first_date_v2:
            folders.append(d)

# Cell output: how many folders were selected, plus the first and last one.
len(folders), folders[0], folders[-1]

In [None]:
# Exploratory: list everything found when walking feed 0_0 of the 2021-04-15
# run in the gtfs-data-test bucket.
list(fs.walk("gs://gtfs-data-test/schedule/2021-04-15T00:00:00+00:00/0_0/"))

In [None]:
# Exploratory: the same kind of walk, for feed 3_4 of the 2021-04-15 run in the
# gtfs-schedule-backfill-test bucket (the bucket the folders list is built from).
list(fs.walk("gs://gtfs-schedule-backfill-test/schedule/2021-04-15T00:00:00+00:00/3_4/"))

In [None]:
# Exploratory: inspect the object metadata of a single file; zip_one_feed later
# reads the 'customTime' entry of this kind of stat result.
fs.stat("gs://gtfs-data-test/schedule/2021-04-15T00:00:00+00:00/0_0/stops.txt")

In [None]:
# Distinct directory listings (as tuples) over every entry of every selected folder.
{
    tuple(fs.ls(feed_dir))
    for folder in tqdm(folders)
    for feed_dir in fs.ls(folder)
}

In [None]:
import base64
from typing import ClassVar, List, Optional

import pendulum
from calitp.storage import (
    GTFSDownloadConfig,
    GTFSFeedType,
    GTFSScheduleFeedExtract,
    PartitionedGCSArtifact,
    ProcessingOutcome,
)
from google.cloud import storage
from pydantic import HttpUrl, parse_obj_as


class GTFSDownloadOutcome(ProcessingOutcome):
    """Outcome of downloading one feed: the config used and, if any, the extract."""

    # Download configuration the outcome belongs to.
    config: GTFSDownloadConfig
    # The resulting extract; zip_one_feed passes None here for rows whose
    # status is not "success".
    extract: Optional[GTFSScheduleFeedExtract]


class DownloadFeedsResult(PartitionedGCSArtifact):
    """Summary artifact for one run: the download outcome of every feed.

    handle_one_folder asserts that the resulting ``path`` starts with
    ``gs://test-calitp-gtfs-schedule-raw-v2/download_schedule_feed_results``.
    How ``bucket``, ``table`` and ``partition_names`` are combined into that
    path is decided by PartitionedGCSArtifact, which is not shown here.
    """

    # Hard-coded to the test v2 bucket.
    bucket: ClassVar[str] = "gs://test-calitp-gtfs-schedule-raw-v2"
    table: ClassVar[str] = "download_schedule_feed_results"
    # Names of the partition attributes; "dt" is the derived property below.
    partition_names: ClassVar[List[str]] = ["dt", "ts"]
    # handle_one_folder fills ts/end with the earliest/latest extract timestamp.
    ts: pendulum.DateTime
    end: pendulum.DateTime
    outcomes: List[GTFSDownloadOutcome]

    @property
    def dt(self) -> pendulum.Date:
        """Return the calendar date of ``ts``."""
        return self.ts.date()

In [None]:
# csv is otherwise only imported in a later cell; import it here so this cell
# does not raise NameError when the notebook is run top to bottom.
import csv

# Peek at the first row of the status.csv stored in the first selected folder.
with fs.open(f"gs://{folders[0]}/status.csv", "r") as f:
    reader = csv.DictReader(f)
    rows = list(reader)
rows[0]

In [None]:
import csv
import re
import json
from pydantic import ValidationError, parse_obj_as
from zipfile import ZipFile
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor, Future, as_completed
import io

# Matches a `name=value` pair (word characters only) that directly follows a
# "?", plus an optional trailing "&". zip_one_feed substitutes matches with ""
# to strip that query parameter from the feed URL.
jinja_pattern = r"(?<=\?)(?:\w+)=\w+&?"

class SkipUrl(Exception):
    """Signal that a feed's URL should be skipped rather than fail the run."""

def zip_one_feed(folder, feed, zip_one_feed_fs=None):
    """Rebuild one feed of a v1 run as an in-memory zip plus v2-style metadata.

    Args:
        folder: Path of one dated v1 run folder; the feed's files are looked up
            under ``{folder}/{itp_id}_{url_number}``.
        feed: One row of that run's ``status.csv``. The keys ``itp_id``,
            ``url_number``, ``gtfs_schedule_url``, ``agency_name`` and
            ``status`` are read.
        zip_one_feed_fs: Filesystem used to walk, stat and read the files. A
            new one is obtained from ``get_fs()`` when omitted.

    Returns:
        A tuple ``(outcome, extract, zipfile_bytes)``. For rows whose status is
        not ``"success"`` this is ``(failed_outcome, None, None)``. Otherwise
        ``zipfile_bytes`` is an ``io.BytesIO`` holding the archive; its stream
        position is left at the end, so read it with ``getvalue()`` or
        ``seek(0)`` first.

    Raises:
        AssertionError: The cleaned URL is empty or still contains ``token`` or
            ``api_key``.
        SkipUrl: The URL fails validation and starts with ``http://.232``.
        ValidationError: The URL fails validation for any other reason.
        RuntimeError: The row says "success" but no files were found.
    """
    zip_one_feed_fs = zip_one_feed_fs or get_fs()
    feed_key = f"{feed['itp_id']}_{feed['url_number']}"
    # Strip the query parameter matched by jinja_pattern; whatever still looks
    # like a credential afterwards is rejected by the assert below.
    url = re.sub(jinja_pattern, "", feed["gtfs_schedule_url"])

    # The message deliberately omits the URL so a leftover secret is not printed.
    assert url and "token" not in url and "api_key" not in url, (
        f"feed {feed_key}: URL is empty or still contains a token/api_key"
    )

    try:
        validated_url = parse_obj_as(HttpUrl, url)
    except ValidationError:
        # Invalid URLs with this prefix are skipped instead of aborting the run.
        if url.startswith("http://.232"):
            raise SkipUrl
        raise

    config = GTFSDownloadConfig(
        extracted_at=None,
        name=feed["agency_name"],  # TODO: CHANGE ME PLEASE
        url=validated_url,
        feed_type=GTFSFeedType.schedule,
        schedule_url_for_validation=None,
        auth_query_params={},
        auth_headers={},
    )

    # Failed v1 downloads have nothing to zip; record the failure and stop.
    if feed['status'] != "success":
        return GTFSDownloadOutcome(
            success=False,
            exception=Exception(feed['status']),
            config=config,
            extract=None,
        ), None, None

    to_walk = f"{folder}/{feed_key}"
    files_to_timestamps = {}

    zipfile_bytes = io.BytesIO()
    with ZipFile(zipfile_bytes, 'w') as zipf:
        for current_dir, sub_dirs, files in zip_one_feed_fs.walk(to_walk):
            if current_dir.endswith("processed"):
                continue

            for file in files:
                file = f"gs://{current_dir}/{file}"
                if file.endswith("validation.json"):
                    continue
                custom_time = zip_one_feed_fs.stat(file)['customTime']
                # Truncate to whole seconds.
                files_to_timestamps[file] = pendulum.parse(custom_time, exact=True).replace(microsecond=0)
                # Entries are stored flat, by base name only.
                zipf.writestr(os.path.basename(file), zip_one_feed_fs.cat(file))

    if not files_to_timestamps:
        # Dump what the walk actually returned to help diagnose the empty feed.
        print(feed_key, to_walk, list(zip_one_feed_fs.walk(to_walk)))
        raise RuntimeError(f"no files found for successful feed {feed_key} under {to_walk}")

    first_ts = min(files_to_timestamps.values())
    last_ts = max(files_to_timestamps.values())

    # Only a warning: a spread of more than 10 minutes between the oldest and
    # newest file is unexpected but not fatal.
    if (last_ts - first_ts).total_seconds() > 600:
        print("got weirdly long extract: ", (last_ts - first_ts))

    # The earliest file timestamp is used as the extract's timestamp.
    extract = GTFSScheduleFeedExtract(
        ts=first_ts,
        config=config,
        response_code=200, # this is somewhat assumed
        filename="reconstructed.zip",
    )

    outcome = GTFSDownloadOutcome(
        success=True,
        exception=None,
        config=config,
        extract=extract,
    )

    return outcome, extract, zipfile_bytes

def handle_one_folder(folder, handle_one_folder_fs=None, pool=None):
    """Convert every feed listed in one v1 run folder and summarise the run.

    Reads ``gs://{folder}/status.csv``, calls ``zip_one_feed`` for each row
    (through ``pool`` when it has more than one worker, otherwise serially) and
    builds a ``DownloadFeedsResult`` from the outcomes. No save or upload call
    is made here; the zipped bytes are only counted.

    Args:
        folder: Path of one dated v1 run folder, without the ``gs://`` scheme.
        handle_one_folder_fs: Filesystem used to read ``status.csv`` and, in
            the serial path, passed on to ``zip_one_feed``. Defaults to a new
            ``get_fs()``.
        pool: Optional executor. When omitted, or when it has a single worker,
            feeds are processed serially in the calling thread.

    Returns:
        ``(result, n)`` where ``result`` is the ``DownloadFeedsResult`` and
        ``n`` is the number of feeds that were processed (not skipped).

    Raises:
        ValueError: No feed in the folder produced an extract, so the run's
            ``ts``/``end`` cannot be derived.
        AssertionError: Row/outcome counts disagree or a computed path is not
            in the test v2 bucket.
        Exception: Anything raised by ``zip_one_feed`` other than ``SkipUrl``.
    """
    fs = handle_one_folder_fs or get_fs()
    outcomes_extracts_bytes = []

    with fs.open(f"gs://{folder}/status.csv", "r") as f:
        rows = list(csv.DictReader(f))

    skipped = 0

    # `pool` is optional in the signature, so it must not be dereferenced when
    # it is None. Executors expose their size only via the private attribute.
    max_workers = pool._max_workers if pool is not None else 1

    print(f"Handling {folder} with {max_workers} threads")
    if max_workers > 1:
        # The context manager closes the bar even if a feed raises.
        with tqdm(total=len(rows), desc=folder) as pbar:
            # No filesystem is passed, so each call obtains its own via get_fs().
            futures = {
                pool.submit(
                    zip_one_feed,
                    folder=folder,
                    feed=feed,
                ): feed
                for feed in rows
            }

            for future in as_completed(futures):
                feed = futures[future]
                pbar.update()
                try:
                    outcomes_extracts_bytes.append(future.result())
                except SkipUrl:
                    print(f"skipped {folder} {feed['gtfs_schedule_url']}")
                    skipped += 1
                except Exception:
                    # Show which row failed before propagating the error.
                    print(feed)
                    raise
    else:
        for feed in rows:
            try:
                outcomes_extracts_bytes.append(zip_one_feed(folder=folder, feed=feed, zip_one_feed_fs=fs))
            except SkipUrl:
                print(f"skipped {folder} {feed}")
                skipped += 1

    outcomes = [tup[0] for tup in outcomes_extracts_bytes]
    # Every row must be accounted for: either an outcome or a deliberate skip.
    assert len(rows) == len(outcomes) + skipped

    extract_timestamps = [outcome.extract.ts for outcome in outcomes if outcome.extract]
    if not extract_timestamps:
        raise ValueError(f"no successful extracts in {folder}; cannot derive ts/end for the result")

    result = DownloadFeedsResult(
        ts=min(extract_timestamps),
        end=max(extract_timestamps),
        outcomes=outcomes,
        filename="results.jsonl",
    )
    # Safety net: everything must resolve to the test v2 bucket.
    assert result.path.startswith("gs://test-calitp-gtfs-schedule-raw-v2/download_schedule_feed_results")
    assert all(extract.path.startswith("gs://test-calitp-gtfs-schedule-raw-v2/schedule") for _, extract, _ in outcomes_extracts_bytes if extract)
    return result, len(outcomes_extracts_bytes)

#result, outcomes_extracts_bytes = handle_one_folder("gtfs-schedule-backfill-test/schedule/2021-04-17T00:00:00+00:00", threads=12)
#result.path, extracts

In [None]:
# Process every selected folder in turn, sharing one 32-worker thread pool and
# one filesystem handle across all of them.
with ThreadPoolExecutor(max_workers=32) as pool:
    fs = get_fs()
    for folder in tqdm(folders):
        # NOTE: despite its name, `extracts` is the count of processed feeds
        # that handle_one_folder returns as its second value.
        result, extracts = handle_one_folder(folder=folder, handle_one_folder_fs=fs, pool=pool)
        print(result.path, extracts)