# Old script from Conveyal for reference!

* aka the hard way

In [None]:
import os
import pandas
import requests
import yaml
import math
from zipfile import ZipFile

# Download composite GTFS files from data.ca.gov (note: schema has since changed)
# Base URL shared by every resource below; each value in sourceFiles is appended to it.
prefix = 'https://data.ca.gov/dataset/de6f1544-b162-4d16-997b-c183912c8e62/resource/'

# Table name -> resource path (relative to prefix) of the composite CSV for that table.
sourceFiles = {
    'agency': 'e8f9d49e-2bb6-400b-b01f-28bc2e0e7df2/download/agency.csv',
    'routes': 'c6bbb637-988f-431c-8444-aef7277297f8/download/routes.csv',
    'stops': '8c876204-e12b-48a2-8299-10f6ae3d4f2b/download/stops.csv',
    'stop_times': 'd31eef2f-e223-4ca4-a86b-170acc6b2590/download/stop_times.csv',
    'trips': '0e4da89e-9330-43f8-8de9-305cb7d4918f/download/trips.csv',
    'calendar': 'a79f10b8-b322-43f3-b3f4-ba46a8dbe9ab/download/calendar.csv',
    'calendar_dates': '06a21a8e-dba3-4e7e-8726-f2e992cc1a80/download/calendar_dates.csv',
    'feed_info': '50d12559-635e-4222-ac25-3706c066902d/download/feed_info.csv',
    'frequencies': '48542c8f-8ce1-43e3-a965-6c68771d6fe5/download/frequencies.csv',
    'shapes': '2f5e7bdb-33e8-4633-b163-6bab42ad0951/download/shapes.csv',
}

# Formatting issue: data.ca.gov adds dashes to dates.
# Columns listed here have the dashes stripped again before the per-feed files are written.
dateColumns = ['date', 'start_date', 'end_date', 'feed_start_date', 'feed_end_date']
# Columns that are converted back to integers before the per-feed files are written.
intColumns = ['pickup_type', 'drop_off_type', 'timepoint', 'direction_id', 'location_type', 'wheelchair_accessible', 'bikes_allowed', 'exact_times']

# Agencies for which every feed (URL number) is kept, not just feed 0.
multiFeedAgencies = [182] # LA Metro
# Agencies that are skipped entirely.
ignoredAgencies = [200] # MTC regional feed

# TODO skip repeated URLs in agencies.yml file

In [None]:
# Fetch every composite table and store it in the working directory as <table>.txt.
for fileName, resourcePath in sourceFiles.items():
    fileUrl = prefix + resourcePath
    fileLoc = fileName + '.txt'
    response = requests.get(fileUrl)
    # Fail loudly on an HTTP error instead of saving the error page as if it were a CSV table.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed before the next download.
    with open(fileLoc, 'wb') as outFile:
        outFile.write(response.content)
    print('Downloaded source ' + fileName + ' file.')

In [None]:
# Bounding boxes for the regions that can be extracted. The values are compared
# against stop_lat (north/south) and stop_lon (east/west) when selecting feeds.
regionBounds = {
    'northern california': {'north': 42.03909, 'south': 39.07038, 'east': -119.60541, 'west': -124.49158},
    'central california': {'north': 39.64165, 'south': 35.87347, 'east': -117.53174, 'west': -123.83789},
    'southern california': {'north': 35.8935, 'south': 32.5005, 'east': -114.13121, 'west': -121.46759},
    'mojave': {'north': 37.81629, 'south': 34.89945, 'east': -114.59015, 'west': -118.38043},
}

# Choose the region to extract by name. Previously all four sets of bounds were
# assigned one after another, so only the last one (mojave) ever took effect;
# 'mojave' is kept as the selection so the resulting bounds are unchanged.
region = 'mojave'

north = regionBounds[region]['north']
south = regionBounds[region]['south']
east = regionBounds[region]['east']
west = regionBounds[region]['west']

In [None]:
# Work out which feeds have at least one stop inside the selected bounding box,
# then create one output directory per feed.
stops = pandas.read_csv('stops.txt')

# A feed is eligible when its agency is not ignored and it is either feed 0 of
# the agency or belongs to an agency for which every feed is kept.
eligibleFeed = ~stops['calitp_itp_id'].isin(ignoredAgencies) & (
    (stops['calitp_url_number'] == 0) | stops['calitp_itp_id'].isin(multiFeedAgencies)
)

# Strict inequalities: a stop exactly on the boundary does not count as inside.
insideBounds = (
    (stops['stop_lat'] > south) & (stops['stop_lat'] < north)
    & (stops['stop_lon'] > west) & (stops['stop_lon'] < east)
)

# Set of (calitp_itp_id, calitp_url_number) tuples, one per feed to extract.
keysToTrack = set(
    stops.loc[eligibleFeed & insideBounds, ['calitp_itp_id', 'calitp_url_number']]
    .itertuples(index=False, name=None)
)

for key in keysToTrack:
    stringKey = str(key[0]) + '_' + str(key[1])
    # makedirs also creates the parent 'out' directory on a fresh checkout and
    # tolerates directories left over from a previous run.
    os.makedirs('out/' + stringKey, exist_ok=True)

In [None]:
# Split every composite table into one <table>.txt per tracked feed under out/<id>_<urlNumber>/.
for file in sourceFiles:
    print('Processing ' + file + ' file.')
    df1 = pandas.read_csv(file + '.txt')

    # Clean the whole table once; the fixes do not depend on the feed, so there is
    # no need to repeat them for every slice.
    for col in dateColumns:
        # Only text columns can contain dashes (and only they have the .str accessor).
        # A date column that pandas parsed as numeric, e.g. because it is already
        # dash-free or completely empty, is left as it is.
        if col in df1.columns and not pandas.api.types.is_numeric_dtype(df1[col]):
            df1[col] = df1[col].str.replace('-', '', regex=False)
    for col in intColumns:
        if col in df1.columns:
            # Nullable integers write 1 instead of 1.0 while keeping blanks blank.
            # Filling blanks with 0 would invent data for optional GTFS fields:
            # e.g. an empty timepoint means exact times, whereas 0 means approximate.
            df1[col] = df1[col].astype('Int64')

    for key in keysToTrack:
        outputFilename = 'out/' + str(key[0]) + '_' + str(key[1]) + '/' + file + '.txt'
        df = df1[(df1['calitp_itp_id'] == key[0]) & (df1['calitp_url_number'] == key[1])]
        # TODO filter out rows with missing id values (e.g. Tuolumne)
        # Tables with no rows for this feed are not written at all.
        if len(df) > 0:
            df.to_csv(outputFilename, index=False)

In [None]:
# Bundle each feed directory into out/<id>_<urlNumber>.zip with the tables at the archive root.
for key in keysToTrack:
    stringKey = str(key[0]) + '_' + str(key[1])
    # Named feedZip rather than zip so the builtin zip() is not overwritten at module scope.
    with ZipFile('out/' + stringKey + '.zip', 'w') as feedZip:
        for file in sourceFiles:
            tablePath = 'out/' + stringKey + '/' + file + '.txt'
            # Tables that had no rows for this feed were never written; leave them out
            # of the archive instead of relying on a swallowed FileNotFoundError.
            if os.path.isfile(tablePath):
                feedZip.write(tablePath, arcname=file + '.txt')

In [None]:
# Check for feeds that don't overlap the target date in calendar.txt or calendar_dates.txt
# Note that Conveyal bundle details show dates from feed_info, which may not reflect service accurately.
targetDate = '2022-11-30'
calitpAgences = {}

# Placeholders meaning "no date seen yet". Dates are compared as strings, and these
# sort after / before every real YYYY-MM-DD value, so the first real date replaces them.
noStart = '9999-99-99'
noEnd = '0000-00-00'

with open('agencies.yml', 'r') as file:
    agencies = yaml.safe_load(file)

# Map itp_id -> agency name so the log below can print a readable name.
for agency in agencies:
    calitpAgences[agencies[agency]['itp_id']] = agency

calendar = pandas.read_csv('calendar.txt')
calendarDates = pandas.read_csv('calendar_dates.txt')

# One row per feed key '<itp_id>_<url_number>':
#   c_start / c_end   - earliest start_date / latest end_date found in calendar.txt
#   cd_start / cd_end - earliest / latest date found in calendar_dates.txt
dateRange = pandas.DataFrame(columns = ['key', 'c_start', 'c_end', 'cd_start', 'cd_end'])
dateRange.set_index('key', inplace=True)

for index, row in calendar.iterrows():
    if (row.calitp_itp_id not in ignoredAgencies and (row.calitp_url_number == 0 or row.calitp_itp_id in multiFeedAgencies)):
        key = str(row.calitp_itp_id) + '_' + str(row.calitp_url_number)
        if key in dateRange.index:
            dateRange.at[key, 'c_start'] = min(dateRange.at[key, 'c_start'], row.start_date)
            dateRange.at[key, 'c_end'] = max(dateRange.at[key, 'c_end'], row.end_date)
        else:
            dateRange.at[key, 'c_start'] = row.start_date
            dateRange.at[key, 'c_end'] = row.end_date
            dateRange.at[key, 'cd_start'] = noStart
            dateRange.at[key, 'cd_end'] = noEnd

for index, row in calendarDates.iterrows():
    if (row.calitp_itp_id not in ignoredAgencies and (row.calitp_url_number == 0 or row.calitp_itp_id in multiFeedAgencies)):
        key = str(row.calitp_itp_id) + '_' + str(row.calitp_url_number)
        if key in dateRange.index:
            dateRange.at[key, 'cd_start'] = min(dateRange.at[key, 'cd_start'], row.date)
            dateRange.at[key, 'cd_end'] = max(dateRange.at[key, 'cd_end'], row.date)
        else:
            # Feed that only appears in calendar_dates.txt. Record this first date
            # (it used to be dropped) and give the calendar.txt range placeholders
            # rather than leaving it empty, so the string comparison below works
            # and the feed is listed for manual review.
            dateRange.at[key, 'c_start'] = noStart
            dateRange.at[key, 'c_end'] = noEnd
            dateRange.at[key, 'cd_start'] = row.date
            dateRange.at[key, 'cd_end'] = row.date

# log dates to manually check for overlap
for index, row in dateRange.iterrows():
    if row.c_start > targetDate or row.c_end < targetDate:
        itpId = int(index.split('_')[0])
        # An id missing from agencies.yml should not abort the report.
        agency = calitpAgences.get(itpId, 'unknown agency')
        print(f'{agency}, {index}, {row.c_start}, {row.c_end}, {row.cd_start}, {row.cd_end}')

In [None]:
# Scratch: read agency data

# Prereq: ensure https://github.com/cal-itp/data-infra/blob/main/airflow/data/agencies.yml has been downloaded

# Load Cal-ITP agency id/name/URL info
# Note most agencies only have one URL, but some have multiple (e.g. LA Metro has separate URLs for bus and rail)
# So id and url number form a compound unique key
with open('agencies.yml', 'r') as file:
    agencies = yaml.safe_load(file)

# Create one directory per (agency, feed index); slashes in agency names are
# replaced so the name stays a single path component.
for agency in agencies:
    for urlNum, _ in enumerate(agencies[agency]['feeds']):
        # makedirs creates the parent 'out' directory if needed and does not fail
        # when the cell is run a second time.
        os.makedirs('out/' + agency.replace('/', '_') + '_' + str(urlNum), exist_ok=True)