In [1]:
from dotenv import load_dotenv
import json
import os

import numpy as np
import pandas as pd
from pyairtable import Api
import requests


from calitp_data_analysis.sql import to_snakecase
import script

In [2]:
# Load environment configuration via python-dotenv before any key is read.
load_dotenv()

# Key for the 511.org API; this is None when the variable is not set.
MTC_API_KEY = os.environ.get("MTC_API_KEY")

# Airtable client built from the AIRTABLE_TOKEN environment variable.
api = Api(os.environ.get("AIRTABLE_TOKEN"))

# 511.org transit operators endpoint, with the key passed as a query parameter.
mtc_operator_url = f"https://api.511.org/transit/operators?api_key={MTC_API_KEY}"

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, two-decimal
# floats, and no truncation of rows or of column contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
# Intended to stay consistent with
# https://github.com/cal-itp/data-infra/blob/main/airflow/plugins/operators/airtable_to_gcs.py
def all_rows_as_df(base_id, table_name):
    """Return every record of one Airtable table as a DataFrame.

    The table is looked up through the module-level ``api`` client. Each
    record contributes one row: an ``id`` column holding the record's ``id``,
    followed by one column per key in the record's ``fields`` mapping.

    Parameters
    ----------
    base_id : passed to ``api.table`` as ``base_id``.
    table_name : passed to ``api.table`` as ``table_name``.

    Returns
    -------
    pandas.DataFrame with one row per record.
    """
    table = api.table(base_id=base_id, table_name=table_name)

    flattened = []
    for record in table.all():
        entry = {"id": record["id"]}
        # Keys in ``fields`` are laid out after ``id``; a field that is itself
        # called "id" would replace the record id, as with dict unpacking.
        entry.update(record["fields"])
        flattened.append(entry)

    return pd.DataFrame(flattened)

def takeout_list(x):
    """Return the first element of a list-valued cell, or None when there is none.

    Parameters
    ----------
    x : indexable value (e.g. a single-item list), None, or a float NaN.

    Returns
    -------
    ``x[0]`` for an indexable, non-empty value; ``None`` when ``x`` is
    ``None``, any float NaN, or an empty sequence.
    """
    # An identity check against np.nan only recognises that one singleton;
    # other NaN float objects (float("nan"), np.float64 NaN) must be caught by
    # value, otherwise they reach x[0] and raise TypeError.
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return None
    try:
        return x[0]
    except IndexError:
        # Empty sequence: nothing to take out.
        return None


In [5]:
# Identifier passed as ``base_id`` when loading the Airtable tables below.
TABLE_TRANSIT_DATA_QUALITY_ISSUES_ID = "appmBGOFTvsDv4jdJ"

In [6]:
# Load three tables from the same Airtable base, one DataFrame per table.
issues_df = all_rows_as_df(
    base_id=TABLE_TRANSIT_DATA_QUALITY_ISSUES_ID,
    table_name="tblEv7QTfEmypU6gg",
)
issue_types = all_rows_as_df(
    base_id=TABLE_TRANSIT_DATA_QUALITY_ISSUES_ID,
    table_name="tblupkIe04LxEPWSR",
)
services_df = all_rows_as_df(
    base_id=TABLE_TRANSIT_DATA_QUALITY_ISSUES_ID,
    table_name="tblBZtbuntv4D0i1u",
)

In [7]:
# These two columns hold single-item lists; replace each list with its first
# element so the values can be used as merge keys.
# NOTE: this overwrites the columns in place. Re-running the cell on values
# that are already unwrapped strings would index into the string (x[0]) and
# keep only its first character, so run it once per fresh load.
issues_df["Issue Type"] = issues_df["Issue Type"].apply(takeout_list)
issues_df["Services"] = issues_df["Services"].apply(takeout_list)

In [8]:
# Attach the issue-type name and notes to every issue (left join keeps issues
# without a matching type), then drop bookkeeping columns and let the looked-up
# name take over the 'Issue Type' column.
partial = (
    issues_df
    .merge(
        issue_types[['id', 'Name', 'Notes']],
        how='left',
        left_on='Issue Type',
        right_on='id',
    )
    .drop(columns=['id_x', 'Issue Type', 'Created By', 'Last Modified', 'Last Modified By'])
    .rename(columns={'Name': 'Issue Type'})
)

In [9]:
# Look up the service name for each issue via the 'Services' key (left join),
# then remove the key column and the leftover 'id_y' column.
df = (
    partial
    .merge(
        services_df[['id', 'Name']],
        how='left',
        left_on='Services',
        right_on='id',
    )
    .drop(columns=['Services', 'id_y'])
    .rename(columns={'Name': 'Service Name'})
)

In [None]:
from calitp_data_analysis.sql import query_sql

# Timestamps of the 100 most recent schedule feed downloads that hit an unzip
# or download exception, or parsed fewer than 100 percent of their files.
#
# The three failure conditions are wrapped in parentheses because AND binds
# tighter than OR in SQL: without them `dp._is_current=true` only restricted
# the `pct_files_successfully_parsed < 100` branch, and rows with an unzip or
# download exception were returned regardless of whether the provider record
# was current.
issues = query_sql("""
SELECT ts
FROM `cal-itp-data-infra.mart_gtfs.fct_schedule_feed_downloads` fs
left join
`mart_transit_database.dim_provider_gtfs_data` dp
ON fs.gtfs_dataset_key = dp.schedule_gtfs_dataset_key
left join
`mart_transit_database.dim_services` ds 
ON dp.service_key = ds.key
left join
`mart_transit_database.bridge_services_x_operating_county_geographies` bs
ON ds.key = bs.service_key
left join
`mart_transit_database.dim_county_geography` dc
ON bs.county_geography_key = dc.key
WHERE
(
unzip_exception IS NOT NULL
OR
download_exception IS NOT NULL
OR
pct_files_successfully_parsed < 100
)
AND
dp._is_current=true
order by ts DESC 
LIMIT 100
""", as_df=True)

In [None]:
# NOTE(review): `full` is not assigned in any cell above, so on a fresh
# Restart & Run All this line would raise NameError. Presumably `df` or
# `issues` was intended — TODO confirm which frame this should inspect.
full.columns

In [None]:
# Display the result of the SQL query above (the query is capped by LIMIT 100).
issues