# Querying the Warehouse

This notebook walks through a few ways to query tables in the warehouse, whether they originate from Airtable or are generated from GTFS feeds.

To see what tables are available to query, check out the [DBT docs](https://dbt-docs.calitp.org/#!/overview), and go to `calitp_warehouse`>>`models`>>`mart`.

The tables from Airtable are located in `transit_database` or `mart_transit_database`


This is also detailed in the [Cal-ITP Docs](https://docs.calitp.org/data-infra/analytics_tools/python_libraries.html?highlight=tbls).

In [1]:
import pandas as pd

import shared_utils



In [2]:
import os

# 800 * 10**9 bytes, i.e. 800 GB in decimal units.
# NOTE(review): presumably this is the per-query BigQuery byte limit read by
# calitp_data_analysis -- confirm against the Cal-ITP docs.
os.environ["CALITP_BQ_MAX_BYTES"] = f"{800 * 10**9}"


In [3]:
# Show up to 100 columns when a dataframe is displayed, so wide warehouse
# tables are not truncated in the outputs below.
pd.set_option("display.max_columns", 100)

## Using Query_Sql

You can use the same SQL syntax you would use in BigQuery to query a table. This function takes the SQL query and lets you work directly with the output as a pandas dataframe.

In [4]:
## import the package that has the query_sql function

from calitp_data_analysis import sql, magics

In [5]:
# Run the query in the warehouse and keep the result as a pandas dataframe.
# Plain (non-f) string on purpose: the query has no placeholders, and with an
# f-string any literal `{`/`}` added to the SQL later would be interpolated.
dim_orgs_sql = sql.query_sql(
    '''
SELECT * FROM `cal-itp-data-infra.mart_transit_database.dim_organizations`
'''
)

In [6]:
# Preview the first five rows of the query result.
dim_orgs_sql.head(n=5)

Unnamed: 0,key,source_record_id,name,organization_type,roles,itp_id,details,caltrans_district,website,reporting_category,hubspot_company_record_id,gtfs_static_status,gtfs_realtime_status,_deprecated__assessment_status,manual_check__contact_on_website,alias,is_public_entity,ntd_id,public_currently_operating,public_currently_operating_fixed_route,_is_current,_valid_from,_valid_to
0,c94c49e21a8ee2e6716fb0194f64cde0,recaBKK3NlO5CMsyD,,,[],,,,,,,Static OK,RT Incomplete,,,[],,,,,False,2022-07-08 00:00:00+00:00,2022-07-08 23:59:59.999999+00:00
1,c82f1f1a872cda2ba3ceb0e9228d6dbb,recQQo6hAZWEs5y0h,,,[],,,,,,,Static OK,RT Incomplete,False,,[],,,,,False,2023-04-13 00:00:00+00:00,2023-04-27 23:59:59.999999+00:00
2,728c79b2b4f30f00fab2d1809dc7eaae,recFwTA2CJkcSZBUU,,,[],,,,,,,Static OK,RT Incomplete,,,[],,,,,False,2022-09-23 00:00:00+00:00,2022-10-17 23:59:59.999999+00:00
3,e4524bd70b6a3c768976441b281f3b06,recFwTA2CJkcSZBUU,,,[],,,,,,,Static OK,RT Incomplete,,Unknown,[],,,,,False,2022-10-18 00:00:00+00:00,2022-10-24 23:59:59.999999+00:00
4,536256e13e6a90d0ab5a72a17484d756,reci6D14cTTXA4GsG,,,[],,,,,,,Static OK,RT Incomplete,,,[],,,,,False,2022-09-29 00:00:00+00:00,2022-10-17 23:59:59.999999+00:00


## Using Tbls

This function allows you to query tables from the warehouse as a pandas dataframe.


In [7]:
## import the package for querying tables directly from the warehouse, tbls
## also import Siuba for Option 2

from siuba import *
from calitp_data_analysis.tables import tbls

### Option 1: Using tbls

Hint: press `Tab` after the period to see the available options. Example: tbls.`Tab`

In [8]:
# Call the table accessor and leave it as the last expression of the cell to
# display a preview of the table (see the output below).
tbls.mart_transit_database.dim_services()

Unnamed: 0,key,source_record_id,name,service_type,mode,operating_counties,gtfs_schedule_status,gtfs_schedule_quality,manual_check__gtfs_realtime_data_ingested_in_trip_planner,manual_check__gtfs_schedule_data_ingested_in_trip_planner,deprecated_date,fixed_route,is_public,public_currently_operating,public_currently_operating_fixed_route,start_date,operational_status,_deprecated__currently_operating,_deprecated__assessment_status,_valid_from,_valid_to,_is_current
0,f6fa8daf4fac9b46443d4786503d0edc,recbJtVSGpX9Ojz2j,,[],[],[],,,,,,,No,,,,,,False,2023-04-06 00:00:00+00:00,2023-04-06 23:59:59.999999+00:00,False
1,e6d6347b6fa61ff42f8ebed41a983c07,recx7nz3W3RRkBRIq,Yes,[],[],[],,,,,,,No,,,,,,False,2023-05-03 00:00:00+00:00,2023-05-04 23:59:59.999999+00:00,False
2,c357000a17918580514a9205d7a36ec3,recZi8sngoKH9jqKZ,DASH,[fixed-route],[bus],[Los Angeles],ok,2 - GTFS data has active service,,,,,,,,,,True,True,2022-12-02 00:00:00+00:00,2023-01-13 23:59:59.999999+00:00,False
3,46f3d92a28bef0b3e0cbac24fda3b79a,recZi8sngoKH9jqKZ,DASH,[fixed-route],[bus],[Los Angeles],ok,2 - GTFS data has active service,,,,,,,,,,True,,2022-06-29 00:00:00+00:00,2022-12-01 23:59:59.999999+00:00,False
4,892d779fc8e06b2a305484d3ba098849,recWExeJgJw1wa7AV,FRAN,[on-demand],[bus],[Orange],needed,,,,,,,,,,,True,,2022-06-29 00:00:00+00:00,2022-12-01 23:59:59.999999+00:00,False


### Option 2: Using Siuba

[Siuba Reference Guide](https://siuba.org/api/)

In [9]:
# `collect()` at the end of the pipe is what saves the query result as a
# pandas dataframe.
tbls_services = tbls.mart_transit_database.dim_services() >> collect()

In [10]:
# Preview the first five rows of the collected dataframe.
tbls_services.head(n=5)

Unnamed: 0,key,source_record_id,name,service_type,mode,operating_counties,gtfs_schedule_status,gtfs_schedule_quality,manual_check__gtfs_realtime_data_ingested_in_trip_planner,manual_check__gtfs_schedule_data_ingested_in_trip_planner,deprecated_date,fixed_route,is_public,public_currently_operating,public_currently_operating_fixed_route,start_date,operational_status,_deprecated__currently_operating,_deprecated__assessment_status,_valid_from,_valid_to,_is_current
0,f6fa8daf4fac9b46443d4786503d0edc,recbJtVSGpX9Ojz2j,,[],[],[],,,,,,,No,,,,,,False,2023-04-06 00:00:00+00:00,2023-04-06 23:59:59.999999+00:00,False
1,e6d6347b6fa61ff42f8ebed41a983c07,recx7nz3W3RRkBRIq,Yes,[],[],[],,,,,,,No,,,,,,False,2023-05-03 00:00:00+00:00,2023-05-04 23:59:59.999999+00:00,False
2,c357000a17918580514a9205d7a36ec3,recZi8sngoKH9jqKZ,DASH,[fixed-route],[bus],[Los Angeles],ok,2 - GTFS data has active service,,,,,,,,,,True,True,2022-12-02 00:00:00+00:00,2023-01-13 23:59:59.999999+00:00,False
3,46f3d92a28bef0b3e0cbac24fda3b79a,recZi8sngoKH9jqKZ,DASH,[fixed-route],[bus],[Los Angeles],ok,2 - GTFS data has active service,,,,,,,,,,True,,2022-06-29 00:00:00+00:00,2022-12-01 23:59:59.999999+00:00,False
4,892d779fc8e06b2a305484d3ba098849,recWExeJgJw1wa7AV,FRAN,[on-demand],[bus],[Orange],needed,,,,,,,,,,,True,,2022-06-29 00:00:00+00:00,2022-12-01 23:59:59.999999+00:00,False


In [11]:
## another example table query

In [12]:
# Organization <-> service bridge table, restricted to current rows.
bridge_table_tbls = (
    tbls.mart_transit_database.bridge_organizations_x_services_managed()
    >> filter(_._is_current == True)  # siuba `filter`: keep rows flagged as current
    >> collect()  # save the result as a pandas dataframe
)


In [13]:
# Show one randomly chosen row (unseeded, so the row differs between runs).
bridge_table_tbls.sample(n=1)

Unnamed: 0,organization_key,service_key,organization_name,service_name,_valid_from,_valid_to,_is_current
212,f3dea1b508e56ee099602255b43cedfb,4ecee70c463fd9458720c664e233844e,City of Hawthorne,Hawthorne Dial-A-Ride Transportation,2023-05-25 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True


## Joining tables with QuerySQL and dataframes from tbls

### Using pandas to join

In [14]:
# Inner join (pandas' default `how`) of the organizations dataframe with the
# bridge dataframe on the organization key. `indicator=True` adds a `_merge`
# column; columns present in both frames get `_x` / `_y` suffixes.
tbs_join = dim_orgs_sql.merge(
    bridge_table_tbls,
    how="inner",
    left_on="key",
    right_on="organization_key",
    indicator=True,
)

In [15]:
# Show one randomly chosen row of the joined dataframe (unseeded).
tbs_join.sample(n=1)

Unnamed: 0,key,source_record_id,name,organization_type,roles,itp_id,details,caltrans_district,website,reporting_category,hubspot_company_record_id,gtfs_static_status,gtfs_realtime_status,_deprecated__assessment_status,manual_check__contact_on_website,alias,is_public_entity,ntd_id,public_currently_operating,public_currently_operating_fixed_route,_is_current_x,_valid_from_x,_valid_to_x,organization_key,service_key,organization_name,service_name,_valid_from_y,_valid_to_y,_is_current_y,_merge
540,432ebdfde54e5d6fe2f39fb78d6277c0,reclbzT9trIiGwjBB,Tuolumne County Transit Agency,Independent Agency,[],482.0,,10 - Stockton,https://www.tuolumnecountytransit.com/,Core,1880690880,Static Incomplete,RT Incomplete,True,No,[],True,9R02-91057,True,True,True,2023-06-17 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,432ebdfde54e5d6fe2f39fb78d6277c0,29d5b46d370b915eba9a25548ecf26ae,Tuolumne County Transit Agency,Tuolumne County Transit,2023-06-17 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,both


### Adding the join into the sql query

In [16]:
# Example of a larger query: the join is done inside the SQL query instead of
# in pandas. Each CTE keeps only the `_is_current = True` rows of its table,
# and `join1` left-joins organizations -> bridge -> services.
# Plain (non-f) string on purpose: the query has no placeholders, and with an
# f-string any literal `{`/`}` added to the SQL later would be interpolated.
# The joined tables share column names; in the output below the repeats show
# up with numeric suffixes (e.g. `key_1`, `_valid_from_2`).
sql_join = sql.query_sql(
    '''
WITH orgs AS (
    SELECT * 
    FROM `cal-itp-data-infra.mart_transit_database.dim_organizations`
    WHERE _is_current = True
    ),
    
bridge AS (
    SELECT * 
    FROM `cal-itp-data-infra.mart_transit_database.bridge_organizations_x_services_managed`
    WHERE _is_current = True
    ), 
    
services AS (
    SELECT *
    FROM `cal-itp-data-infra.mart_transit_database.dim_services`
    WHERE _is_current = True
    ),

join1 AS (
    SELECT * 
    FROM orgs AS T1
    LEFT JOIN bridge AS T2
        ON 
            T1.key = T2.organization_key
    LEFT JOIN services as T3
        ON
            T2.service_key = T3.key
    ) 

SELECT * FROM join1 
'''
)

In [17]:
# Show five randomly chosen rows of the SQL-side join (unseeded).
sql_join.sample(n=5)

Unnamed: 0,key,source_record_id,name,organization_type,roles,itp_id,details,caltrans_district,website,reporting_category,hubspot_company_record_id,gtfs_static_status,gtfs_realtime_status,_deprecated__assessment_status,manual_check__contact_on_website,alias,is_public_entity,ntd_id,public_currently_operating,public_currently_operating_fixed_route,_is_current,_valid_from,_valid_to,organization_key,service_key,organization_name,service_name,_valid_from_1,_valid_to_1,_is_current_1,key_1,source_record_id_1,name_1,service_type,mode,operating_counties,gtfs_schedule_status,gtfs_schedule_quality,manual_check__gtfs_realtime_data_ingested_in_trip_planner,manual_check__gtfs_schedule_data_ingested_in_trip_planner,deprecated_date,fixed_route,is_public,public_currently_operating_1,public_currently_operating_fixed_route_1,start_date,operational_status,_deprecated__currently_operating,_deprecated__assessment_status_1,_valid_from_2,_valid_to_2,_is_current_2
989,b740efd508952645b48c2b882555f726,recI6rQVkFzLOV2yY,Avenidas,Non-Profit Organization,[],401.0,,04 - Oakland,https://www.avenidas.org/,,8567080963.0,Static OK,RT Incomplete,False,Unknown,[],False,,False,False,True,2023-05-05 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,b740efd508952645b48c2b882555f726,4af2caa4936b690bfcff381ea5543833,Avenidas,Avenidas Door-to-Door,2023-05-16 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,4af2caa4936b690bfcff381ea5543833,recsK6NbG9okIQxVe,Avenidas Door-to-Door,"[NEMT, on-demand]",[car/van],[San Mateo],needed,,N/A - no fixed-route service,N/A - no fixed-route service,,,Yes,False,False,,Operating,True,False,2023-05-16 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
1040,51866b562b8ae2243def7d4adf333c32,recgOcejLwLyXyUGr,"Tule River Indian Health Center, Inc.",Company,[],,,06 - Fresno,http://www.trihci.org/outreach/,,,Static OK,RT Incomplete,False,Unknown,[],False,,False,False,True,2023-05-05 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,51866b562b8ae2243def7d4adf333c32,56f69699b45fd5ab9279b238aedeb5ef,"Tule River Indian Health Center, Inc.",Tule River Indian Health Center,2023-05-16 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,56f69699b45fd5ab9279b238aedeb5ef,receq3UBiYoEvPhoy,Tule River Indian Health Center,"[NEMT, on-demand]",[car/van],[Tulare],needed,,N/A - no fixed-route service,N/A - no fixed-route service,,,Yes,False,False,,Operating,True,False,2023-05-16 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
719,3592db816df3254dc076fb815c783e2a,reczF5Y8R9CUJmfSy,City of Pasadena,City/Town,[],243.0,,07 - Los Angeles,https://www.cityofpasadena.net/transportation/,Core,1880531805.0,Static OK,RT OK,True,Yes,[],True,99424.0,True,True,True,2023-05-23 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,3592db816df3254dc076fb815c783e2a,05c306338b9c363e41018c9b5e695437,City of Pasadena,Pasadena Dial-A-Ride,2023-05-23 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,05c306338b9c363e41018c9b5e695437,rec3B8poy9I1MxbDu,Pasadena Dial-A-Ride,"[ADA paratransit, on-demand]",[bus],[Los Angeles],needed,,N/A - no fixed-route service,N/A - no fixed-route service,,,No,False,False,,Operating,True,False,2023-05-16 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
525,d8bf13cbee98a8cceeb41520a6f23b0a,recRfiIeNE7hyQ5h3,American Cancer Society,Non-Profit Organization,[],396.0,,,https://www.cancer.org/,,8567108020.0,Static OK,RT Incomplete,False,Unknown,[],False,,False,False,True,2023-05-25 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,d8bf13cbee98a8cceeb41520a6f23b0a,a225be17cda56b7197a0a02347a89440,American Cancer Society,American Cancer Society,2023-05-25 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,a225be17cda56b7197a0a02347a89440,recVd6dDU5KInfB60,American Cancer Society,[ADA paratransit],[car/van],[],needed,,N/A - no fixed-route service,N/A - no fixed-route service,,,No,False,False,,Operating,True,False,2023-05-17 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
964,a896ecc2a4b19a32d7b54d0d3c6039f2,rec8XA7qMoBtd0Vef,Scripps Mercy Hospital,Non-Profit Organization,[],,,11 - San Diego,https://www.scripps.org/locations/hospitals/sc...,,,Static OK,RT Incomplete,False,Unknown,[],False,,False,False,True,2023-05-27 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,a896ecc2a4b19a32d7b54d0d3c6039f2,73321ecc080ba5baf5a01b109b36fd7d,Scripps Mercy Hospital,Scripps Mercy Hospital,2023-06-03 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True,73321ecc080ba5baf5a01b109b36fd7d,recE4pyc8hxRvfi06,Scripps Mercy Hospital,[NEMT],[car/van],[San Diego],,,N/A - no fixed-route service,N/A - no fixed-route service,,,No,False,False,,Operating,True,False,2023-06-03 00:00:00+00:00,2098-12-31 23:59:59.999999+00:00,True
