#### Extract and load raw data - NTD validator pipeline
This notebook shows the code development steps for the script `check_raw_data.py`
Tentative plan:  
1.	Scan the Google Cloud bucket once a ... day? Scan Sharepoint folder (or hit the API once per day)  
2.	Grab all report files (BlackCat submitted reports are stored in GCS in excel files)  
    a.	Grab all files that start with the filename for each particular report (e.g., "NTD_Annual_Report_Rural_2023_\<dateuploaded>.xlsx" for the RR-20)  
    b.	Function to get the latest one (based on end of filename) - these files have data for ALL ORGS that have submitted so far  
    c.	henceforth referred to as "DATASET A"  
3.	In each, check for all new organizations with a report submitted between last time checked and current time  
    ~a.	Assuming there's a timestamp for last submitted~  
    b.	Grab the appropriate table from the "blackcat_raw" dataset in BigQuery (What we already have)  
    c.	henceforth referred to as "DATASET B"  
4.	For loop to go through by each org in DATASET A and:  
    a.	create temp data frames of their data only, from DATASET A and DATASET B  
    b.	Delete timestamp of date uploaded, from DATASET B  
    c.	compare to see if identical  
    d.	if so, pass  
5.	if not identical, then it's either a new submission or revision  
    a.	Continue on to validation checks  
    b.	Continue on to saving data to "blackcat_raw" data tables  
    c.	add in upload timestamp column  


In [1]:
from google.cloud import bigquery, storage
import pandas as pd
import datetime
import re

**First, connect to Google Cloud**

In [2]:
storage_client = storage.Client(project='cal-itp-data-infra')

In [3]:
bucket_name = "calitp-ntd-report-validation"

bucket = storage_client.get_bucket(bucket_name)

In [4]:
bucket

<Bucket: calitp-ntd-report-validation>

In [7]:
buckets = storage_client.list_buckets()
buckets # <google.api_core.page_iterator.HTTPIterator at 0x7fc650b840d0>

# this lists all the buckets in the Cal ITP project
for bucket in buckets:
        print(bucket.name)

analysis-output-models
artifacts.cal-itp-data-infra.appspot.com
cal-itp-data-infra.appspot.com
calitp-aggregator-scraper
calitp-airtable
calitp-amplitude-benefits-events
calitp-analytics-data
calitp-backups-grafana
calitp-backups-metabase
calitp-backups-sentry
calitp-backups-test
calitp-ci-artifacts
calitp-dbt-artifacts
calitp-elavon-parsed
calitp-elavon-raw
calitp-gtfs-download-config
calitp-gtfs-rt-parsed
calitp-gtfs-rt-raw-deprecated
calitp-gtfs-rt-raw-v2
calitp-gtfs-rt-validation
calitp-gtfs-schedule-parsed
calitp-gtfs-schedule-parsed-hourly
calitp-gtfs-schedule-raw-v2
calitp-gtfs-schedule-unzipped
calitp-gtfs-schedule-unzipped-hourly
calitp-gtfs-schedule-validation
calitp-gtfs-schedule-validation-hourly
calitp-jamesl-gcp-components-tfstate
calitp-map-tiles
calitp-ntd-data-products
calitp-ntd-report-validation
calitp-payments-littlepay-parsed
calitp-payments-littlepay-raw
calitp-prod-gcp-components-tfstate
calitp-publish
calitp-publish-data-analysis
calitp-reports-data
cold-storage

In [12]:
## Not super useful 

blobs = list(bucket.list_blobs())
blobs

[<Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:012673bfdf8a902e8f90320ba2858e51708cac3e592b684e2c769c4330262fe8, 1630336376393109>,
 <Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:05f60bd112ef9c0d9ecdbc4bbb59743fea31ad7cdc4aefc7915258731734c9c9, 1630337085328912>,
 <Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:062ada600c9e0239e04bfc21da3719c4d0db3276450b28c455231f416015ba9f, 1626705521505559>,
 <Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:0ab5ec7719940ff567e31ea7bcb202c5a70cf97551c77fd8a9ffcffe8d8c6a57, 1630337083471178>,
 <Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:0bde298e076a8f3680a810ea79dc73250029fba8340b2380c138bfacb0101610, 1634311932186980>,
 <Blob: us.artifacts.cal-itp-data-infra.appspot.com, containers/images/sha256:0c02b2144076aedf7707e7763a5cd07fbd0de4ef631977075076b16c7fa0fc46, 1617909136841535>,
 <Blob: us.artifacts.c

In [10]:
storage_client.list_blobs("calitp-ntd-report-validation")

<google.api_core.page_iterator.HTTPIterator at 0x7f97b87b2520>

### Find the correct bucket, then search for the latest data 
Grab all report files (BlackCat submitted reports are stored in GCS in excel files)  
a. Grab all files that start with the filename for each particular report (e.g., "NTD_Annual_Report_Rural_2023_<dateuploaded>.xlsx" for the RR-20)  
b. Function to get the latest one (based on end of filename) - these files have data for ALL ORGS that have submitted so far  
c. henceforth referred to as "DATASET A"

USING THE `calitp` LIBRARY - using `storage_client.get_bucket('calitp-ntd-report-validation')` doesn't work for me on Airflow despite working here. Authentication issue when trying on Airflow?  
  
https://gcsfs.readthedocs.io/en/latest/api.html

In [17]:
import sys
sys.path.insert(0, "/Users/kim.engie/Dev/caltrans/git_repos/data-infra/packages/calitp-data-infra")
from calitp_data_infra.storage import get_fs


fs = get_fs()
# type(fs)
# fs.ls('calitp-ntd-report-validation')
# type(fs.ls('calitp-ntd-report-validation'))
# print(fs.ls('calitp-ntd-report-validation'))

In [33]:
fs.ls('calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw')
# fs.isdir('calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw')   #False

['calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/A_30_Revenue_Vehicle_Report _2023_2023-10-17.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/A_30_Revenue_Vehicle_Report_2023_2023-10-04.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/A_30_Revenue_Vehicle_Report_2023_2023-10-10.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/A_30_Revenue_Vehicle_Report_2023_2023-10-24.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-09-28.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-05.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-10.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-17.xlsx',
 'calitp-ntd-report-validation/blackcat_ntd_reports_202

In [34]:
# Need dict code table to decipher a) forms to files - will be the BEGINNING of the form name
this_year=datetime.datetime.now().year
# print(this_year)


form_to_file_dict = {
    "RR-20": f"NTD_Annual_Report_Rural_{this_year}",
    "A-30": f"A_30_Revenue_Vehicle_Report_{this_year}",
    "A-10": f"NTD_Stations_and_Maintenace_Facilities_A10_{this_year}",
    "Inventory": "RevenueVehicles"
}


In [35]:
form_to_check = "RR-20"
file_prefix = form_to_file_dict.get(form_to_check) #'NTD_Annual_Report_Rural_2023'
print(file_prefix)

NTD_Annual_Report_Rural_2023


In [38]:
# Get the most recent file in the raw bucket
all_dates=[]
for file in fs.ls('calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw'):
    if file_prefix in file:
        print(file)
        f = file.split('/')[-1]
#         print(f)
        fdate = re.search(r'(\d{4}-\d{2}-\d{2})', f).group()
        all_dates.append(fdate)
        print(all_dates)
    else:
        pass

all_dates.sort(key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"), reverse=True)
print(all_dates)

calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-09-28.xlsx
['2023-09-28']
calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-05.xlsx
['2023-09-28', '2023-10-05']
calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-10.xlsx
['2023-09-28', '2023-10-05', '2023-10-10']
calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-17.xlsx
['2023-09-28', '2023-10-05', '2023-10-10', '2023-10-17']
calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-24.xlsx
['2023-09-28', '2023-10-05', '2023-10-10', '2023-10-17', '2023-10-24']
['2023-10-24', '2023-10-17', '2023-10-10', '2023-10-05', '2023-09-28']


In [49]:
latest_date = all_dates[0]
latest_date

for file in fs.ls('calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw'):
    if (latest_date in file) and (file_prefix in file):
        latest_file = file

latest_file

'calitp-ntd-report-validation/blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-24.xlsx'

### Pull data from latest file, and compare to existing data in BG
Next: cycle through each worksheet, filter by org, pull raw data in BQ and compare.  
* Grab the appropriate table from the "blackcat_raw" dataset in BigQuery (What we already have)
* henceforth referred to as "DATASET B"  
For loop to go through by each org in DATASET A and:  
* create temp data frames of their data only, from DATASET A and DATASET B
* Delete timestamp of date uploaded, from DATASET B
* compare to see if identical
* if so, pass


In [43]:
form_to_sheets_dict = {
        "RR-20": ['Basics.Contacts', 'Modes', 'Expenses By Mode', 'Revenues By Mode', 'Financials - 2', 'Service Data', 'Safety', 'Other Resources'],
        "A-30": ['A-30 (Rural) RVI'],
        "A-10": ['PurchaseTranspFacOwnTypes', 'DirectlyOperatedFacOwnTypes'],
        "Inventory": ['Revenue Vehicles']
    }

In [44]:
# Get the worksheet names for the specific file we're loading.
our_sheets = form_to_sheets_dict.get(form_to_check)
our_sheets

['Basics.Contacts',
 'Modes',
 'Expenses By Mode',
 'Revenues By Mode',
 'Financials - 2',
 'Service Data',
 'Safety',
 'Other Resources']

In [45]:
def load_excel_data(filepath, sheetname):
    df = pd.read_excel(filepath,
                        sheet_name=sheetname,
                        index_col=None)
    return df


In [46]:
# This is only for getting a list of the subrecipients that are in the raw data - 
# if we're checking the RR-20, pull from the 2nd sheet (skip the 1st contacts info cuz it looks like it always lists everyone, regardless of who actually submitted something)
# all other forms have only 1 worksheet so just grab that one.

if form_to_check == "RR-20":
    sheet = form_to_sheets_dict.get(form_to_check)[1]
else:
    sheet = form_to_sheets_dict.get(form_to_check)[0]

In [47]:
sheet

'Modes'

In [50]:
sheet = 'Expenses By Mode'
df = load_excel_data(f"gs://{latest_file}", sheet)
# df = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheet)
df

Unnamed: 0,Organization Legal Name,Common Name/Acronym/DBA,Fiscal Year,Operating/Capital,Mode,Total Annual Expenses By Mode
0,Amador Transit,,2023,Capital,Commuter Bus (CB) - (DO),0.0
1,Amador Transit,,2023,Capital,Demand Response (DR) - (DO),0.0
2,Amador Transit,,2023,Capital,Deviated Fixed Route (DF) - (DO),0.0
3,Amador Transit,,2023,Operating,Commuter Bus (CB) - (DO),196746.0
4,Amador Transit,,2023,Operating,Demand Response (DR) - (DO),394415.0
...,...,...,...,...,...,...
145,Tuolumne County Transit Agency (TCTA),TCT,2023,Capital,Demand Response (DR) - (PT),0.0
146,Tuolumne County Transit Agency (TCTA),TCT,2023,Operating,Bus (MB) (Fixed Route) - (PT),997518.0
147,Tuolumne County Transit Agency (TCTA),TCT,2023,Operating,Demand Response (DR) - (PT),1436041.0
148,Yosemite Area Regional Transportation System,YARTS,2023,Capital,Bus (MB) (Fixed Route) - (PT),2429189.0


In [17]:
orgs_in_file = df['Organization Legal Name'].unique()
orgs_in_file

array(['Alpine County Community Development', 'Amador Transit',
       'Calaveras Transit Agency ', 'City of Arcata', 'City of Arvin',
       'City of Auburn', 'City of California City', 'City of Chowchilla ',
       'City of Corcoran - Corcoran Area Transit', 'City of Dixon',
       'City of Guadalupe', 'City of McFarland', 'City of Needles',
       'City of Ridgecrest', 'City of Rio Vista', 'City of Shafter',
       'City of Solvang', 'City of Taft', 'City of Tehachapi',
       'Colusa County Transit Agency', 'County of Mariposa',
       'County of Nevada Public Works, Transit Services Division',
       'County of Sacramento Department of Transportation',
       'County of Shasta Department of Public Works',
       'Eastern Sierra Transit Authority', 'Eureka Transit Service',
       'Fresno County Rural Transit Agency',
       'Glenn County Transportation Commission',
       'Humboldt Transit Authority',
       'Kern Regional Transit (County of Kern)', 'Lake Transit Authority',
     

In [31]:
org = 'Alpine County Community Development'
df = df[df['Organization Legal Name']== org]

In [32]:
df

Unnamed: 0,Organization Legal Name,Common Name/Acronym/DBA,Fiscal Year,Operating/Capital,Mode,Total Annual Expenses By Mode
0,Alpine County Community Development,,2023,Capital,Demand Response (DR) - (DO),0.0
1,Alpine County Community Development,,2023,Operating,Demand Response (DR) - (DO),81766.0


In [33]:
df.dtypes

Organization Legal Name           object
Common Name/Acronym/DBA           object
Fiscal Year                        int64
Operating/Capital                 object
Mode                              object
Total Annual Expenses By Mode    float64
dtype: object

In [16]:
orgs = pd.read_csv("../data/organizations.csv")
orgs_to_submit = orgs['Organization'].unique() # I see some whitespaces
orgs_to_submit = [x.strip(' ') for x in orgs_to_submit]
orgs_to_submit

['Alpine County Community Development',
 'Amador Transit',
 'Butte County Association of Governments/ Butte Regional Transit',
 'Calaveras Transit Agency',
 'City of Arvin',
 'City of Auburn',
 'City of California City',
 'City of Chowchilla',
 'City of Corcoran - Corcoran Area Transit',
 'City of Dinuba',
 'City of Dixon',
 'City of Escalon',
 'City of Eureka/Eureka Transit Service',
 'City of Guadalupe',
 'City of McFarland',
 'City of Needles',
 'City of Ojai',
 'City of Ridgecrest',
 'City of Rio Vista',
 'City of Santa Maria',
 'City of Shafter',
 'City of Solvang',
 'City of Taft',
 'City of Tehachapi',
 'City of Visalia',
 'City of Wasco',
 'City of Woodlake',
 'County of Colusa/ Colusa County Transit Agency',
 'County of Glenn Transit Service/Glenn County Transportation Commission',
 'County of Los Angeles - Department of Public Works',
 'County of Mariposa',
 'County of Nevada Public Works, Transit Services Division',
 'County of Plumas/Plumas County Transportation Commission'

In [114]:
'City of Arvin' in orgs_to_submit #make an if check in later function

df[df['Organization Legal Name']== 'City of Arvin']

Unnamed: 0,Organization Legal Name,Common Name/Acronym/DBA,Fiscal Year,Mode,Type of Service,Commitment Date,Start Date,End Date
0,City of Arvin,,2023,Demand Response (DR),DO - Directly Operated,,,
1,City of Arvin,,2023,Deviated Fixed Route (DF),DO - Directly Operated,,,


---
### Now we have the raw data coming in, for one subrecipient.
Now we must pull that data from  BigQuery  
Finally, eventually,  check against the data in BigQuery.  
  
TO TEST:  
First, I will load some of the newest data into BigQuery already. This is a 1-time event to set up tables, and infer schemas and table names. I loaded **only Tehama** so that when we test loading in new data, we have an example of some data that should be skipped over and some that should be written.


FIRST, load in Tehama's data

In [18]:
# Construct a BigQuery client object.
client = bigquery.Client()

In [19]:
# Get data from GCS - RR020 from 2023

rr20_service =  load_excel_data(f"gs://{bucket_name}/{latest_file}",sheetname="Service Data")
rr20_exp_by_mode = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Expenses By Mode")
rr20_rev_by_mode = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Revenues By Mode")
rr20_fin = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Financials - 2")
rr20_safety = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Safety")
rr20_other = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Other Resources")
rr20_contactinfo = load_excel_data(f"gs://{bucket_name}/{latest_file}", sheetname="Basics.Contacts")
rr20_modes = load_excel_data(f"gs://{bucket_name}/{latest_file}",sheetname="Modes")

In [20]:
# Cut down dataframes to only Tehama
rr20_service = rr20_service[rr20_service['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_exp_by_mode = rr20_exp_by_mode[rr20_exp_by_mode['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_rev_by_mode = rr20_rev_by_mode[rr20_rev_by_mode['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_fin = rr20_fin[rr20_fin['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_safety = rr20_safety[rr20_safety['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_other = rr20_other[rr20_other['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_contactinfo = rr20_contactinfo[rr20_contactinfo['Organization Legal Name']== 'Tehama County Transit Agency']
rr20_modes = rr20_modes[rr20_modes['Organization Legal Name']== 'Tehama County Transit Agency']

In [21]:
# Add column for date uploaded
# upload_d = datetime.datetime.now().date() #.strftime('%Y-%m-%d')
upload_d = pd.to_datetime(datetime.datetime(2023,9,29))
rr20_service['date_uploaded'] = pd.to_datetime(upload_d)
rr20_exp_by_mode['date_uploaded'] = pd.to_datetime(upload_d)
rr20_rev_by_mode['date_uploaded'] = pd.to_datetime(upload_d)
rr20_fin['date_uploaded'] = pd.to_datetime(upload_d)
rr20_safety['date_uploaded'] = pd.to_datetime(upload_d)
rr20_other['date_uploaded'] = pd.to_datetime(upload_d)
rr20_contactinfo['date_uploaded'] = pd.to_datetime(upload_d)
rr20_modes['date_uploaded'] = pd.to_datetime(upload_d)

In [22]:
rr20_modes.head(3)

Unnamed: 0,Organization Legal Name,Common Name/Acronym/DBA,Fiscal Year,Mode,Type of Service,Commitment Date,Start Date,End Date,date_uploaded
77,Tehama County Transit Agency,TRAX,2023,Bus (MB) (Fixed Route),PT - Purchased Transportation,NaT,NaT,NaT,2023-09-29
78,Tehama County Transit Agency,TRAX,2023,Demand Response (DR),PT - Purchased Transportation,NaT,NaT,NaT,2023-09-29


In [23]:
rr20_modes.dtypes

Organization Legal Name            object
Common Name/Acronym/DBA            object
Fiscal Year                         int64
Mode                               object
Type of Service                    object
Commitment Date            datetime64[ns]
Start Date                 datetime64[ns]
End Date                   datetime64[ns]
date_uploaded              datetime64[ns]
dtype: object

In [28]:
rr20_modes.columns = (rr20_modes.columns.str.replace(' ', '_').str.replace('/', '_')
                 .str.replace('.', '_').str.replace('-', '')
                 .str.replace('#', 'num')
                 .str.replace('\W+', ''))

  rr20_modes.columns = (rr20_modes.columns.str.replace(' ', '_').str.replace('/', '_')
  rr20_modes.columns = (rr20_modes.columns.str.replace(' ', '_').str.replace('/', '_')


In [29]:
rr20_modes.columns

Index(['Organization_Legal_Name', 'Common_Name_Acronym_DBA', 'Fiscal_Year',
       'Mode', 'Type_of_Service', 'Commitment_Date', 'Start_Date', 'End_Date',
       'date_uploaded'],
      dtype='object')

In [203]:
rr20_service_tehama = rr20_service.copy()

### Load and create 2023 RR-20 tables in BigQuery 
This code is reused with slight modifications from `ETL_Load_data_GCS_to_BQ.ipynb`  
  
**IF YOU RUN IT, IT WILL ERROR OUT.**
It was run once to initially create the 2023 raw tabels in BigQuery. That is its sole purpose, and we don't need it anymore. To see how it was developed, see the notebook `ETL_Load_data_GCS_to_BQ.ipynb`  

In [437]:
# Tables to create and load - we need the df and a string name for the next loop.
# We carefully make the keys (which will be the BQ table names) exactly the same thing as the Excel worksheets 
# except in lowercase and replacing spaces, "/", "." and "-" with _ 
dfdict = {"rr20_modes": rr20_modes,
          "rr20_service_data": rr20_service, 
          "rr20_expenses_by_mode": rr20_exp_by_mode, 
          "rr20_revenues_by_mode": rr20_rev_by_mode,
          "rr20_financials__2": rr20_fin,
          "rr20_safety": rr20_safety, 
          "rr20_other_resources": rr20_other, 
          "rr20_basics_contacts": rr20_contactinfo}
          
for k,v in dfdict.items():
    table_id = f"cal-itp-data-infra.blackcat_raw.{this_year}_{k}" # Set table_id 

    # Remove spaces and slashes from col names - - they are illegal in BQ
    v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
                 .str.replace('.', '_').str.replace('-', '')
                 .str.replace('#', 'num')
                 .str.replace('\W+', '') #other things, just strip out
                )
    columns = v.columns.values
    
    # Make dict of colname: BQ type
    schema_dict = {}
    for x in columns:
        if v[x].dtypes == 'float64':
            schema_dict[x] = "FLOAT64"
        elif v[x].dtypes == 'int64':
            schema_dict[x] = "INT64"
        elif v[x].dtypes == 'object':
            schema_dict[x] =  "STRING"
        elif v[x].dtypes == 'datetime64[ns]':
            schema_dict[x] =  "DATETIME"
    
    schema = []
    for k2, v2 in schema_dict.items():
        schema.append(bigquery.SchemaField(k2, v2)) 
    
    table = bigquery.Table(table_id, schema=schema)
    table = client.create_table(table) # API request to create table
    print(f"Created table {table.project}.{table.dataset_id}.{table_id}")

    #https://cloud.google.com/bigquery/docs/samples/bigquery-load-table-dataframe
    job_service = client.load_table_from_dataframe(
        v, table_id)  
    job_service.result()  # Wait for the job to complete.
    table = client.get_table(table_id) # API request to load data

    print(f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id}")

  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_modes
Loaded 2 rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_service_data
Loaded 2 rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode
Loaded 4 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode
Loaded 8 rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2
Loaded 2 rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_safety
Loaded 0 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources
Loaded 1 rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources


  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')
  v.columns = (v.columns.str.replace(' ', '_').str.replace('/', '_')


Created table cal-itp-data-infra.blackcat_raw.cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts
Loaded 1 rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts


## Now compare raw data in GCS Bucket to data in BigQuery
Check for and skip over what is there, write anything new. To verify code will work for every table, we cycle through each worksheet 1 by 1 in the code below. Meaning I switch out the sheetname in `sheet = "Other Resources"` and in the `load_excel_data` step:  
```['Basics.Contacts',
 'Modes',
 'Expenses By Mode',
 'Revenues By Mode',
 'Financials - 2',
 'Service Data',
 'Safety',
 'Other Resources']```

In [119]:
bq_form_ref = form_to_check.replace("-","").lower()
bq_form_ref

'rr20'

In [379]:
sheet = "Other Resources"

bq_sheet_ref = sheet.replace(" ", "_").replace("/", "_").replace(".", "_").replace("-", "").replace('\W+', '').lower()
bq_sheet_ref

'other_resources'

In [380]:
# Set table_id to the ID of the table to check.
#----------- Service
table_id = f"cal-itp-data-infra.blackcat_raw.{this_year}_{bq_file_ref}_{bq_sheet_ref}"
table_id


'cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources'

In [418]:
org_to_check = 'City of Arvin' # 'Tehama County Transit Agency' 
sql = f"""SELECT * from blackcat_raw.{this_year}_{bq_file_ref}_{bq_sheet_ref}
        WHERE Organization_Legal_Name = '{org_to_check}'
        
         """
print(sql)
bq = client.query(sql).to_dataframe()
bq

SELECT * from blackcat_raw.2023_rr20_other_resources
        WHERE Organization_Legal_Name = 'City of Arvin'
        
         


Unnamed: 0,Organization_Legal_Name,Common_Name_Acronym_DBA,Fiscal_Year,Number_of_Volunteer_Drivers,Number_of_Personal_Vehicles_in_Service,date_uploaded
0,City of Arvin,,2023,0,0,2023-10-02


In [402]:
# bq = bq.drop_duplicates()
# print(np.isnan(bq['Common_Name_Acronym_DBA'][0]))
bq

Unnamed: 0,Organization_Legal_Name,Common_Name_Acronym_DBA,Fiscal_Year,Number_of_Volunteer_Drivers,Number_of_Personal_Vehicles_in_Service,date_uploaded
0,City of Arvin,,2023,0,0,2023-09-29


In [403]:
raw_data = load_excel_data(f"gs://{bucket_name}/{latest_file}",sheetname="Other Resources")
raw_data = raw_data[raw_data['Organization Legal Name']== org_to_check]
upload_d = datetime.datetime.now().date() #.strftime('%Y-%m-%d')
raw_data['date_uploaded'] = pd.to_datetime(upload_d) # pd.to_datetime(datetime.datetime(2023,9,29))

raw_data

Unnamed: 0,Organization Legal Name,Common Name/Acronym/DBA,Fiscal Year,Number of Volunteer Drivers,Number of Personal Vehicles in Service,date_uploaded
0,City of Arvin,,2023,0,0,2023-10-02


In [404]:
raw_data.rename(columns=lambda x: x.strip(), inplace=True) #first remove trailing/leading whitespace
raw_data.columns = (raw_data.columns.str.replace(' ', '_') #other whitespaces are replaced with an underscore
                                    .str.replace('/', '_').str.replace('.', '_')
                                     .str.replace('-', '')
                                    .str.replace('#', 'num')
                                    .str.replace('\W+', '') #other things, just strip out
                                    )
raw_data

  raw_data.columns = (raw_data.columns.str.replace(' ', '_') #other whitespaces are replaced with an underscore
  raw_data.columns = (raw_data.columns.str.replace(' ', '_') #other whitespaces are replaced with an underscore


Unnamed: 0,Organization_Legal_Name,Common_Name_Acronym_DBA,Fiscal_Year,Number_of_Volunteer_Drivers,Number_of_Personal_Vehicles_in_Service,date_uploaded
0,City of Arvin,,2023,0,0,2023-10-02


**Test different ways of checking for datasets being identical**

In [419]:
bq_compare = bq.drop(['date_uploaded'], axis=1)
bq_compare.sort_values(by=bq_compare.columns.tolist()).reset_index(drop=True)


Unnamed: 0,Organization_Legal_Name,Common_Name_Acronym_DBA,Fiscal_Year,Number_of_Volunteer_Drivers,Number_of_Personal_Vehicles_in_Service
0,City of Arvin,,2023,0,0


In [405]:
# THIS WORKS IF WE:
# use check_dtype=False, otherwise it does not work (error is Attribute "dtype" are different [left]: Int64 [right]: int64)
# drop indexes and reset, otherwise if index #s are different will say dataframes differ (irrelevant here)
# resort columns because if rows are in different order, will error
try:
    pd.testing.assert_frame_equal(bq.sort_values(by=bq.columns.tolist())
                                  .reset_index(drop=True), 
                              raw_data.sort_values(by=raw_data.columns.tolist())
                                  .reset_index(drop=True), 
                              check_dtype=False)
    print("data equals....etc. message")
except Exception as ex:
    print(ex)

'date_uploaded'


In [351]:
raw_data.equals(bq)

False

In [353]:
## this one ignores the index and just checks the values
# BUT 'None and 'NaN' are not equal so its not great for checking nulls in numerical and string fields.
import numpy as np

np.array_equal(data.values,rr20_modes.values) #, equal_nan=True
# np.allclose(data,rr20_modes, equal_nan=True, rtol=0, atol=0)

False

**Now sort through upload dates and grab the latest one**

TO DO: sift through Tehama in BQ, grab only the latest date set. Need RunID?

In [238]:
upload_dates = data_tehama['date_uploaded'].unique()
upload_dates

array(['2023-09-29T00:00:00.000000000'], dtype='datetime64[ns]')

In [247]:
# upload_dates.sort()
# upload_dates

test = [pd.to_datetime(datetime.datetime(2023, 9, 21)), 
        pd.to_datetime(datetime.datetime(2023, 9, 20)), 
        pd.to_datetime(datetime.datetime(2023, 10, 1))]
test.sort()
max(test)

Timestamp('2023-10-01 00:00:00')

In [248]:
data_tehama[data_tehama['date_uploaded']==max(upload_dates)]

Unnamed: 0,Organization_Legal_Name,Common_Name_Acronym_DBA,Fiscal_Year,Number_of_Volunteer_Drivers,Number_of_Personal_Vehicles_in_Service,date_uploaded
1,Tehama County Transit Agency,TRAX,2023,0,0,2023-09-29


### Now we have the raw data already in BigQuery, for 1 subrecipient

In [122]:
bucket_name = "calitp-ntd-report-validation"
our_sheets

['Basics.Contacts',
 'Modes',
 'Expenses By Mode',
 'Revenues By Mode',
 'Financials - 2',
 'Service Data',
 'Safety',
 'Other Resources']

In [161]:
# Checking that we can automatically get matching table names from the worksheets:
for sheet in our_sheets:
    bq_sheet_ref = sheet.replace(" ", "_").replace("/", "_").replace(".", "_").replace("-", "").replace('\W+', '').lower()
    print(bq_sheet_ref)


basics_contacts
modes
expenses_by_mode
revenues_by_mode
financials__2
service_data
safety
other_resources


**Now adding in a logger.** Much better than using print statements to show progress during a job

In [430]:
def write_to_log(logfilename):
    '''
    Creates a logger object that outputs to a log file, to the filename specified,
    and also streams to console.
    '''
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s',
                                  datefmt='%y-%m-%d %H:%M:%S')
    file_handler = logging.FileHandler(logfilename)
    file_handler.setFormatter(formatter)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)

    if not logger.hasHandlers():
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)

    return logger

In [432]:
import logging 

# Set up the logger object
logger = write_to_log('load_raw_data_output.log')

In [440]:
### FINAL FUNCTION - takes the above code snippets and rolls it up into one function.
# If I put in org for an arg, will have to loop this through the orgs in the raw data. 
# But that keeps this function simpler so that's how it is written

def compare_datasets(form_to_check, latest_file, logger, org):
    form_to_sheets_dict = {
        "RR-20": ['Basics.Contacts', 'Modes', 'Expenses By Mode', 'Revenues By Mode', 'Financials - 2', 'Service Data', 'Safety', 'Other Resources'],
        "A-30": ['A-30 (Rural) RVI']
    }
    
    excelsheets = form_to_sheets_dict.get(form_to_check) #get Excel sheetnames depending on form
    bq_form_ref = form_to_check.replace("-","").lower() #this is something like "rr20" for the "RR-20" form_to_check
    
    # Load incoming data, worksheet by worksheet
    for sheet in excelsheets:
        bq_sheet_ref = sheet.replace(" ", "_").replace("/", "_").replace(".", "_").replace("-", "").replace('\W+', '').lower()
        logger.info(f"Checking data for {org} from {bq_file_ref}_{bq_sheet_ref}")
        incoming_df = load_excel_data(f"gs://{bucket_name}/{latest_file}",sheetname=sheet)
        incoming_org_data = incoming_df[incoming_df['Organization Legal Name']== org].copy()
        
        
        # Now we have only 1 org's data. Remove spaces and slashes from col names - they are illegal in BQ
        incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_', regex=True)
                                    .str.replace('/', '_').str.replace('.', '_', regex=True)
                                     .str.replace('-', '', regex=True)
                                     .str.replace('#', 'num', regex=True)
                                    .str.replace('\W+', '', regex=True) #other things, just strip out
                                    )
        
        # Check what data is already in BQ
        existing_data_query = f"""SELECT * from blackcat_raw.{this_year}_{bq_form_ref}_{bq_sheet_ref}
            WHERE Organization_Legal_Name = '{org}'"""
        client = bigquery.Client()
        bq_data = client.query(existing_data_query).to_dataframe()
        bq_data = bq_data.drop_duplicates()
                
        logger.info(f"Found {len(bq_data)} rows in {bq_form_ref}_{bq_sheet_ref} for {org}")
        
        table_id = f"cal-itp-data-infra.blackcat_raw.{this_year}_{bq_form_ref}_{bq_sheet_ref}"
        table = bigquery.Table(table_id) 
        job_config = bigquery.LoadJobConfig(
            create_disposition="CREATE_IF_NEEDED",
            write_disposition="WRITE_APPEND"
        )
                
        if (len(bq_data) > 0) and (len(incoming_org_data) > 0 ):
            # Get the data with the latest upload date only - because this table serves as running storage for every report submittal.
            upload_dates = bq_data['date_uploaded'].unique()
            upload_dates.sort()
            bq_data = bq_data[bq_data['date_uploaded']==max(upload_dates)]
            bq_compare = bq_data.drop(['date_uploaded'], axis=1)

            try:
                logger.info("Checking for existing data")
                pd.testing.assert_frame_equal(bq_compare.sort_values(by=bq_compare.columns.tolist())
                                              .reset_index(drop=True), 
                                          incoming_org_data.sort_values(by=incoming_org_data.columns.tolist())
                                              .reset_index(drop=True), 
                                          check_dtype=False)
                logger.info(f"{org} data in {bq_form_ref}_{bq_sheet_ref} is already in BigQuery, not writing.")
                pass
            except Exception as ex:
                logger.info(f"Data tables are not the same, with {type(ex).__name__}: {ex}.")

                incoming_org_data.loc[:, 'date_uploaded'] = pd.to_datetime(datetime.datetime.now().date()) # Add in 'date_uploaded' column 
                job_service = client.load_table_from_dataframe(incoming_org_data, table_id, job_config=job_config)  # API request to load data
                job_service.result()  # Wait for the job to complete.
                table = client.get_table(table_id) 
        elif len(incoming_org_data) == 0:
            logger.info(f"No incoming data for {table_id}, skipping.")
            pass
        else:
            logger.info(f"Did not find existing data in {table_id} for {org}, loading new raw data.")
            incoming_org_data.loc[:, 'date_uploaded'] = pd.to_datetime(datetime.datetime.now().date()) # Add in 'date_uploaded' column 
            job_service = client.load_table_from_dataframe(incoming_org_data, table_id, job_config=job_config)  # API request to load data
            job_service.result()  # Wait for the job to complete.
            table = client.get_table(table_id)     
        
        logger.info(f"Loaded {table.num_rows} rows and {len(table.schema)} columns to {table_id} for {org}")
        
        

In [429]:
# First I ran only Tehama's data, if the function is working as it should, nothing should be written to BigQuery
# since it's already there.
compare_datasets("RR-20", latest_file, 'Tehama County Transit Agency')

Checking data for Tehama County Transit Agency from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 2 rows in rr20_basics_contacts for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_basics_contacts is already in BigQuery, not writing.
Loaded None rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 4 rows in rr20_modes for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_modes is already in BigQuery, not writing.
Loaded None rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 8 rows in rr20_expenses_by_mode for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_expenses_by_mode is already in BigQuery, not writing.
Loaded None rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 16 rows in rr20_revenues_by_mode for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_revenues_by_mode is already in BigQuery, not writing.
Loaded None rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 4 rows in rr20_financials__2 for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_financials__2 is already in BigQuery, not writing.
Loaded None rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 4 rows in rr20_service_data for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_service_data is already in BigQuery, not writing.
Loaded None rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 0 rows in rr20_safety for Tehama County Transit Agency
Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Tehama County Transit Agency, loading new raw data.
Loaded 1 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Tehama County Transit Agency
Checking data for Tehama County Transit Agency from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')


Found 2 rows in rr20_other_resources for Tehama County Transit Agency
Testing for equality.
Tehama County Transit Agency data in rr20_other_resources is already in BigQuery, not writing.
Loaded None rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for Tehama County Transit Agency


In [439]:
# Now, run through all orgs in the latest raw data file. All are written, except for Tehama's data, which is skipped
# Works as it should

for org in orgs_in_file:
    if org in orgs_to_submit:
        compare_datasets("RR-20", latest_file, logger, org)

Checking data for City of Arvin from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:08:INFO: Found 0 rows in rr20_basics_contacts for City of Arvin
23-10-02 14:34:08:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for City of Arvin, loading new raw data.
23-10-02 14:34:12:INFO: Loaded 2 rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for City of Arvin


Checking data for City of Arvin from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:14:INFO: Found 0 rows in rr20_modes for City of Arvin
23-10-02 14:34:14:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_modes for City of Arvin, loading new raw data.
23-10-02 14:34:17:INFO: Loaded 4 rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for City of Arvin


Checking data for City of Arvin from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:19:INFO: Found 0 rows in rr20_expenses_by_mode for City of Arvin
23-10-02 14:34:19:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for City of Arvin, loading new raw data.
23-10-02 14:34:21:INFO: Loaded 8 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for City of Arvin


Checking data for City of Arvin from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:24:INFO: Found 0 rows in rr20_revenues_by_mode for City of Arvin
23-10-02 14:34:24:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for City of Arvin, loading new raw data.
23-10-02 14:34:27:INFO: Loaded 16 rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for City of Arvin


Checking data for City of Arvin from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:29:INFO: Found 0 rows in rr20_financials__2 for City of Arvin
23-10-02 14:34:29:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for City of Arvin, loading new raw data.
23-10-02 14:34:32:INFO: Loaded 4 rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for City of Arvin


Checking data for City of Arvin from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:34:INFO: Found 0 rows in rr20_service_data for City of Arvin
23-10-02 14:34:34:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for City of Arvin, loading new raw data.
23-10-02 14:34:37:INFO: Loaded 4 rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for City of Arvin


Checking data for City of Arvin from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:39:INFO: Found 0 rows in rr20_safety for City of Arvin
23-10-02 14:34:39:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for City of Arvin, loading new raw data.
23-10-02 14:34:46:INFO: Loaded 0 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for City of Arvin


Checking data for City of Arvin from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:48:INFO: Found 0 rows in rr20_other_resources for City of Arvin
23-10-02 14:34:48:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for City of Arvin, loading new raw data.
23-10-02 14:34:51:INFO: Loaded 2 rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for City of Arvin


Checking data for City of Corcoran - Corcoran Area Transit from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:53:INFO: Found 0 rows in rr20_basics_contacts for City of Corcoran - Corcoran Area Transit
23-10-02 14:34:53:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:34:56:INFO: Loaded 3 rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:34:58:INFO: Found 0 rows in rr20_modes for City of Corcoran - Corcoran Area Transit
23-10-02 14:34:58:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_modes for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:01:INFO: Loaded 5 rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:03:INFO: Found 0 rows in rr20_expenses_by_mode for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:03:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:06:INFO: Loaded 10 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:08:INFO: Found 0 rows in rr20_revenues_by_mode for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:08:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:11:INFO: Loaded 20 rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:13:INFO: Found 0 rows in rr20_financials__2 for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:13:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:17:INFO: Loaded 6 rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:20:INFO: Found 0 rows in rr20_service_data for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:20:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:25:INFO: Loaded 5 rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:28:INFO: Found 0 rows in rr20_safety for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:28:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:30:INFO: Loaded 1 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for City of Corcoran - Corcoran Area Transit


Checking data for City of Corcoran - Corcoran Area Transit from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:33:INFO: Found 0 rows in rr20_other_resources for City of Corcoran - Corcoran Area Transit
23-10-02 14:35:33:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for City of Corcoran - Corcoran Area Transit, loading new raw data.
23-10-02 14:35:36:INFO: Loaded 3 rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for City of Corcoran - Corcoran Area Transit


Checking data for County of Shasta Department of Public Works from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:39:INFO: Found 0 rows in rr20_basics_contacts for County of Shasta Department of Public Works
23-10-02 14:35:39:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:35:41:INFO: Loaded 4 rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:43:INFO: Found 0 rows in rr20_modes for County of Shasta Department of Public Works
23-10-02 14:35:43:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_modes for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:35:46:INFO: Loaded 7 rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:48:INFO: Found 0 rows in rr20_expenses_by_mode for County of Shasta Department of Public Works
23-10-02 14:35:48:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:35:51:INFO: Loaded 14 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:53:INFO: Found 0 rows in rr20_revenues_by_mode for County of Shasta Department of Public Works
23-10-02 14:35:53:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:35:57:INFO: Loaded 28 rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:35:59:INFO: Found 0 rows in rr20_financials__2 for County of Shasta Department of Public Works
23-10-02 14:35:59:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:36:01:INFO: Loaded 8 rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:03:INFO: Found 0 rows in rr20_service_data for County of Shasta Department of Public Works
23-10-02 14:36:03:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:36:07:INFO: Loaded 7 rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:09:INFO: Found 0 rows in rr20_safety for County of Shasta Department of Public Works
23-10-02 14:36:09:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:36:12:INFO: Loaded 1 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for County of Shasta Department of Public Works


Checking data for County of Shasta Department of Public Works from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:14:INFO: Found 0 rows in rr20_other_resources for County of Shasta Department of Public Works
23-10-02 14:36:14:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for County of Shasta Department of Public Works, loading new raw data.
23-10-02 14:36:17:INFO: Loaded 4 rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for County of Shasta Department of Public Works


Checking data for Plumas County Transportation Commission from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:19:INFO: Found 0 rows in rr20_basics_contacts for Plumas County Transportation Commission
23-10-02 14:36:19:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:24:INFO: Loaded 5 rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:26:INFO: Found 0 rows in rr20_modes for Plumas County Transportation Commission
23-10-02 14:36:26:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_modes for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:34:INFO: Loaded 8 rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:37:INFO: Found 0 rows in rr20_expenses_by_mode for Plumas County Transportation Commission
23-10-02 14:36:37:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:39:INFO: Loaded 16 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:41:INFO: Found 0 rows in rr20_revenues_by_mode for Plumas County Transportation Commission
23-10-02 14:36:41:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:45:INFO: Loaded 32 rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:47:INFO: Found 0 rows in rr20_financials__2 for Plumas County Transportation Commission
23-10-02 14:36:47:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:50:INFO: Loaded 10 rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:52:INFO: Found 0 rows in rr20_service_data for Plumas County Transportation Commission
23-10-02 14:36:52:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:36:55:INFO: Loaded 8 rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:36:57:INFO: Found 0 rows in rr20_safety for Plumas County Transportation Commission
23-10-02 14:36:57:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:37:00:INFO: Loaded 1 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Plumas County Transportation Commission


Checking data for Plumas County Transportation Commission from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:02:INFO: Found 0 rows in rr20_other_resources for Plumas County Transportation Commission
23-10-02 14:37:02:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for Plumas County Transportation Commission, loading new raw data.
23-10-02 14:37:05:INFO: Loaded 5 rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for Plumas County Transportation Commission


Checking data for Tehama County Transit Agency from rr20_basics_contacts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:06:INFO: Found 1 rows in rr20_basics_contacts for Tehama County Transit Agency
23-10-02 14:37:06:INFO: Checking for existing data
23-10-02 14:37:06:INFO: Tehama County Transit Agency data in rr20_basics_contacts is already in BigQuery, not writing.
23-10-02 14:37:06:INFO: Loaded None rows and 15 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_basics_contacts for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_modes


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:08:INFO: Found 2 rows in rr20_modes for Tehama County Transit Agency
23-10-02 14:37:08:INFO: Checking for existing data
23-10-02 14:37:08:INFO: Tehama County Transit Agency data in rr20_modes is already in BigQuery, not writing.
23-10-02 14:37:08:INFO: Loaded None rows and 9 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_modes for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_expenses_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:10:INFO: Found 4 rows in rr20_expenses_by_mode for Tehama County Transit Agency
23-10-02 14:37:10:INFO: Checking for existing data
23-10-02 14:37:10:INFO: Tehama County Transit Agency data in rr20_expenses_by_mode is already in BigQuery, not writing.
23-10-02 14:37:10:INFO: Loaded None rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_expenses_by_mode for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_revenues_by_mode


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:12:INFO: Found 8 rows in rr20_revenues_by_mode for Tehama County Transit Agency
23-10-02 14:37:12:INFO: Checking for existing data
23-10-02 14:37:12:INFO: Tehama County Transit Agency data in rr20_revenues_by_mode is already in BigQuery, not writing.
23-10-02 14:37:12:INFO: Loaded None rows and 8 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_revenues_by_mode for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_financials__2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:15:INFO: Found 2 rows in rr20_financials__2 for Tehama County Transit Agency
23-10-02 14:37:15:INFO: Checking for existing data
23-10-02 14:37:15:INFO: Tehama County Transit Agency data in rr20_financials__2 is already in BigQuery, not writing.
23-10-02 14:37:15:INFO: Loaded None rows and 46 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_financials__2 for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_service_data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:17:INFO: Found 2 rows in rr20_service_data for Tehama County Transit Agency
23-10-02 14:37:17:INFO: Checking for existing data
23-10-02 14:37:17:INFO: Tehama County Transit Agency data in rr20_service_data is already in BigQuery, not writing.
23-10-02 14:37:17:INFO: Loaded None rows and 10 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_service_data for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_safety


  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:21:INFO: Found 0 rows in rr20_safety for Tehama County Transit Agency
23-10-02 14:37:21:INFO: Did not find existing data in cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Tehama County Transit Agency, loading new raw data.
23-10-02 14:37:24:INFO: Loaded 1 rows and 7 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_safety for Tehama County Transit Agency


Checking data for Tehama County Transit Agency from rr20_other_resources


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incoming_org_data['date_uploaded'] = pd.to_datetime(datetime.datetime.now().date())
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
  incoming_org_data.columns = (incoming_org_data.columns.str.replace(' ', '_')
23-10-02 14:37:28:INFO: Found 1 rows in rr20_other_resources for Tehama County Transit Agency
23-10-02 14:37:28:INFO: Checking for existing data
23-10-02 14:37:28:INFO: Tehama County Transit Agency data in rr20_other_resources is already in BigQuery, not writing.
23-10-02 14:37:28:INFO: Loaded None rows and 6 columns to cal-itp-data-infra.blackcat_raw.2023_rr20_other_resources for Tehama County Transit Agency


### BELOW IS THE FUNCTION DEVELOPMENT FOR THE NON-AIRFLOW WAY OF GETTING THE LATEST FILE

In [11]:
subdir = f"blackcat_ntd_reports_{this_year}_raw"

for file in list(bucket.list_blobs(prefix=subdir)):
    print(file.name)

In [7]:
# There will be an argument for a place to input the specific form that is being checked. 
# For testing here we specify it
form_to_check = "RR-20"
subdir = f"blackcat_ntd_reports_{this_year}_raw"
file_prefix = form_to_file_dict.get(form_to_check) #'NTD_Annual_Report_Rural_2023'
# print(file_prefix)

# Get the most recent file in the raw bucket
all_files=[]
for file in list(bucket.list_blobs(prefix=subdir)):
    if file_prefix in file.name:
        f = file.name.split('/')[1]
        fdate = re.search(r'(\d{4}-\d{2}-\d{2})', f).group()
        all_files.append(fdate)
#         print(all_files)
#         print(file.name)
    else:
        pass

all_files.sort(key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"), reverse=True)
# print(all_files)
latest_date = all_files[0]
latest_file = f"{subdir}/{file_prefix}_{latest_date}.xlsx"
    
        

In [8]:
latest_file

'blackcat_ntd_reports_2023_raw/NTD_Annual_Report_Rural_2023_2023-10-10.xlsx'

In [26]:
# Testing
all_files = ['2023-09-28', '2023-09-29']
all_files.sort(key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"), reverse=True)
all_files

['2023-09-29', '2023-09-28']

In [33]:
# Final function - get the most recent file in the Raw bucket

def get_latest_file(form_to_check, subdir):
    file_prefix = form_to_file_dict.get(form_to_check) 
    all_files=[]
    
    for file in list(bucket.list_blobs(prefix=subdir)):    
        if file_prefix in file.name:
            f = file.name.split('/')[1]
            fdate = re.search(r'(\d{4}-\d{2}-\d{2})', f).group()
            all_files.append(fdate)
            print(all_files)

        else:
            pass

        all_files.sort(key=lambda date: datetime.datetime.strptime(date, "%Y-%m-%d"), reverse=True)
        latest_date = all_files[0]
        latest_file = f"{subdir}/{file_prefix}_{latest_date}.xlsx"
        
    return latest_file
    