## BlackCat Organizations
5/19
* Most recent records
* Transit agencies that have fixed-route services (more than the vendors contracted to  provide dial-a-ride / paratransit services—I recall there being a bunch of those in the  BlackCat export I got last year).
* 5311, SGR then + Clovis

In [33]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import siuba  # need this to do type hint in functions
from calitp_data_analysis.tables import tbls
from siuba import *


In [34]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [35]:
gcs_path = "gs://calitp-analytics-data/data-analyses/grant_misc/"

In [36]:
blackcat = to_snakecase(pd.read_excel(f"{gcs_path}BlackCat_Grants_Projects_5_22_23.xlsx"))

In [37]:
blackcat.sample()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
1111,2019,5310 Exp,CA-2020-244 | 0020000273-E,2019,The Center for Independent Living,BCG0002191,Mobility Management,117L00,64AM19-01237,60000.0,300000.0,0.0,300000.0,0.0,60000.0,0.0,Open,,,


In [38]:
blackcat.shape

(3145, 20)

In [39]:
# Filter grant fiscal year
blackcat2 = blackcat[blackcat.grant_fiscal_year >= 2018].reset_index(drop = True)

In [40]:
blackcat2.shape, blackcat2.grant_fiscal_year.value_counts()

((2265, 20),
 2019    885
 2021    672
 2020    339
 2022    207
 2018    162
 Name: grant_fiscal_year, dtype: int64)

In [116]:
blackcat2.columns

Index(['grant_fiscal_year', 'funding_program', 'grant_number', 'project_year',
       'organization_name', 'upin', 'description', 'ali', 'contract_number',
       'allocationamount', 'grant_encumbered_amount',
       'local_encumbered_amount', 'total_encumbered_amount', 'expendedamount',
       'activebalance', 'closedoutbalance', 'project_status',
       'project_closed_by', 'project_closed_date', 'project_closed_time'],
      dtype='object')

In [162]:
grants_subset = ['5311(f) Cont',  'CMAQ (FTA 5311)',
       'Section 5311', '5311(f) Round 2',
       '5339 (State)', 'Section 5311(f)']

In [163]:
blackcat2 = blackcat2[blackcat2.funding_program.isin(grants_subset)]

In [164]:
# Subset to only organizations
organizations = blackcat2[['organization_name']].sort_values(by = ['organization_name']).drop_duplicates().reset_index(drop = True)

In [165]:
len(organizations)

92

### Airtable
* Grab only fixed route providers.

In [50]:
airtable = (tbls.external_airtable.california_transit__services() >> collect () )

In [54]:
# airtable.columns

In [145]:
# ['name','service_type', 'service_operator_type', 'operator_organization_type']
airtable_subset = ['name','service_type']

In [146]:
airtable2 = airtable[airtable_subset]

In [147]:
airtable2.name.nunique()

994

In [148]:
airtable2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290450 entries, 0 to 290449
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          290449 non-null  object
 1   service_type  290450 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


In [149]:
airtable2.sort_values(['name']).head()

Unnamed: 0,name,service_type
14497,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
107385,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
9415,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
22961,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
185196,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"


In [150]:
airtable3 = airtable2.explode('service_type').reset_index(drop = True)

In [151]:
airtable3.head()

Unnamed: 0,name,service_type
0,Topanga Beach Bus,fixed-route
1,St Pauls PACE,NEMT
2,Dodge Ridge Ski Bus,deviated fixed-route
3,Dodge Ridge Ski Bus,reservations
4,SacRT GO,ADA paratransit


In [152]:
airtable3.service_type = airtable3.service_type.fillna('NA')

In [153]:
# fixed route only
airtable4 = airtable3[airtable3.service_type.str.lower().str.contains('fixed')]

In [154]:
len(airtable4)

136164

In [155]:
airtable5 = airtable4.drop_duplicates(['name']).sort_values(by = ['name']).reset_index(drop = True)

### Merge

In [156]:
def clean_punctuation(df, agency_col: str) -> pd.DataFrame:
    """
    Cleans up agency names. Assume anything after comma/()/
    ; are acronyms and delete them. Correct certain mispellings.
    Change agency names to title case. Clean whitespaces.
    """
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.replace("*","")
        .str.replace("Agency","")
        .str.strip() #strip whitespaces again after getting rid of certain things
    )
    return df


In [157]:
def flip_county_city(df, agency_col:str):
    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
    to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]
    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)
    to_correct['str_len'] = to_correct[agency_col].str.split().str.len()
    to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)
    to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)
    to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']
    
    new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))
    df['agency_corrected'] = df[agency_col].map(new_names_dictionary)
    df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])
    
    df = df.drop(columns = [agency_col])
    df = df.rename(columns = {"agency_corrected":agency_col})
    
    return df 

In [167]:
def clean_organization_names(df, agency_col:str):
    df = organization_cleaning(df, agency_col)
    df = flip_county_city(df, agency_col)
    return df

In [168]:
airtable5 = clean_organization_names(airtable5, 'name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


ValueError: Columns must be same length as key

In [169]:
organizations.columns

Index(['organization_name'], dtype='object')

In [170]:
organizations = clean_organization_names(organizations, 'organization_name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [171]:
m1 = pd.merge(organizations, airtable5, how = 'outer', left_on = 'organization_name', right_on = 'name', indicator = True)

In [172]:
m1._merge.value_counts()

right_only    414
left_only      77
both           15
Name: _merge, dtype: int64

In [173]:
m1

Unnamed: 0,organization_name,service_type,name,_merge
0,Alpine County Community Development,,,left_only
1,Amador Transit,fixed-route,Amador Transit,both
2,Butte County Association Of Governments Butte Regional Transit,,,left_only
3,Calaveras County Public Works,,,left_only
4,Calaveras Transit Agency,,,left_only
5,Arcata City,,,left_only
6,Arvin City,,,left_only
7,Auburn City,,,left_only
8,California City City,,,left_only
9,Chowchilla City,,,left_only
