## BlackCat Organizations
5/19
* Most recent records
* Transit agencies that have fixed-route services (more than the vendors contracted to  provide dial-a-ride / paratransit services—I recall there being a bunch of those in the  BlackCat export I got last year).
* 5311, SGR then + Clovis

In [2]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import siuba  # need this to do type hint in functions
from calitp_data_analysis.tables import tbls
from siuba import *
import fuzzywuzzy
from fuzzywuzzy import process


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
gcs_path = "gs://calitp-analytics-data/data-analyses/grant_misc/"

In [5]:
blackcat = to_snakecase(pd.read_excel(f"{gcs_path}BlackCat_Grants_Projects_5_22_23.xlsx"))

In [6]:
blackcat.sample()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
1031,2018,Section 5311,CA-2018-114 | 0019000012,2018,Yuba-Sutter Transit Authority,BCG0001250,Operating Assistance Sliding Scale,300902,64BO19-00828,200000.0,200000.0,161475.0,361475.0,200000.0,0.0,0.0,Open,,,


In [7]:
blackcat.shape

(3145, 20)

In [8]:
# Filter grant fiscal year
blackcat2 = blackcat[blackcat.grant_fiscal_year >= 2018].reset_index(drop = True)

In [9]:
blackcat2.shape, blackcat2.grant_fiscal_year.value_counts()

((2265, 20),
 2019    885
 2021    672
 2020    339
 2022    207
 2018    162
 Name: grant_fiscal_year, dtype: int64)

In [10]:
blackcat2.columns

Index(['grant_fiscal_year', 'funding_program', 'grant_number', 'project_year',
       'organization_name', 'upin', 'description', 'ali', 'contract_number',
       'allocationamount', 'grant_encumbered_amount',
       'local_encumbered_amount', 'total_encumbered_amount', 'expendedamount',
       'activebalance', 'closedoutbalance', 'project_status',
       'project_closed_by', 'project_closed_date', 'project_closed_time'],
      dtype='object')

In [11]:
grants_subset = ['5311(f) Cont',  'CMAQ (FTA 5311)',
       'Section 5311', '5311(f) Round 2',
       '5339 (State)', 'Section 5311(f)']

In [12]:
blackcat2 = blackcat2[blackcat2.funding_program.isin(grants_subset)]

In [13]:
# Subset to only organizations
organizations = blackcat2[['organization_name']].sort_values(by = ['organization_name']).drop_duplicates().reset_index(drop = True)

In [14]:
len(organizations)

92

### Airtable
* Grab only fixed route providers.

In [15]:
airtable = (tbls.external_airtable.california_transit__services() >> collect () )

In [16]:
# airtable.columns

In [17]:
# ['name','service_type', 'service_operator_type', 'operator_organization_type']
airtable_subset = ['name','service_type']

In [18]:
airtable2 = airtable[airtable_subset]

In [19]:
airtable2.name.nunique()

994

In [20]:
airtable2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290450 entries, 0 to 290449
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          290449 non-null  object
 1   service_type  290450 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


In [21]:
airtable2.sort_values(['name']).head()

Unnamed: 0,name,service_type
14497,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
107385,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
9415,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
22961,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
185196,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"


In [22]:
airtable3 = airtable2.explode('service_type').reset_index(drop = True)

In [23]:
airtable3.head()

Unnamed: 0,name,service_type
0,Topanga Beach Bus,fixed-route
1,St Pauls PACE,NEMT
2,Dodge Ridge Ski Bus,deviated fixed-route
3,Dodge Ridge Ski Bus,reservations
4,SacRT GO,ADA paratransit


In [24]:
airtable3.service_type = airtable3.service_type.fillna('NA')

In [25]:
# fixed route only
airtable4 = airtable3[airtable3.service_type.str.lower().str.contains('fixed')]

In [26]:
len(airtable4)

136164

In [27]:
airtable5 = airtable4.drop_duplicates(['name']).sort_values(by = ['name']).reset_index(drop = True)

### Merge

In [28]:
def clean_punctuation(df, agency_col: str) -> pd.DataFrame:
    """
    Cleans up agency names. Assume anything after comma/()/
    ; are acronyms and delete them. Correct certain mispellings.
    Change agency names to title case. Clean whitespaces.
    """
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.replace("*","")
        .str.replace("Agency","")
        .str.strip() #strip whitespaces again after getting rid of certain things
    )
    return df


In [29]:
def flip_county_city(df, agency_col:str):
    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
    to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]
    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)
    to_correct['str_len'] = to_correct[agency_col].str.split().str.len()
    to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)
    to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)
    to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']
    
    new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))
    df['agency_corrected'] = df[agency_col].map(new_names_dictionary)
    df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])
    
    df = df.drop(columns = [agency_col])
    df = df.rename(columns = {"agency_corrected":agency_col})
    
    return df 

In [30]:
def clean_organization_names(df, agency_col:str):
    df = clean_punctuation(df, agency_col)
    df = flip_county_city(df, agency_col)
    return df

In [31]:
airtable5 = clean_organization_names(airtable5, 'name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [32]:
organizations = clean_organization_names(organizations, 'organization_name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [33]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio):
    # Get a list of unique strings
    strings = df[column].unique()

    # Get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_set_ratio
    )

    # Only get matches with a  min ratio
    close_matches = [matches[0] for matches in matches if matches[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [42]:
def find_fuzzy_match(df1, df2, df1_fuzzy_column:str, df2_fuzzy_column:str, new_column:str, min_ratio:int):
    unique_values = df1[df1_fuzzy_column].unique().tolist()
    for i in unique_values:
        replace_matches_set_ratio(df2, df2_fuzzy_column, new_column, i, min_ratio)
    return df2

In [46]:
organizations = find_fuzzy_match(airtable5, organizations, 'name', 'organization_name','fuzzy_agency', 95)

In [88]:
# Reverse -> replace 
"""
for i in airtable5.name.unique().tolist():
       replace_matches_set_ratio(
        organizations, "organization_name", "project_name_fuzzy", i, 95) """

'\nfor i in airtable5.name.unique().tolist():\n       replace_matches_set_ratio(\n        organizations, "organization_name", "project_name_fuzzy", i, 95) '

In [89]:
#Fuzzy matches that didn't work
# Use organization name
fuzzy_matches_to_filter = ['Eastern Contra Costa Transit Authority', 'Livermore Amador Valley Transit Authority','Calaveras Transit' ,'City Of Corcoran - Corcoran Area Transit',]

In [90]:
# These are the fuzzy matches that worked.
found_matches = (organizations[((~organizations.fuzzy_agency.isna()) & ~(organizations.fuzzy_agency.isin(fuzzy_matches_to_filter)))]).reset_index(drop = True)

In [91]:
found_matches.shape

(53, 2)

In [92]:
still_need_matches = organizations[~organizations.organization_name.isin(found_matches.organization_name.tolist())][['organization_name']]

* Calaveras Transit change to Calaveras Connect
* Arvin City  Arvin Transit
* Auburn City Auburn Transit
* County Of Los Angeles - Department Of Public Works, Los Angeles County Transit Services
* 34	County Of Sacramento Department Of Transportation Sacrt Bus
* County Of Shasta Department Of Public Works
* Dinuba City Dinuba Connection
* Lassen Transit Service Lassen Rural Bus
* Needles City  Needles Area Transit
* Nevada Public Works County Nevada County Connects
* Ojai City Ojai Trolley
* Palo Verde Valley Transit Palos Verdes Peninsula Transit Authority
* Placer County Public Works Placer County Transit
* Plumas County Transportation Commission Plumas Transit Systems
* Porterville City  Porterville Transit
* Ridgecrest City Ridgecrest Transit
* Rio Vista City Rio Vista Delta Breeze
* Santa Maria City Santa Maria Regional Transit
* Siskiyou County Siskiyou Transit And General Express
* Stanislaus County Public Works - Transit Division  Stanislaus Regional Transit Authority
* Taft City Taft Area Transit
* Tehama County Transit  Tehama Rural Area Express
* Transportation Trinity County Department Trinity Transit
* Transit Joint Powers Authority For Merced County Merced The Bus
* Visalia City Visilia Transit
* Yolo County Transportation District Yolobus


In [93]:
to_map = {'Calaveras Transit':'Calaveras Connect',
'Arvin City': 'Arvin Transit',
'Auburn City': 'Auburn Transit',
'County Of Los Angeles - Department Of Public Works': 'Los Angeles County Transit Services',
'County Of Sacramento Department Of Transportation' :'Sacrt Bus',
'Dinuba City': 'Dinuba Connection',
'Lassen Transit Service' :'Lassen Rural Bus',
'Needles City': 'Needles Area Transit',
'Nevada Public Works' :'County Nevada County Connects',
'Ojai City' :'Ojai Trolley',
'Palo Verde Valley Transit': 'Palos Verdes Peninsula Transit Authority',
'Placer County Public Works': 'Placer County Transit',
'Plumas County Transportation Commission' :'Plumas Transit Systems',
'Porterville City' :'Porterville Transit',
'Ridgecrest City': 'Ridgecrest Transit',
'Rio Vista City' :'Rio Vista Delta Breeze',
'Santa Maria City' :'Santa Maria Regional Transit',
'Siskiyou County': 'Siskiyou Transit And General Express',
'Stanislaus County Public Works - Transit Division' :'Stanislaus Regional Transit Authority',
'Taft City': 'Taft Area Transit',
'Tehama County Transit'  :'Tehama Rural Area Express',
'Transportation Trinity County Department' :'Trinity Transit',
'Transit Joint Powers Authority For Merced' :'County Merced The Bus',
'Visalia City': 'Visilia Transit',
'Yolo County Transportation District' :'Yolobus'}

In [94]:
still_need_matches.organization_name = still_need_matches.organization_name.replace(to_map)

In [95]:
found_matches = found_matches.drop(columns = ['organization_name']).rename(columns = {'fuzzy_agency':'organization_name'}) 

In [97]:
# Concat
blackcat_cleaned = pd.concat([found_matches, still_need_matches], axis=0)

In [98]:
len(blackcat_cleaned)

92

In [99]:
m1 = pd.merge(blackcat_cleaned, airtable5, how = 'left', left_on = 'organization_name', right_on = 'name')

In [100]:
m1.service_type = m1.service_type.fillna('No Service Info')

In [101]:
m1 = m1.drop(columns = ['name'])

In [103]:
m1.service_type.value_counts()

fixed-route             56
No Service Info         26
deviated fixed-route    10
Name: service_type, dtype: int64