## BlackCat Organizations
5/19
* Most recent records
* Transit agencies that have fixed-route services (more than the vendors contracted to  provide dial-a-ride / paratransit services—I recall there being a bunch of those in the  BlackCat export I got last year).
* 5311, SGR then + Clovis

In [1]:
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
import siuba  # need this to do type hint in functions
from calitp_data_analysis.tables import tbls
from siuba import *
import fuzzywuzzy
from fuzzywuzzy import process


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
gcs_path = "gs://calitp-analytics-data/data-analyses/grant_misc/"

### Blackcat

In [4]:
blackcat = to_snakecase(pd.read_excel(f"{gcs_path}BlackCat_Grants_Projects_5_22_23.xlsx"))

In [5]:
blackcat.sample()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
105,2016,Section 5311,CA-2017-025 | 0017000158,2016,Tehama County Transit Agency,BCG0000191,Operating Assistance,300902,64BO17-00434,338998.0,338998.0,995070.08,1334068.08,338998.0,0.0,0.0,Open,,,


In [6]:
blackcat.shape

(3145, 20)

In [7]:
# Filter grant fiscal year
blackcat2 = blackcat[blackcat.grant_fiscal_year >= 2018].reset_index(drop = True)

In [8]:
blackcat2.shape, blackcat2.grant_fiscal_year.value_counts()

((2265, 20),
 2019    885
 2021    672
 2020    339
 2022    207
 2018    162
 Name: grant_fiscal_year, dtype: int64)

In [9]:
blackcat2.columns

Index(['grant_fiscal_year', 'funding_program', 'grant_number', 'project_year',
       'organization_name', 'upin', 'description', 'ali', 'contract_number',
       'allocationamount', 'grant_encumbered_amount',
       'local_encumbered_amount', 'total_encumbered_amount', 'expendedamount',
       'activebalance', 'closedoutbalance', 'project_status',
       'project_closed_by', 'project_closed_date', 'project_closed_time'],
      dtype='object')

In [10]:
grants_subset = ['5311(f) Cont',  'CMAQ (FTA 5311)',
       'Section 5311', '5311(f) Round 2',
       '5339 (State)', 'Section 5311(f)']

In [11]:
blackcat2 = blackcat2[blackcat2.funding_program.isin(grants_subset)]

In [12]:
# Subset to only organizations
organizations = (blackcat2[['organization_name', 'grant_fiscal_year', 'funding_program']]
                 .sort_values(by = ['organization_name', 'grant_fiscal_year'], ascending = [True, False])
                 .drop_duplicates(subset = ['organization_name','funding_program'])
                 .reset_index(drop = True)
                )

In [13]:
len(organizations)

177

In [14]:

def summarize_rows(df, col_to_group: str, col_to_summarize: str) -> pd.DataFrame:
    """
    Puts all the elements in the column "col to summarize" 
    onto one line and separates them by commas. 
    """
    df = (df
    .groupby(col_to_group)[col_to_summarize]
    .apply(','.join)
    .reset_index()
     )
    return df

In [15]:
organizations = summarize_rows(organizations, ['organization_name','grant_fiscal_year'], 'funding_program')

In [16]:
organizations = (organizations.sort_values(by = ['organization_name', 'grant_fiscal_year'], ascending = [True, False])
              .drop_duplicates(subset = ['organization_name']).reset_index(drop = True))

In [17]:
organizations.shape, organizations.organization_name.nunique()

((92, 3), 92)

In [18]:
# organizations

### State of Good Repair

In [19]:
sgr = to_snakecase(pd.read_excel(f"{gcs_path}SGR Calsmart-user-list request.xls"))

In [20]:
sgr_subset = ['first_name', 'last_name', 'email', 'phone', 'title', 'agency']

In [21]:
sgr2 = sgr[sgr_subset]

In [22]:
sgr2 = sgr2.drop_duplicates('agency').reset_index(drop = True)

In [23]:
len(sgr2), len(sgr)

(195, 474)

In [24]:
sgr2['funding_program'] = 'State of Good Repair'

In [25]:
sgr2.shape, sgr.agency.nunique()

((195, 7), 195)

### Merge BlackCat w/ SGR

In [26]:
def clean_punctuation(df, agency_col: str) -> pd.DataFrame:
    """
    Cleans up agency names. Assume anything after comma/()/
    ; are acronyms and delete them. Correct certain mispellings.
    Change agency names to title case. Clean whitespaces.
    """
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.replace("*","")
        .str.replace("Agency","")
        .str.strip() #strip whitespaces again after getting rid of certain things
    )
    return df


In [27]:
def flip_county_city(df, agency_col:str):
    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
    to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]
    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)
    to_correct['str_len'] = to_correct[agency_col].str.split().str.len()
    to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)
    to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)
    to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']
    
    new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))
    df['agency_corrected'] = df[agency_col].map(new_names_dictionary)
    df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])
    
    df = df.drop(columns = [agency_col])
    df = df.rename(columns = {"agency_corrected":agency_col})
    
    return df 

In [28]:
def clean_organization_names(df, agency_col:str):
    df = clean_punctuation(df, agency_col)
    df = flip_county_city(df, agency_col)
    return df

In [29]:
organizations = clean_organization_names(organizations, 'organization_name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [30]:
sgr2 = clean_organization_names(sgr2, "agency")

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [31]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio):
    # Get a list of unique strings
    strings = df[column].unique()

    # Get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_set_ratio
    )

    # Only get matches with a  min ratio
    close_matches = [matches[0] for matches in matches if matches[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [32]:
def find_fuzzy_match(df1, df2, df1_fuzzy_column:str, df2_fuzzy_column:str, new_column:str, min_ratio:int):
    unique_values = df1[df1_fuzzy_column].unique().tolist()
    for i in unique_values:
        replace_matches_set_ratio(df2, df2_fuzzy_column, new_column, i, min_ratio)
    return df2

In [33]:
organizations = find_fuzzy_match(sgr, organizations,'agency',  'organization_name', 'fuzzy_match_agency', 95)

In [34]:
organizations.fuzzy_match_agency = organizations.fuzzy_match_agency.fillna('organization_name')

In [35]:
# organizations = organizations.drop(columns = ['organization_name']).rename(columns = {'fuzzy_match_agency':'organization_name'})

In [36]:
m1 = pd.merge(organizations, sgr2, left_on = ['fuzzy_match_agency'], right_on = ['agency'], how = 'outer', indicator = True)

In [37]:
m1._merge.value_counts()

right_only    151
left_only      47
both           45
Name: _merge, dtype: int64

In [38]:
m1.funding_program_x = m1.funding_program_x.fillna('')

In [39]:
m1.funding_program_y = m1.funding_program_y.fillna(m1.funding_program_x)

In [40]:
m1['funding_program'] = m1.funding_program_x + ',' + m1.funding_program_y

In [41]:
m1.funding_program = m1.funding_program.fillna(m1.funding_program_y)

In [42]:
m1.organization_name = m1.organization_name.fillna(m1.agency)

In [43]:
cols_to_drop = ['funding_program_x','funding_program_y', 'fuzzy_match_agency','agency','_merge','grant_fiscal_year']

In [44]:
m1 = m1.drop(columns = cols_to_drop)

In [89]:
m1.organization_name.nunique(), m1.shape

(210, (243, 8))

In [90]:
m1.columns

Index(['organization_name', 'first_name', 'last_name', 'email', 'phone',
       'title', 'funding_program', 'fuzzy_agency'],
      dtype='object')

In [94]:
m1_test = (summarize_rows(m1, ['organization_name', 'first_name', 'last_name', 'email', 'phone',
       'title'], 'funding_program'))

### Airtable
* Grab only fixed route providers.

In [45]:
airtable = (tbls.external_airtable.california_transit__services() >> collect () )

In [46]:
# airtable.columns

In [47]:
# ['name','service_type', 'service_operator_type', 'operator_organization_type']
airtable_subset = ['name','service_type']

In [48]:
airtable2 = airtable[airtable_subset]

In [49]:
airtable2.name.nunique()

994

In [50]:
airtable2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291093 entries, 0 to 291092
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          291092 non-null  object
 1   service_type  291093 non-null  object
dtypes: object(2)
memory usage: 4.4+ MB


In [51]:
airtable2.sort_values(['name']).head()

Unnamed: 0,name,service_type
3483,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
2637,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
7712,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
106213,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
11119,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"


In [52]:
airtable3 = airtable2.explode('service_type').reset_index(drop = True)

In [53]:
airtable3.head()

Unnamed: 0,name,service_type
0,Topanga Beach Bus,fixed-route
1,St Pauls PACE,NEMT
2,Dodge Ridge Ski Bus,deviated fixed-route
3,Dodge Ridge Ski Bus,reservations
4,SacRT GO,ADA paratransit


In [54]:
airtable3.service_type = airtable3.service_type.fillna('NA')

In [55]:
# fixed route only
airtable4 = airtable3[airtable3.service_type.str.lower().str.contains('fixed')]

In [56]:
len(airtable4)

136457

In [57]:
airtable5 = airtable4.drop_duplicates(['name']).sort_values(by = ['name']).reset_index(drop = True)

In [58]:
len(airtable5)

429

### Merge

In [59]:
airtable5 = clean_organization_names(airtable5, 'name')

  df[agency_col]
  to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)


In [60]:
m1 = find_fuzzy_match(airtable5, m1, 'name', 'organization_name','fuzzy_agency', 95)

In [61]:
# Reverse -> replace 
"""
for i in airtable5.name.unique().tolist():
       replace_matches_set_ratio(
        organizations, "organization_name", "project_name_fuzzy", i, 95) """

'\nfor i in airtable5.name.unique().tolist():\n       replace_matches_set_ratio(\n        organizations, "organization_name", "project_name_fuzzy", i, 95) '

In [62]:
#Fuzzy matches that didn't work
# Use organization name
fuzzy_matches_to_filter = ['Eastern Contra Costa Transit Authority', 'Livermore Amador Valley Transit Authority','Calaveras Transit' ,'City Of Corcoran - Corcoran Area Transit',]

In [63]:
# These are the fuzzy matches that worked.
found_matches = (m1[((~m1.fuzzy_agency.isna()) & ~(m1.fuzzy_agency.isin(fuzzy_matches_to_filter)))]).reset_index(drop = True)

In [64]:
found_matches.shape

(56, 8)

In [65]:
still_need_matches = m1[~m1.organization_name.isin(found_matches.organization_name.tolist())]

* Calaveras Transit change to Calaveras Connect
* Arvin City  Arvin Transit
* Auburn City Auburn Transit
* County Of Los Angeles - Department Of Public Works, Los Angeles County Transit Services
* 34	County Of Sacramento Department Of Transportation Sacrt Bus
* County Of Shasta Department Of Public Works
* Dinuba City Dinuba Connection
* Lassen Transit Service Lassen Rural Bus
* Needles City  Needles Area Transit
* Nevada Public Works County Nevada County Connects
* Ojai City Ojai Trolley
* Palo Verde Valley Transit Palos Verdes Peninsula Transit Authority
* Placer County Public Works Placer County Transit
* Plumas County Transportation Commission Plumas Transit Systems
* Porterville City  Porterville Transit
* Ridgecrest City Ridgecrest Transit
* Rio Vista City Rio Vista Delta Breeze
* Santa Maria City Santa Maria Regional Transit
* Siskiyou County Siskiyou Transit And General Express
* Stanislaus County Public Works - Transit Division  Stanislaus Regional Transit Authority
* Taft City Taft Area Transit
* Tehama County Transit  Tehama Rural Area Express
* Transportation Trinity County Department Trinity Transit
* Transit Joint Powers Authority For Merced County Merced The Bus
* Visalia City Visilia Transit
* Yolo County Transportation District Yolobus


In [66]:
to_map = {'Calaveras Transit':'Calaveras Connect',
'Arvin City': 'Arvin Transit',
'Auburn City': 'Auburn Transit',
'County Of Los Angeles - Department Of Public Works': 'Los Angeles County Transit Services',
'County Of Sacramento Department Of Transportation' :'Sacrt Bus',
'Dinuba City': 'Dinuba Connection',
'Lassen Transit Service' :'Lassen Rural Bus',
'Needles City': 'Needles Area Transit',
'Nevada Public Works' :'County Nevada County Connects',
'Ojai City' :'Ojai Trolley',
'Palo Verde Valley Transit': 'Palos Verdes Peninsula Transit Authority',
'Placer County Public Works': 'Placer County Transit',
'Plumas County Transportation Commission' :'Plumas Transit Systems',
'Porterville City' :'Porterville Transit',
'Ridgecrest City': 'Ridgecrest Transit',
'Rio Vista City' :'Rio Vista Delta Breeze',
'Santa Maria City' :'Santa Maria Regional Transit',
'Siskiyou County': 'Siskiyou Transit And General Express',
'Stanislaus County Public Works - Transit Division' :'Stanislaus Regional Transit Authority',
'Taft City': 'Taft Area Transit',
'Tehama County Transit'  :'Tehama Rural Area Express',
'Transportation Trinity County Department' :'Trinity Transit',
'Transit Joint Powers Authority For Merced' :'County Merced The Bus',
'Visalia City': 'Visilia Transit',
'Yolo County Transportation District' :'Yolobus'}

In [67]:
still_need_matches.organization_name = still_need_matches.organization_name.replace(to_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  still_need_matches.organization_name = still_need_matches.organization_name.replace(to_map)


In [69]:
# Concat
blackcat_cleaned = pd.concat([found_matches, still_need_matches], axis=0)

In [71]:
len(m1)

243

In [72]:
len(blackcat_cleaned)

243

In [73]:
m2 = pd.merge(blackcat_cleaned, airtable5, how = 'left', left_on = 'organization_name', right_on = 'name')

In [74]:
m2.service_type = m2.service_type.fillna('no service info')

In [75]:
m2 = m2.drop(columns = ['name'])

In [76]:
m2.service_type.value_counts()

no service info         180
fixed-route              53
deviated fixed-route     11
Name: service_type, dtype: int64

In [77]:
m2 = m2.fillna('NA')

In [78]:
final_subset = ['funding_program', 'organization_name','first_name', 'last_name', 'email', 'phone', 'title', 
      'service_type']

In [79]:
m2 = m2[final_subset]

In [80]:
def delete_repeated_element(df, col:str):
    df[col] = (df[col].apply(lambda x: ", ".join(set([y.strip() for y in x.split(",")]))).str.strip())
    return df

In [82]:
m2.service_type =m2.service_type.str.title()

In [83]:
m2 = delete_repeated_element(m2, "funding_program")

In [85]:
m2 = m2.sort_values(['organization_name']).reset_index(drop = True)

In [86]:
m2

Unnamed: 0,funding_program,organization_name,first_name,last_name,email,phone,title,service_type
0,", State of Good Repair",Alameda-Contra Costa Transit District,Eve,Ng,grants@actransit.org,5108915405.0,Capital Planning and Grants Manager,No Service Info
1,"State of Good Repair, Section 5311",Alpine County Community Development,Ethan,Gray,egray@alpinecountyca.gov,5306942140.0,Community Development Deputy Director,No Service Info
2,", State of Good Repair",Alpine County Local Transportation Commission,Scott,Maas,smaas@citlink.net,5302600991.0,Transportation Program Manager,No Service Info
3,"State of Good Repair, Section 5311, 5339 (State)",Amador Transit,Jennifer,Yeamans,jyeamans@lavta.org,9254557561.0,Sr Grants & Management Specialist,Fixed-Route
4,", State of Good Repair",Amador Transit,Patricia,Amarant,maggie@amadortransit.com,2092675079.0,General Manager,Fixed-Route
5,", State of Good Repair",Antelope Valley Transit Authority,Judy,Fry,jfry@avta.com,6617292234.0,Chief Financial Officer,No Service Info
6,", State of Good Repair",Arcadia City,Jayme,Admin,supercali707@gmail.com,7076854324.0,Admin Tester,No Service Info
7,", State of Good Repair",Arcata City,Marcela,Jimenez,mjimenez@cityofarcata.org,5107349099.0,Engineering Aide,No Service Info
8,Section 5311,Arcata City,,,,,,No Service Info
9,", State of Good Repair",Arvin Transit,Jeff,Jones,jeffjones@arvin.org,6618543134.0,Finance Director,Fixed-Route
