## Overlaps between organizations in TIRCP/Black Cat in MA

In [1]:
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 300
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from calitp import *

In [2]:
# import chardet
import fuzzywuzzy
from fuzzywuzzy import process



#### Load in TIRCP

In [32]:
# TIRCP spreadsheet (last updated November 3): read the "main" sheet.
# to_snakecase comes from calitp; presumably it snake-cases the column
# names (later cells use lowercase names such as grant_recipient).
tircp = to_snakecase(
    pd.read_excel(
        io="gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx",
        sheet_name="main",
    )
)

In [33]:
# Keep only the recipient and district columns, then reduce to one row per
# grant recipient (drop_duplicates keeps the first occurrence by default).
tircp2 = tircp[["grant_recipient", "district"]].drop_duplicates(
    subset="grant_recipient"
)

In [60]:
# Display how many distinct grant recipients remain after de-duplication
f"{len(tircp2.index)} grant recipients"

'49 grant recipients'

In [35]:
# Grant recipient names as a plain Python list; the fuzzy-matching loop
# further down iterates over it.
tircp_grant_recipient_list = tircp2["grant_recipient"].tolist()

#### Load in Black Cat

In [66]:
# Read the Black Cat grant-projects export (first sheet, the read_excel
# default) and pass it through to_snakecase like the TIRCP data.
blackcat = to_snakecase(
    pd.read_excel(
        io="gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx"
    )
)

In [67]:
# Funding programs whose recipients are kept from Black Cat.
# NOTE: despite the variable name, the list also contains 5339 and CMAQ
# entries, not only 5310/5311 programs.
programs_5311_5310 = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "5339 (National)",
    "5339 (State)",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [68]:
# Keep only the Black Cat rows whose funding program is in the list above
blackcat2 = blackcat.loc[blackcat["funding_program"].isin(programs_5311_5310)]

In [69]:
# Keep only the organization name, one row per distinct name.
# The explicit .copy() makes this an independent frame: organization_cleaning
# later assigns into this column in place, and doing that on a frame derived
# from a filtered slice can trigger pandas' SettingWithCopyWarning.
blackcat2 = blackcat2[["organization_name"]].drop_duplicates().copy()

In [70]:
# Function to clean agency/organization names 
def organization_cleaning(df, column_wanted: str):
    """Normalize the organization names stored in ``df[column_wanted]``.

    Each value is transformed as follows:

    1. surrounding whitespace is stripped;
    2. everything from the first comma onward is dropped;
    3. forward slashes are removed (the text on either side is joined);
    4. everything from the first opening parenthesis onward is dropped
       (e.g. a trailing acronym in parentheses);
    5. the result is title-cased, the misspelling "Trasit" is corrected to
       "Transit", and whitespace is stripped once more.

    Args:
        df: DataFrame holding the column to clean. It is modified in place.
        column_wanted: Name of the string column to clean.

    Returns:
        The same ``df`` object, with ``column_wanted`` overwritten by the
        cleaned names. Missing values stay missing.
    """
    cleaned = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        # Literal (non-regex) replacement, stated explicitly because the
        # default of ``regex`` differs between pandas versions.
        .str.replace("/", "", regex=False)
        # No "/" can remain after the replacement above, so a further
        # split on "/" would be a no-op and is intentionally omitted.
        .str.split("(")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit", regex=False)
        # Strip again: cutting at "(" usually leaves a trailing space.
        .str.strip()
    )
    df[column_wanted] = cleaned
    return df

In [71]:
# Keep only each agency's name, not its acronym.
# organization_cleaning returns the frame it was given, so blackcat3 refers
# to the same object as blackcat2.
blackcat3 = organization_cleaning(df=blackcat2, column_wanted="organization_name")

In [72]:
# Number of distinct cleaned organization names
blackcat3["organization_name"].nunique()

211

In [73]:
# For every row whose value in `column` fuzzy-matches `string_to_match` with a score above `min_ratio`, write `string_to_match` into `new_col_name`
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    """Tag rows of ``df`` whose ``column`` value fuzzy-matches a target string.

    The distinct values of ``df[column]`` are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``. Among the 10
    best-scoring values, those scoring strictly above ``min_ratio`` are
    treated as matches, and every row holding one of them gets
    ``string_to_match`` written into ``new_col_name``.

    Args:
        df: DataFrame to update. It is modified in place.
        column: Name of the column whose values are compared.
        new_col_name: Column that receives ``string_to_match`` on matching
            rows. Rows that do not match are not written to.
        string_to_match: The target string to compare against.
        min_ratio: Score threshold; only scores greater than this count.

    Returns:
        None. The result is the in-place change to ``df``.
    """
    # Imported locally so the scorer does not depend on ``fuzzywuzzy.fuzz``
    # having been loaded as a side effect of some other import.
    from fuzzywuzzy import fuzz, process

    # Distinct candidate strings. Missing values can never be a match, so
    # they are excluded before scoring.
    candidates = df[column].dropna().unique()

    # Top 10 closest candidates as (candidate, score) pairs.
    top_matches = process.extract(
        string_to_match,
        candidates,
        limit=10,
        scorer=fuzz.token_sort_ratio,
    )

    # Keep only the candidates whose score exceeds the threshold.
    close_matches = [
        candidate for candidate, score in top_matches if score > min_ratio
    ]

    # Boolean mask of rows holding one of the close matches.
    rows_with_matches = df[column].isin(close_matches)

    # Record the target string on those rows.
    df.loc[rows_with_matches, new_col_name] = string_to_match
     
    

In [74]:
# For each TIRCP grant recipient, tag the Black Cat organizations whose
# cleaned name scores above 92 against it.
for recipient in tircp_grant_recipient_list:
    replace_matches_in_column(
        df=blackcat3,
        column="organization_name",
        new_col_name="organization_BC_fuzzy_matching",
        string_to_match=recipient,
        min_ratio=92,
    )

In [75]:
# Manual name crosswalk applied to organization_name in the next cell:
# each key is replaced by its value.
crosswalk_tircp_bc = {
    "Tulare County Regional Transportation Agency": "Tulare County Regional Transit Agency",
}

In [76]:
# Apply the manual crosswalk: exact-match replacement of whole names
blackcat3["organization_name"] = blackcat3["organization_name"].replace(
    to_replace=crosswalk_tircp_bc
)


In [77]:
# Rows with no fuzzy match fall back to their own organization name
blackcat3["organization_BC_fuzzy_matching"] = blackcat3[
    "organization_BC_fuzzy_matching"
].fillna(blackcat3["organization_name"])

In [78]:
# The original name column is no longer needed once the matched column is filled
blackcat3 = blackcat3.drop(columns="organization_name")

#### Merge the 2 together

In [79]:
# Outer-join Black Cat (left) with TIRCP (right) on the organization name.
# indicator=True adds a "_merge" column recording which side each row came from.
merge1 = blackcat3.merge(
    tircp2,
    how="outer",
    left_on="organization_BC_fuzzy_matching",
    right_on="grant_recipient",
    indicator=True,
)

In [80]:
# Row counts per merge outcome (left_only / right_only / both)
merge1["_merge"].value_counts()

left_only     197
right_only     35
both           14
Name: _merge, dtype: int64

In [81]:
# Give the merged columns reader-friendly names
merge1 = merge1.rename(
    columns={
        "grant_recipient": "TIRCP_Orgs",
        "organization_BC_fuzzy_matching": "BlackCat_Orgs",
        "_merge": "BC_TIRCP_merge",
    }
)

In [82]:
def progress(df):
    """Translate one row's merge indicator into a readable label.

    Args:
        df: A single row (anything indexable by ``"BC_TIRCP_merge"``), not
            a whole DataFrame, despite the parameter name.

    Returns:
        ``"Black Cat Only"`` for ``"left_only"``, ``"TIRCP Only"`` for
        ``"right_only"``, and ``"Found in both TIRCP and BlackCat"`` for
        any other value.
    """
    merge_flag = df["BC_TIRCP_merge"]
    if merge_flag == "left_only":
        return "Black Cat Only"
    if merge_flag == "right_only":
        return "TIRCP Only"
    # Everything else is treated as present on both sides.
    return "Found in both TIRCP and BlackCat"

In [83]:
# Replace the raw merge indicator with the readable label, row by row
merge1["BC_TIRCP_merge"] = merge1.apply(progress, axis="columns")

In [91]:
# Display the merged table grouped by merge outcome
merge1.sort_values(by="BC_TIRCP_merge")

Unnamed: 0,BlackCat_Orgs,TIRCP_Orgs,district,BC_TIRCP_merge
0,City Of Chowchilla,,,Black Cat Only
133,Common Ground Senior Services,,,Black Cat Only
134,Contra Costa Arc,,,Black Cat Only
135,Desert Arc,,,Black Cat Only
136,East Bay Services To The Developmentally Disabled,,,Black Cat Only
137,Easy Lift Transportation,,,Black Cat Only
138,Family Bridges,,,Black Cat Only
139,Friends Of Adult Day Health Care Centers,,,Black Cat Only
140,Full Access & Coordinated Transportation,,,Black Cat Only
141,Futures Explored,,,Black Cat Only


In [84]:
# merge1.sort_values(by = ['BlackCat_Orgs', 'TIRCP_Orgs'],ascending = True)

In [85]:
# merge1[['TIRCP_Orgs', 'BC_TIRCP_merge']].dropna().sort_values('TIRCP_Orgs')

In [86]:
# merge1[['BlackCat_Orgs','BC_TIRCP_merge']].dropna().sort_values('BlackCat_Orgs')

In [92]:
# Persist the merged Black Cat / TIRCP table to GCS as parquet
merge1.to_parquet(
    "gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_Dec_2022.parquet"
)