## Overlaps between organizations in TIRCP/Black Cat in MA

In [151]:
import pandas as pd

# Notebook display settings: up to 50 columns and 300 rows, no truncation of
# cell text, and floats rendered with two decimals.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 300)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", "{:.2f}".format)

from calitp import *
from shared_utils import portfolio_utils

In [152]:
# import chardet
import fuzzywuzzy
from fuzzywuzzy import process

#### Load in TIRCP

In [153]:
# TIRCP spreadsheet ("main" sheet), last updated November 3.
# The loaded frame is handed to the to_snakecase helper before being stored.
tircp = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx",
    sheet_name="main",
).pipe(to_snakecase)

In [154]:
# One row per grant recipient (the first row seen for each recipient wins),
# narrowed to the two columns used later in the notebook.
tircp2 = tircp.drop_duplicates(subset="grant_recipient")[
    ["grant_recipient", "district"]
]

In [155]:
f"{tircp2.shape[0]} grant recipients"

'49 grant recipients'

In [156]:
# Recipient names as a plain Python list (iterated for fuzzy matching later)
tircp_grant_recipient_list = tircp2["grant_recipient"].tolist()

#### Load in Black Cat

In [157]:
# Black Cat grant projects export (file dated 7/30/2022).
# The loaded frame is handed to the to_snakecase helper before being stored.
blackcat = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx"
).pipe(to_snakecase)

In [158]:
# Black Cat funding programs to keep.
# Despite the variable name, this list is broader than 5311/5310: it also
# contains the 5311(f), 5339 (National/State) and CMAQ (FTA 5311) entries.
programs_5311_5310 = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "5339 (National)",
    "5339 (State)",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [159]:
# Keep only the rows whose funding program appears in programs_5311_5310
blackcat2 = blackcat.loc[blackcat["funding_program"].isin(programs_5311_5310)]

In [160]:
# Reduce to the distinct organization names; no other column is carried forward
blackcat2 = blackcat2.loc[:, ["organization_name"]].drop_duplicates()

In [161]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    """Return a copy of ``df`` with the names in ``column_wanted`` normalized.

    Each value is processed in this order:
      1. surrounding whitespace is stripped;
      2. only the text before the first comma is kept;
      3. every "/" character is deleted;
      4. only the text before the first "(" is kept (this is what drops a
         trailing parenthesized acronym);
      5. the result is title-cased;
      6. the misspelling "Trasit" is corrected to "Transit";
      7. whitespace is stripped again, since steps 2-4 can expose new
         leading/trailing spaces.

    Missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. It is not modified.
    column_wanted : str
        Name of the string column to normalize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, where
        ``column_wanted`` holds the cleaned names.
    """
    cleaned_names = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        # literal replacement: no regex semantics wanted for "/"
        .str.replace("/", "", regex=False)
        .str.split("(")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit", regex=False)
        .str.strip()  # strip again after getting rid of certain things
    )
    # assign() builds a new frame, so a filtered slice passed in by the caller
    # is never written to (avoids SettingWithCopyWarning and hidden mutation).
    return df.assign(**{column_wanted: cleaned_names})

In [162]:
# Normalize the organization names (among other cleanup, this drops a trailing
# parenthesized acronym so only the agency name remains)
blackcat3 = organization_cleaning(df=blackcat2, column_wanted="organization_name")

In [163]:
# Number of distinct organization names after cleaning
blackcat3["organization_name"].nunique()

211

In [164]:
# Label every row whose `column` value fuzzy-matches `string_to_match`
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    """Write ``string_to_match`` into ``new_col_name`` for fuzzy-matching rows.

    The unique values of ``df[column]`` are scored against ``string_to_match``
    with fuzzywuzzy's ``token_sort_ratio``. Only the 10 best-scoring values are
    considered, and of those only the ones scoring strictly greater than
    ``min_ratio`` are accepted. Every row whose ``column`` value is one of the
    accepted values gets ``string_to_match`` written to ``new_col_name``.

    ``df`` is modified in place and nothing is returned.
    """
    candidate_names = df[column].unique()

    # Ten best-scoring candidates; each entry is indexed as [0]=value, [1]=score
    top_hits = fuzzywuzzy.process.extract(
        string_to_match,
        candidate_names,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
        limit=10,
    )

    # Keep the candidate values that clear the threshold (strictly greater)
    accepted_names = []
    for hit in top_hits:
        if hit[1] > min_ratio:
            accepted_names.append(hit[0])

    # Stamp the matched rows with the string they were matched against
    is_close_match = df[column].isin(accepted_names)
    df.loc[is_close_match, new_col_name] = string_to_match

In [165]:
# Tag Black Cat organizations whose cleaned name fuzzy-matches a TIRCP grant
# recipient; a match needs a score above 92.
for i in tircp_grant_recipient_list:
    replace_matches_in_column(
        df=blackcat3,
        column="organization_name",
        new_col_name="organization_BC_fuzzy_matching",
        string_to_match=i,
        min_ratio=92,
    )

In [166]:
# Manual crosswalk for names that refer to the same agency but are spelled
# differently: key = spelling found in Black Cat's organization_name,
# value = spelling to use instead.
# NOTE(review): the value is presumably the TIRCP grant_recipient spelling — confirm.
crosswalk_tircp_bc = {
    "Tulare County Regional Transportation Agency": "Tulare County Regional Transit Agency"
}

In [167]:
# Swap in the crosswalk spellings for the cleaned Black Cat names
blackcat3["organization_name"] = blackcat3["organization_name"].replace(
    to_replace=crosswalk_tircp_bc
)

In [168]:
# Rows that got no fuzzy match (NaN in organization_BC_fuzzy_matching) fall
# back to their own organization_name value
blackcat3["organization_BC_fuzzy_matching"] = blackcat3[
    "organization_BC_fuzzy_matching"
].fillna(blackcat3["organization_name"])

In [169]:
# The fuzzy-matching column now carries the name, so the original is dropped
blackcat3 = blackcat3.drop("organization_name", axis=1)

#### Merge the 2 together

In [170]:
# Outer join of Black Cat organizations with TIRCP recipients on the
# reconciled name; the `_merge` indicator records which side each row came from.
merge1 = blackcat3.merge(
    tircp2,
    how="outer",
    left_on="organization_BC_fuzzy_matching",
    right_on="grant_recipient",
    indicator=True,
)

In [171]:
# How many rows came from Black Cat only, TIRCP only, or both
merge1["_merge"].value_counts()

left_only     197
right_only     35
both           14
Name: _merge, dtype: int64

In [172]:
# Give the name columns and the merge indicator reader-friendly labels
merge1 = merge1.rename(
    columns=dict(
        grant_recipient="TIRCP_Orgs",
        organization_BC_fuzzy_matching="BlackCat_Orgs",
        _merge="BC_TIRCP_merge",
    )
)

In [173]:
def progress(df):
    """Translate one row's merge indicator into a readable label.

    ``df`` is a single row (anything indexable by ``"BC_TIRCP_merge"``), not a
    whole frame. ``"left_only"`` becomes ``"Black Cat Only"``, ``"right_only"``
    becomes ``"TIRCP Only"``, and every other value is reported as
    ``"Found in both TIRCP and BlackCat"``.
    """
    if df["BC_TIRCP_merge"] == "left_only":
        return "Black Cat Only"
    if df["BC_TIRCP_merge"] == "right_only":
        return "TIRCP Only"
    # Anything that is not one-sided is treated as present in both sources
    return "Found in both TIRCP and BlackCat"

In [174]:
# Replace the raw merge indicator with a readable label, row by row.
# NOTE: not safe to re-run on its own. progress() labels anything other than
# "left_only"/"right_only" as "Found in both TIRCP and BlackCat", so running
# this a second time (when the column already holds labels) relabels every row.
merge1["BC_TIRCP_merge"] = merge1.apply(progress, axis=1)

In [175]:
# merge1.sort_values(by = ['BlackCat_Orgs', 'TIRCP_Orgs'],ascending = True)

In [176]:
# merge1[['TIRCP_Orgs', 'BC_TIRCP_merge']].dropna().sort_values('TIRCP_Orgs')

In [177]:
# merge1[['BlackCat_Orgs','BC_TIRCP_merge']].dropna().sort_values('BlackCat_Orgs')

#### Add district information for BlackCat agencies

In [178]:
# District lookup built by shared_utils.portfolio_utils; it is joined on
# calitp_itp_id further down.
# NOTE(review): presumably supplies the caltrans_district column — confirm
# against portfolio_utils.add_caltrans_district.
agency_district_df = portfolio_utils.add_caltrans_district()

##### Add agency names/Cal ITP ID first


In [179]:
# Agency lookup built by shared_utils.portfolio_utils; the cells below read its
# calitp_agency_name column (for name matching) and rely on it for calitp_itp_id.
agency_calitp_df = portfolio_utils.add_agency_name()

In [180]:
# Normalize Cal-ITP agency names with the same cleaning used for Black Cat
agency_calitp_df = organization_cleaning(
    df=agency_calitp_df, column_wanted="calitp_agency_name"
)

In [181]:
# agency_calitp_df.calitp_agency_name.sort_values().unique().tolist()

In [182]:
# Non-null Black Cat organization names from the merged table, as a list
blackcat_agencies = merge1["BlackCat_Orgs"].dropna().tolist()

In [183]:
# Preview which Cal-ITP agency names fuzzy-match a Black Cat organization
# (score above 90). The result lands in organization_calitp_fuzzy_matching and
# is for inspection only: the join below uses calitp_agency_name.
for i in blackcat_agencies:
    replace_matches_in_column(
        df=agency_calitp_df,
        column="calitp_agency_name",
        new_col_name="organization_calitp_fuzzy_matching",
        string_to_match=i,
        min_ratio=90,
    )

In [184]:
# agency_calitp_df[['calitp_agency_name', 'organization_calitp_fuzzy_matching']].dropna()

In [185]:
# Manual crosswalk: cleaned Cal-ITP agency name -> the name to join on against
# BlackCat_Orgs. Values must carry no leading/trailing whitespace: Black Cat
# names are stripped during cleaning, so a padded value can never match.
crosswalk_calitp_bc = {
    "Glenn County Transit": "Glenn County Transportation Commission",
    "Kern Transit": "Kern Regional Transit",
    "Lake Transit": "Lake Transit Authority",
    "Santa Rosa Citybus": "City Of Santa Rosa",
    "Tulare County Area Transit": "Tulare County Regional Transit Agency",
    "Victor Valley Transit": "Victor Valley Transit Authority",
    "Yolobus": "Yolo County Transportation District",
}

In [186]:
# Swap in the crosswalk spellings for the cleaned Cal-ITP agency names
agency_calitp_df["calitp_agency_name"] = agency_calitp_df[
    "calitp_agency_name"
].replace(to_replace=crosswalk_calitp_bc)

In [187]:
# Attach Cal-ITP agency info to every row of merge1 by matching the Black Cat
# name to the Cal-ITP agency name; `_merge` shows which rows found a match.
merge2 = merge1.merge(
    agency_calitp_df,
    how="left",
    left_on="BlackCat_Orgs",
    right_on="calitp_agency_name",
    indicator=True,
)

In [188]:
# How many rows found a Cal-ITP agency ("both") versus none ("left_only")
merge2["_merge"].value_counts()

left_only     216
both           31
right_only      0
Name: _merge, dtype: int64

In [189]:
# The indicator has served its purpose; remove it before the next join
merge2 = merge2.drop("_merge", axis=1)

##### Add districts based on Cal ITP ID

In [190]:
# Bring in district information for rows that have a Cal-ITP ID
merge3 = merge2.merge(
    agency_district_df,
    how="left",
    on="calitp_itp_id",
)

In [191]:
# Where the district lookup produced nothing, fall back to the TIRCP district
merge3["caltrans_district"] = merge3["caltrans_district"].fillna(
    merge3["district"]
)

In [192]:
# Remove helper columns that are not part of the final table
merge4 = merge3.drop(
    ["organization_calitp_fuzzy_matching", "calitp_agency_name", "district"],
    axis=1,
)

In [194]:
# Rows still lacking a district after the TIRCP fallback
merge4["caltrans_district"].isna().sum()

174

In [197]:
# Display the final combined table.
# NOTE(review): display.max_rows is set to 300 at the top of the notebook, so a
# frame of this size is rendered in full; consider .head() when sharing.
merge4

Unnamed: 0,BlackCat_Orgs,TIRCP_Orgs,BC_TIRCP_merge,calitp_itp_id,caltrans_district
0,City Of Chowchilla,,Black Cat Only,,
1,Madera County,,Black Cat Only,,
2,Fresno County Rural Transit Agency,Fresno County Rural Transit Agency,Found in both TIRCP and BlackCat,117.0,06 - Fresno
3,Yuba-Sutter Transit Authority,,Black Cat Only,376.0,03 - Marysville
4,City Of Arcata,,Black Cat Only,,
5,City Of Dinuba,,Black Cat Only,,
6,City Of Ojai,,Black Cat Only,,
7,Modoc Transportation Agency,,Black Cat Only,,
8,San Diego Metropolitan Transit System,San Diego Metropolitan Transit System,Found in both TIRCP and BlackCat,278.0,11 - San Diego
9,Humboldt Transit Authority,Humboldt Transit Authority,Found in both TIRCP and BlackCat,135.0,01 - Eureka


In [195]:
#merge3.to_parquet('gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_Dec_2022.parquet')