## Overlaps between organizations in TIRCP/Black Cat in MA

In [None]:
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 300
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from calitp_data_analysis.sql import to_snakecase
from shared_utils import portfolio_utils



In [None]:
# import chardet
import fuzzywuzzy
from fuzzywuzzy import process

#### Load in TIRCP

In [None]:
# TIRCP spreadsheet: last updated November 3
# Read the "main" sheet, then pass the frame through to_snakecase.
tircp = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx",
    sheet_name="main",
).pipe(to_snakecase)

In [None]:
# One row per grant recipient (the first occurrence wins), keeping only the
# recipient name and its district.
tircp2 = tircp.drop_duplicates(subset=["grant_recipient"])[
    ["grant_recipient", "district"]
]

In [None]:
f"{len(tircp2)} grant recipients"

In [None]:
# Grant recipient names as a plain Python list
tircp_grant_recipient_list = tircp2["grant_recipient"].tolist()

#### Load in Black Cat

In [None]:
# Read the Black Cat export, then pass the frame through to_snakecase.
blackcat = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx"
).pipe(to_snakecase)

In [None]:
# Black Cat funding programs to keep. Despite the variable name, the list
# covers the 5310 and 5311 variants as well as the 5339 and CMAQ (FTA 5311)
# entries below.
programs_5311_5310 = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "5339 (National)",
    "5339 (State)",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [None]:
# Keep only the rows whose funding program appears in programs_5311_5310
blackcat2 = blackcat.loc[blackcat["funding_program"].isin(programs_5311_5310)]

In [None]:
# One row per distinct organization name; every other column is discarded
blackcat2 = blackcat2.drop_duplicates(subset="organization_name")[
    ["organization_name"]
]

In [None]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    """Return a copy of ``df`` with the names in ``column_wanted`` normalized.

    Each value is reduced to a bare organization name:

    * surrounding whitespace is stripped,
    * everything from the first comma onward is dropped,
    * slashes are removed (``"A/B"`` becomes ``"AB"``),
    * everything from the first ``(`` onward is dropped (for example a
      trailing acronym in parentheses),
    * the result is title-cased, the typo ``"Trasit"`` is corrected to
      ``"Transit"``, and whitespace is stripped once more.

    Args:
        df: DataFrame holding the column to clean.
        column_wanted: Name of the string column to normalize.

    Returns:
        A new DataFrame with the cleaned column; the frame passed in is left
        unmodified.
    """
    cleaned = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        # regex=False: plain literal replacement, so the result does not
        # depend on the pandas version's default for ``regex``.
        .str.replace("/", "", regex=False)
        .str.split("(")
        .str[0]
        # The original chain split on "/" at this point, but every slash has
        # already been removed above, so that step could never change a value.
        .str.title()
        .str.replace("Trasit", "Transit", regex=False)
        .str.strip()  # strip again after getting rid of certain things
    )

    # Work on a copy so callers (which may hold a slice of a larger frame)
    # do not have their input silently rewritten.
    result = df.copy()
    result[column_wanted] = cleaned
    return result

In [None]:
# Only keep the name of the agencies, not its acronym
blackcat3 = organization_cleaning(blackcat2, "organization_name")

In [None]:
blackcat3.organization_name.nunique()

In [None]:
# For every row whose value in `column` fuzzy-matches `string_to_match` with a score above `min_ratio`, write `string_to_match` into `new_col_name`
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    """Tag the rows of ``df`` whose ``column`` value fuzzily matches a string.

    The distinct values of ``df[column]`` are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``. Only the ten
    best-scoring candidates are considered, and of those only the ones scoring
    strictly above ``min_ratio`` count as matches. Every row holding a
    matching value gets ``string_to_match`` written to ``new_col_name``.

    Args:
        df: DataFrame to update.
        column: Name of the column whose values are compared.
        new_col_name: Name of the column that receives ``string_to_match``.
        string_to_match: Reference string to compare against.
        min_ratio: Score a candidate must exceed to count as a match.

    Returns:
        None. ``df`` is modified in place.
    """
    # Imported explicitly so the scorer does not depend on ``fuzzywuzzy.fuzz``
    # having been loaded as a side effect of some other import.
    from fuzzywuzzy import fuzz, process

    # Distinct candidate strings; missing values are not names, so they are
    # kept out of the comparison.
    candidates = df[column].dropna().unique()

    # Ten closest candidates as (candidate, score) pairs.
    top_matches = process.extract(
        string_to_match, candidates, limit=10, scorer=fuzz.token_sort_ratio
    )

    # Keep only the candidates that clear the threshold.
    close_matches = [
        candidate for candidate, score in top_matches if score > min_ratio
    ]

    # Flag the rows holding any of the accepted candidates...
    rows_with_matches = df[column].isin(close_matches)

    # ...and record the reference string for them.
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
# Compare the Black Cat organization names against each TIRCP grant recipient.
# Names scoring above 92 get the TIRCP recipient written to
# organization_BC_fuzzy_matching.
for tircp_recipient in tircp_grant_recipient_list:
    replace_matches_in_column(
        df=blackcat3,
        column="organization_name",
        new_col_name="organization_BC_fuzzy_matching",
        string_to_match=tircp_recipient,
        min_ratio=92,
    )

In [None]:
# Manual crosswalk for names that refer to the same agency under a different
# spelling: key = value to look for in organization_name, value = replacement.
# NOTE(review): presumably the replacement is the TIRCP spelling — confirm.
crosswalk_tircp_bc = {
    "Tulare County Regional Transportation Agency": "Tulare County Regional Transit Agency"
}

In [None]:
# Swap in the crosswalk spelling wherever an organization name is one of its
# keys; every other name is left as it is.
blackcat3["organization_name"] = blackcat3["organization_name"].map(
    lambda name: crosswalk_tircp_bc.get(name, name)
)

In [None]:
# Rows without a fuzzy match have NaN in organization_BC_fuzzy_matching;
# fall back to the organization name for those.
blackcat3["organization_BC_fuzzy_matching"] = blackcat3[
    "organization_BC_fuzzy_matching"
].fillna(blackcat3["organization_name"])

In [None]:
# The fuzzy-matching column now carries the name; drop the original column
blackcat3 = blackcat3.drop("organization_name", axis="columns")

#### Merge the 2 together

In [None]:
# Outer join so organizations found in only one source are kept; the _merge
# indicator records which side(s) each row came from.
merge1 = blackcat3.merge(
    tircp2,
    how="outer",
    left_on="organization_BC_fuzzy_matching",
    right_on="grant_recipient",
    indicator=True,
)

In [None]:
merge1._merge.value_counts()

In [None]:
# Give the name columns and the merge indicator source-specific labels
merge1 = merge1.rename(
    {
        "grant_recipient": "TIRCP_Orgs",
        "organization_BC_fuzzy_matching": "BlackCat_Orgs",
        "_merge": "BC_TIRCP_merge",
    },
    axis="columns",
)

In [None]:
def progress(df):
    """Translate a row's merge indicator into a readable label.

    ``df`` is a single row (the function is applied row-wise) and must expose
    a ``"BC_TIRCP_merge"`` entry. ``"left_only"`` and ``"right_only"`` map to
    the single-source labels; any other value is reported as found in both.
    """
    single_source_labels = {
        "left_only": "Black Cat Only",
        "right_only": "TIRCP Only",
    }
    return single_source_labels.get(
        df["BC_TIRCP_merge"], "Found in both TIRCP and BlackCat"
    )

In [None]:
# Replace the raw merge indicator with a readable label, row by row
merge1["BC_TIRCP_merge"] = merge1.apply(func=progress, axis="columns")

In [None]:
# merge1.sort_values(by = ['BlackCat_Orgs', 'TIRCP_Orgs'],ascending = True)

In [None]:
# merge1[['TIRCP_Orgs', 'BC_TIRCP_merge']].dropna().sort_values('TIRCP_Orgs')

In [None]:
# merge1[['BlackCat_Orgs','BC_TIRCP_merge']].dropna().sort_values('BlackCat_Orgs')

#### Add district information for BlackCat agencies

In [None]:
# District lookup, joined onto the agencies further down via calitp_itp_id.
# NOTE(review): the columns are assumed from that later merge — confirm
# against portfolio_utils.add_caltrans_district.
agency_district_df = portfolio_utils.add_caltrans_district()

##### Add agency names / Cal-ITP IDs first


In [None]:
# Agency lookup from portfolio_utils. The steps below clean and match on its
# calitp_agency_name column and later join on calitp_itp_id.
agency_calitp_df = portfolio_utils.add_agency_name()

In [None]:
# Normalize the agency names the same way as the Black Cat names
agency_calitp_df = agency_calitp_df.pipe(
    organization_cleaning, "calitp_agency_name"
)

In [None]:
# agency_calitp_df.calitp_agency_name.sort_values().unique().tolist()

In [None]:
# Black Cat organization names from the merged table, skipping missing values
blackcat_agencies = merge1["BlackCat_Orgs"].dropna().tolist()

In [None]:
# Preview which agency names in agency_calitp_df resemble a Black Cat agency:
# names scoring above 90 get the Black Cat name written to
# organization_calitp_fuzzy_matching.
for blackcat_agency in blackcat_agencies:
    replace_matches_in_column(
        df=agency_calitp_df,
        column="calitp_agency_name",
        new_col_name="organization_calitp_fuzzy_matching",
        string_to_match=blackcat_agency,
        min_ratio=90,
    )

In [None]:
# agency_calitp_df[['calitp_agency_name', 'organization_calitp_fuzzy_matching']].dropna()

In [None]:
# Rename some calitp_agency_name values so they line up with the names they
# are merged against: key = calitp_agency_name value, value = replacement.
# Replacements must carry no stray whitespace, because the merge on these
# names is an exact string match.
crosswalk_calitp_bc = {
    "Glenn County Transit": "Glenn County Transportation Commission",
    "Kern Transit": "Kern Regional Transit",
    "Lake Transit": "Lake Transit Authority",
    "Santa Rosa Citybus": "City Of Santa Rosa",
    "Tulare County Area Transit": "Tulare County Regional Transit Agency",
    "Victor Valley Transit": "Victor Valley Transit Authority",
    "Yolobus": "Yolo County Transportation District",
}

In [None]:
# Swap in the crosswalk spelling wherever an agency name is one of its keys;
# every other name is left as it is.
agency_calitp_df["calitp_agency_name"] = agency_calitp_df["calitp_agency_name"].map(
    lambda name: crosswalk_calitp_bc.get(name, name)
)

In [None]:
# Left join: keep every row of merge1 and attach the agency_calitp_df columns
# where the Black Cat name equals a calitp_agency_name.
merge2 = merge1.merge(
    agency_calitp_df,
    how="left",
    left_on="BlackCat_Orgs",
    right_on="calitp_agency_name",
    indicator=True,
)

In [None]:
merge2._merge.value_counts()

In [None]:
# The indicator was only needed for the check above; remove it
merge2 = merge2.drop("_merge", axis="columns")

##### Add districts based on Cal ITP ID

In [None]:
# Left join the district lookup on the shared calitp_itp_id key
merge3 = merge2.merge(
    agency_district_df,
    how="left",
    on="calitp_itp_id",
)

In [None]:
# Where the lookup supplied no district, fall back to the TIRCP district column
merge3["caltrans_district"] = merge3["caltrans_district"].fillna(
    merge3["district"]
)

In [None]:
# Remove the helper columns that are no longer needed
merge4 = merge3.drop(
    ["organization_calitp_fuzzy_matching", "calitp_agency_name", "district"],
    axis="columns",
)

In [None]:
merge4.caltrans_district.isna().sum()

In [None]:
merge4

In [None]:
# Write the combined Black Cat / TIRCP table to the analytics bucket as parquet
merge4.to_parquet('gs://calitp-analytics-data/data-analyses/grants/BlackCat_TIRCP_Dec_2022.parquet')