## BlackCat Organizations
5/19
* Most recent records
* Transit agencies that have fixed-route services (more than the vendors contracted to provide dial-a-ride / paratransit services—I recall there being a bunch of those in the BlackCat export I got last year).
* 5311, SGR then + Clovis

In [104]:
import fuzzywuzzy
import pandas as pd
import siuba  # need this to do type hint in functions
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from fuzzywuzzy import process
from siuba import *

In [105]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [106]:
gcs_path = "gs://calitp-analytics-data/data-analyses/grant_misc/"

In [107]:
#BlackCat_Grants_Projects_5_22_23.xlsx

### Blackcat

In [108]:
def summarize_rows(df, col_to_group: str, col_to_summarize: str) -> pd.DataFrame:
    """
    Collapse a dataframe to one row per group, joining the strings in
    `col_to_summarize` with commas (no space after the comma).

    Args:
        col_to_group: column name (callers in this notebook also pass a
            list of column names) to group by
        col_to_summarize: string column whose values are concatenated

    Returns:
        Dataframe holding the grouping column(s) and the joined column.
    """
    values_by_group = df.groupby(col_to_group)[col_to_summarize]
    joined = values_by_group.apply(",".join)
    return joined.reset_index()

In [109]:
def blackcat_orgs(file_name: str, year_wanted: int, grants_wanted: list) -> pd.DataFrame:
    """
    Open and filter the BlackCat file for the grant applicants.

    Args:
        file_name (str): Excel file under gcs_path; include the .xlsx extension
        year_wanted (int): keep records whose grant_fiscal_year is this
            year or later
        grants_wanted (list): funding_program values to keep

    Returns:
        One row per organization with the columns organization_name,
        grant_fiscal_year and funding_program (programs comma-joined).
    """
    df = to_snakecase(pd.read_excel(f"{gcs_path}{file_name}"))

    # Filter grant fiscal year using the caller's threshold
    # (previously hard-coded to 2018, ignoring year_wanted).
    df = df[df.grant_fiscal_year >= year_wanted].reset_index(drop=True)

    # Filter funding programs using the caller's list
    # (previously read the notebook-level grants_subset, ignoring grants_wanted).
    df = df[df.funding_program.isin(grants_wanted)].reset_index(drop=True)

    # Cols
    subset = ["organization_name", "grant_fiscal_year", "funding_program"]
    sort_cols = ["organization_name", "funding_program"]

    # For every organization/program pair keep only the most recent fiscal year
    df = (
        df[subset]
        .sort_values(by=["organization_name", "grant_fiscal_year"], ascending=[True, False])
        .drop_duplicates(subset=sort_cols)
        .reset_index(drop=True)
    )

    # Join the programs an organization has within the same fiscal year
    df = summarize_rows(df, ["organization_name", "grant_fiscal_year"], "funding_program")

    # Drop extra rows so one row corresponds with one organization.
    # NOTE(review): this keeps the row whose funding_program string sorts
    # last, not necessarily the latest grant_fiscal_year -- confirm intent.
    df = (
        df.sort_values(by=sort_cols, ascending=[True, False])
        .drop_duplicates(subset=["organization_name"])
        .reset_index(drop=True)
    )
    return df

In [110]:
#blackcat = to_snakecase(
 #   pd.read_excel(f"{gcs_path}BlackCat_Grants_Projects_5_22_23.xlsx")
# )

In [111]:
#blackcat.sample()

In [112]:
#blackcat.shape

In [113]:
# Filter grant fiscal year
#blackcat2 = blackcat[blackcat.grant_fiscal_year >= 2018].reset_index(drop=True)

In [114]:
#blackcat2.shape, blackcat2.grant_fiscal_year.value_counts()

In [115]:
#blackcat2.columns

In [116]:
# BlackCat funding_program values used to subset the grant records;
# passed to blackcat_orgs() as `grants_wanted`.
grants_subset = [
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311",
    "5311(f) Round 2",
    "5339 (State)",
    "Section 5311(f)",
]

In [117]:
# blackcat2 = blackcat2[blackcat2.funding_program.isin(grants_subset)]

In [118]:
# len(organizations)

In [119]:
#organizations = (
#    organizations.sort_values(
#        by=["organization_name", "grant_fiscal_year"], ascending=[True, False]
#    )
#    .drop_duplicates(subset=["organization_name"])
#    .reset_index(drop=True)
#)

In [120]:
#organizations.shape, organizations.organization_name.nunique()

### State of Good Repair

In [121]:
def sgr_orgs(file_name: str) -> pd.DataFrame:
    """
    Read the State of Good Repair user list and return one contact row
    per agency.

    Args:
        file_name (str): Excel file under gcs_path; include the extension

    Returns:
        Dataframe with the contact columns plus a constant
        funding_program column set to "State of Good Repair".
    """
    contact_cols = ["first_name", "last_name", "email", "phone", "title", "agency"]

    raw = to_snakecase(pd.read_excel(f"{gcs_path}{file_name}"))

    # The first row encountered for each agency is the one that is kept
    one_per_agency = raw[contact_cols].drop_duplicates("agency").reset_index(drop=True)

    # Tag the source so these rows are identifiable after merging
    one_per_agency["funding_program"] = "State of Good Repair"

    return one_per_agency

In [122]:
#sgr = to_snakecase(pd.read_excel(f"{gcs_path}SGR Calsmart-user-list request.xls"))

In [123]:
# sgr_subset = ["first_name", "last_name", "email", "phone", "title", "agency"]

In [124]:
#sgr2 = sgr[sgr_subset]

In [125]:
#sgr2 = sgr2.drop_duplicates("agency").reset_index(drop=True)

In [126]:
#sgr2.agency.value_counts().head()

In [127]:
#len(sgr2), len(sgr)

In [128]:
#sgr2["funding_program"] = "State of Good Repair"

In [129]:
#sgr2.shape, sgr.agency.nunique()

### Merge BlackCat w/ SGR

In [130]:
def clean_punctuation(df, agency_col: str) -> pd.DataFrame:
    """
    Clean up agency names.

    Anything after the first comma, opening parenthesis or semicolon is
    assumed to be an acronym/suffix and is dropped. Slashes and asterisks
    are removed, names are title-cased, the misspelling "Trasit" is
    corrected, the word "Agency" is removed and surrounding whitespace
    is stripped.

    The column is overwritten on the dataframe that was passed in, and
    that same dataframe is returned.
    """
    # regex=False: every pattern here is a literal. Being explicit keeps
    # the result identical across pandas versions (the default changed in
    # pandas 2.0, and "*" is not a valid regular expression on its own).
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        # Slashes are deleted, not used as a delimiter. The original code
        # also split on "/" afterwards, which could never match.
        .str.replace("/", "", regex=False)
        .str.split("(")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit", regex=False)
        .str.replace("*", "", regex=False)
        .str.replace("Agency", "", regex=False)
        .str.strip()  # strip whitespaces again after getting rid of certain things
    )
    return df

In [131]:
def flip_county_city(df, agency_col: str):
    """
    Rewrite names such as "City Of Arvin" as "Arvin City".

    Only names that contain "County" or "City", have at most five
    whitespace-separated words and contain " Of " are rewritten: the text
    after the first " Of " is moved in front of the text before it.
    All other names are left untouched.

    Adapted from
    https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb

    Args:
        df: dataframe holding the agency names
        agency_col: name of the column to correct

    Returns:
        A new dataframe in which `agency_col` holds the corrected names.
        As before, the corrected column ends up as the last column. The
        dataframe passed in is not modified.
    """
    names = df[agency_col]

    # na=False so missing names are simply skipped instead of breaking
    # the boolean mask.
    mentions_place = names.str.contains("County", na=False) | names.str.contains(
        "City", na=False
    )

    new_names_dictionary = {}
    for name in names[mentions_place].drop_duplicates():
        # Long names are usually departments ("County Of X - Department Of Y"),
        # which should not be flipped.
        if len(name.split()) > 5:
            continue
        # Split on the first " Of " only; names without it are left alone.
        name_pt1, separator, name_pt2 = name.partition(" Of ")
        if separator:
            new_names_dictionary[name] = name_pt2 + " " + name_pt1

    corrected = names.map(lambda value: new_names_dictionary.get(value, value))

    # Drop and re-add so the corrected column is placed last, matching
    # the previous output layout.
    result = df.drop(columns=[agency_col])
    result[agency_col] = corrected

    return result

In [132]:
def clean_organization_names(df, agency_col: str):
    """
    Standardize organization names: run clean_punctuation() first, then
    flip_county_city() on its result, both on `agency_col`.
    """
    punctuation_cleaned = clean_punctuation(df, agency_col)
    return flip_county_city(punctuation_cleaned, agency_col)

In [133]:
# organizations = clean_organization_names(organizations, "organization_name")

In [134]:
# len(organizations)

In [135]:
# sgr2 = clean_organization_names(sgr2, "agency")

In [136]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio):
    """
    Write `string_to_match` into `new_col_name` for every row whose
    `column` value is a close fuzzy match of it.

    Candidates are the unique values of `column`; the ten best are scored
    with fuzzywuzzy's token_set_ratio and only those scoring strictly
    above `min_ratio` are accepted. `df` is modified in place and nothing
    is returned.
    """
    candidates = df[column].unique()

    # Ten best-scoring candidates for the reference string
    scored = fuzzywuzzy.process.extract(
        string_to_match, candidates, limit=10, scorer=fuzzywuzzy.fuzz.token_set_ratio
    )

    # Each result starts with (candidate, score); keep the strong ones
    accepted = [result[0] for result in scored if result[1] > min_ratio]

    # Flag the rows holding an accepted candidate and record the match
    is_close_match = df[column].isin(accepted)
    df.loc[is_close_match, new_col_name] = string_to_match

In [137]:
def find_fuzzy_match(
    df1,
    df2,
    df1_fuzzy_column: str,
    df2_fuzzy_column: str,
    new_column: str,
    min_ratio: int,
) -> pd.DataFrame:
    """
    Fuzzy-match the values of df2[df2_fuzzy_column] against every unique
    value of df1[df1_fuzzy_column].

    For each reference value from df1, replace_matches_set_ratio() writes
    that value into df2[new_column] on the rows it matches. df2 is
    modified in place and also returned.
    """
    reference_values = df1[df1_fuzzy_column].unique().tolist()
    for reference_value in reference_values:
        replace_matches_set_ratio(
            df2, df2_fuzzy_column, new_column, reference_value, min_ratio
        )
    return df2

In [155]:
def fuzzy_match_sgr_bc(blackcat_df: pd.DataFrame, sgr_df: pd.DataFrame, matches_to_delete: list) -> pd.DataFrame:
    """
    Align BlackCat organization names with State of Good Repair agency names.

    Both name columns are cleaned, then every BlackCat organization_name
    is fuzzy matched (threshold 95) against the SGR agency names, which
    are treated as the source of truth. The result is stored in a
    fuzzy_match_agency column on the BlackCat dataframe.

    Args:
        blackcat_df: BlackCat organizations, with an organization_name column
        sgr_df: SGR contacts, with an agency column
        matches_to_delete: organization_name values whose fuzzy match is
            known to be wrong and must be discarded

    Returns:
        The cleaned BlackCat dataframe; fuzzy_match_agency falls back to
        organization_name wherever no match was kept.
    """
    cleaned_blackcat = clean_organization_names(blackcat_df, "organization_name")
    cleaned_sgr = clean_organization_names(sgr_df, "agency")

    matched = find_fuzzy_match(
        cleaned_sgr,
        cleaned_blackcat,
        "agency",
        "organization_name",
        "fuzzy_match_agency",
        95,
    )

    # Blank out the matches that were reviewed and found to be incorrect
    is_rejected = matched["organization_name"].isin(list(matches_to_delete))
    matched.loc[is_rejected, "fuzzy_match_agency"] = None

    # Organizations without a match keep their own name
    matched["fuzzy_match_agency"] = matched["fuzzy_match_agency"].fillna(
        matched["organization_name"]
    )

    return matched

In [139]:
#organizations = find_fuzzy_match(
#    sgr, organizations, "agency", "organization_name", "fuzzy_match_agency", 95
#)

In [140]:
#organizations

In [141]:
#for i in fuzzy_to_del:
#    organizations.loc[
#        organizations["organization_name"].eq(i), "fuzzy_match_agency"
#    ] = None

In [142]:
#organizations.fuzzy_match_agency = organizations.fuzzy_match_agency.fillna(
 #   organizations.organization_name
#)

In [143]:
# organizations.sort_values(by = ['organization_name'])

In [144]:
# organizations = organizations.drop(columns = ['organization_name']).rename(columns = {'fuzzy_match_agency':'organization_name'})

In [145]:
blackcat = blackcat_orgs(file_name = "BlackCat_Grants_Projects_5_22_23.xlsx",
                             year_wanted = 2018,
                             grants_wanted = grants_subset)

In [146]:
blackcat.head()

Unnamed: 0,organization_name,grant_fiscal_year,funding_program
0,Alpine County Community Development,2022,Section 5311
1,Amador Transit,2020,CMAQ (FTA 5311)
2,Butte County Association of Governments/ Butte Regional Transit,2022,"Section 5311,Section 5311(f)"
3,Calaveras County Public Works,2018,5311(f) Cont
4,Calaveras Transit Agency,2022,Section 5311


In [150]:
 sgr = sgr_orgs(file_name = "SGR Calsmart-user-list request.xls")

In [151]:
sgr.head()

Unnamed: 0,first_name,last_name,email,phone,title,agency,funding_program
0,Eve,Ng,grants@actransit.org,5108915405,Capital Planning and Grants Manager,Alameda-Contra Costa Transit District,State of Good Repair
1,Ethan,Gray,egray@alpinecountyca.gov,5306942140,Community Development Deputy Director,Alpine County,State of Good Repair
2,Scott,Maas,smaas@citlink.net,5302600991,Transportation Program Manager,Alpine County Local Transportation Commission,State of Good Repair
3,Patricia,Amarant,maggie@amadortransit.com,2092675079,General Manager,Amador Transit,State of Good Repair
4,Judy,Fry,jfry@avta.com,6617292234,Chief Financial Officer,Antelope Valley Transit Authority,State of Good Repair


In [164]:
fuzzy_to_del = [
    "Amador Transit",
    "Eastern Contra Costa Transit Authority",
    "Madera County",
]

In [165]:
blackcat = fuzzy_match_sgr_bc(blackcat, sgr, fuzzy_to_del)

  df[agency_col]
  to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(
  df[agency_col]
  to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(


In [166]:
def merge_blackcat_sgr(blackcat_df: pd.DataFrame, sgr_df: pd.DataFrame) -> pd.DataFrame:
    """
    Outer-merge BlackCat organizations with State of Good Repair contacts
    and return one row per organization.

    Args:
        blackcat_df: needs the columns organization_name,
            fuzzy_match_agency, grant_fiscal_year and funding_program
        sgr_df: needs the columns agency and funding_program, plus the
            contact columns to carry over

    Returns:
        Dataframe with one row per organization_name, a comma-joined
        funding_program column, the SGR contact columns and the `_merge`
        indicator column produced by the merge.
    """
    # Merge the dataframes passed in (the previous version ignored its
    # arguments and read the notebook-level `blackcat` and `sgr` instead).
    m1 = pd.merge(
        blackcat_df,
        sgr_df,
        left_on=["fuzzy_match_agency"],
        right_on=["agency"],
        how="outer",
        indicator=True,
    )

    # Fill the funding programs that are empty: SGR-only rows have no
    # BlackCat program, BlackCat-only rows have no SGR program.
    m1["funding_program_x"] = m1["funding_program_x"].fillna("State of Good Repair")
    m1["funding_program_y"] = m1["funding_program_y"].fillna(m1["funding_program_x"])

    # Combine the two funding programs since an applicant can appear in
    # both BlackCat and SGR. Neither side is null after the fills above,
    # so the result needs no further fillna. This can repeat a program
    # within one string.
    m1["funding_program"] = m1["funding_program_x"] + "," + m1["funding_program_y"]

    # Fill in organization names that are null. These rows are SGR ones.
    m1["organization_name"] = m1["organization_name"].fillna(m1["agency"])

    # Subset
    cols_to_drop = [
        "funding_program_x",
        "funding_program_y",
        "fuzzy_match_agency",
        "agency",
        "grant_fiscal_year",
    ]
    m1 = m1.drop(columns=cols_to_drop)

    # Summarize so one row contains one organization and all funding
    # programs it has applied for.
    m2 = summarize_rows(m1, ["organization_name"], "funding_program")

    # Re-attach the remaining columns and keep a single row per organization
    m2 = pd.merge(
        m2,
        m1.drop(columns=["funding_program"]),
        on=["organization_name"],
        how="left",
    )
    m2 = m2.sort_values(["organization_name"]).drop_duplicates(
        subset=["organization_name"]
    )
    return m2
    

In [167]:
test = merge_blackcat_sgr(blackcat, sgr)

### Airtable
* Grab only fixed route providers.

In [179]:
def airtable_orgs() -> pd.DataFrame:
    """
    Return one row per Airtable transit service name with its service
    types joined into a single comma-separated string.

    Missing values are replaced with "No Service Info" before joining.
    """
    services = tbls.external_airtable.california_transit__services() >> collect()

    airtable_subset = ["name", "service_type"]

    # service_type holds a list per row: explode to one row per
    # name/service type and drop the repeats.
    exploded = (
        services[airtable_subset]
        .explode("service_type")
        .reset_index(drop=True)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Placeholder so that the string join below does not hit nulls
    exploded = exploded.fillna('No Service Info')

    # Summarize so one row contains one service name and all of its
    # service types.
    return summarize_rows(exploded, ["name"], "service_type")

In [180]:
airtable = airtable_orgs()

In [178]:
airtable.shape, airtable.name.nunique()

((995, 2), 995)

In [None]:
# airtable = tbls.external_airtable.california_transit__services() >> collect()

In [None]:
# airtable.columns

In [None]:
# ['name','service_type', 'service_operator_type', 'operator_organization_type']
# airtable_subset = ["name", "service_type"]

In [None]:
# airtable2 = airtable[airtable_subset]

In [None]:
# airtable2.name.nunique()

In [None]:
# airtable2.info()

In [None]:
# airtable2.sort_values(["name"]).head()

In [None]:
# airtable3 = airtable2.explode("service_type").reset_index(drop=True)

In [None]:
# airtable3.head()

In [None]:
# airtable3.service_type = airtable3.service_type.fillna("NA")

In [None]:
# fixed route only
#airtable4 = airtable3[airtable3.service_type.str.lower().str.contains("fixed")]

### Merge

In [184]:
def fuzzy_match_airtable_bc(merged_blackcat_sgr: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Fuzzy match the merged BlackCat/SGR organizations against Airtable
    service names.

    A fuzzy_agency column is added to `merged_blackcat_sgr` (in place)
    holding the matched Airtable name, using a threshold of 95. Matches
    known to be wrong are discarded.

    Args:
        merged_blackcat_sgr: dataframe with an organization_name column

    Returns:
        (found_matches, still_need_matches)
        found_matches: rows with an accepted match; organization_name is
            replaced by the matched Airtable name.
        still_need_matches: rows whose organization_name has no accepted
            match; they keep the (empty or rejected) fuzzy_agency column.
    """
    # Previously referenced `airtable5`, which is not defined anywhere
    airtable = airtable_orgs()
    airtable = clean_organization_names(airtable, "name")

    merged_blackcat_sgr = find_fuzzy_match(
        airtable, merged_blackcat_sgr, "name", "organization_name", "fuzzy_agency", 95
    )

    # Organizations whose fuzzy match is incorrect. A missing comma used
    # to glue the Alpine County and Livermore entries into one string,
    # so neither of them was ever cleared.
    wrong_matches = [
        "Eastern Contra Costa Transit Authority",
        "Fresno County",
        "Eastern Sierra Transit Authority",
        "Alpine County Local Transportation Commission",
        "Livermore Amador Valley Transit Authority",
        "Calaveras Transit",
        "City Of Corcoran - Corcoran Area Transit",
    ]
    is_wrong = merged_blackcat_sgr["organization_name"].isin(wrong_matches)
    merged_blackcat_sgr.loc[is_wrong, "fuzzy_agency"] = None

    # Matched names that must not be accepted either. Same values as the
    # notebook-level fuzzy_matches_to_filter list, kept local so the
    # function does not depend on a cell that is defined later.
    rejected_agencies = [
        "Eastern Contra Costa Transit Authority",
        "Livermore Amador Valley Transit Authority",
        "Calaveras Transit",
        "City Of Corcoran - Corcoran Area Transit",
    ]

    # These are the fuzzy matches that worked.
    has_match = merged_blackcat_sgr["fuzzy_agency"].notna()
    is_rejected = merged_blackcat_sgr["fuzzy_agency"].isin(rejected_agencies)
    found_matches = merged_blackcat_sgr[has_match & ~is_rejected].reset_index(drop=True)

    # Organizations that still need matches
    matched_names = found_matches["organization_name"].tolist()
    still_need_matches = merged_blackcat_sgr[
        ~merged_blackcat_sgr["organization_name"].isin(matched_names)
    ]

    # Clean up: the Airtable name becomes the organization name
    found_matches = found_matches.drop(columns=["organization_name"]).rename(
        columns={"fuzzy_agency": "organization_name"}
    )

    return found_matches, still_need_matches

In [181]:
# airtable5 = clean_organization_names(airtable5, "name")

In [None]:
#airtable5.sort_values('name').head(20)

In [182]:
# m2 = find_fuzzy_match(airtable5, m2, "name", "organization_name", "fuzzy_agency", 95)

In [None]:
for i in [
    "Eastern Contra Costa Transit Authority",
    "Fresno County",
    "Livermore Amador Valley Transit Authority",
    "Eastern Sierra Transit Authority",
    "Alpine County Local Transportation Commission"
]:
    m2.loc[m2["organization_name"].eq(i), "fuzzy_agency"] = None

In [None]:
m2.loc[m2.fuzzy_agency.notna()][["organization_name", "fuzzy_agency"]].sort_values(
    ["organization_name"]
)

In [None]:
# Reverse -> replace
"""
for i in airtable5.name.unique().tolist():
       replace_matches_set_ratio(
        organizations, "organization_name", "project_name_fuzzy", i, 95) """

In [None]:
# Fuzzy matches that didn't work
# Use organization name
fuzzy_matches_to_filter = [
    "Eastern Contra Costa Transit Authority",
    "Livermore Amador Valley Transit Authority",
    "Calaveras Transit",
    "City Of Corcoran - Corcoran Area Transit",
]

In [None]:
len(m2)

In [None]:
# These are the fuzzy matches that worked.
found_matches = (
    m2[((~m2.fuzzy_agency.isna()) & (~m2.fuzzy_agency.isin(fuzzy_matches_to_filter)))]
).reset_index(drop=True)

In [None]:
len(found_matches)

In [None]:
still_need_matches = m2[(~m2.organization_name.isin(found_matches.organization_name.tolist()))]

In [None]:
found_matches = found_matches.drop(columns = ['organization_name']).rename(columns = {'fuzzy_agency':'organization_name'})

In [None]:
found_matches

* Calaveras Transit change to Calaveras Connect
* Arvin City  Arvin Transit
* Auburn City Auburn Transit
* County Of Los Angeles - Department Of Public Works, Los Angeles County Transit Services
* County Of Sacramento Department Of Transportation Sacrt Bus
* County Of Shasta Department Of Public Works
* Dinuba City Dinuba Connection
* Lassen Transit Service Lassen Rural Bus
* Needles City  Needles Area Transit
* Nevada Public Works County Nevada County Connects
* Ojai City Ojai Trolley
* Palo Verde Valley Transit Palos Verdes Peninsula Transit Authority
* Placer County Public Works Placer County Transit
* Plumas County Transportation Commission Plumas Transit Systems
* Porterville City  Porterville Transit
* Ridgecrest City Ridgecrest Transit
* Rio Vista City Rio Vista Delta Breeze
* Santa Maria City Santa Maria Regional Transit
* Siskiyou County Siskiyou Transit And General Express
* Stanislaus County Public Works - Transit Division  Stanislaus Regional Transit Authority
* Taft City Taft Area Transit
* Tehama County Transit  Tehama Rural Area Express
* Transportation Trinity County Department Trinity Transit
* Transit Joint Powers Authority For Merced County Merced The Bus
* Visalia City Visilia Transit
* Yolo County Transportation District Yolobus


In [None]:
# Manual crosswalk: cleaned BlackCat/SGR organization name -> Airtable
# service name, for organizations the fuzzy matching could not pair up.
to_map = {
    # NOTE(review): Tulare and Tuolumne are different counties -- confirm
    # this pairing is intended.
    "Tulare County": "Tuolumne County Transit",
    "Turlock City": "Turlock Transit",
    "Union City City": "Union City Transit",
    "Calaveras Transit": "Calaveras Connect",
    "Alameda-Contra Costa Transit District": "Ac Transit",
    "Arcadia City": "Arcadia Transit",
    "Banning City": "Banning Pass Transit",
    "Beaumont City": "Beaumont Pass Transit",
    "Calaveras Council Of Governments": "Calaveras Connect",
    "Camarillo City": "Camarillo Area Transit",
    "Commerce City": "Commerce Municipal Bus Lines",
    "Corona City": "Corona Cruiser",
    "Delano City": "Delano Area Rapid Transit",
    "Eastern Sierra Transit Authority": "Eastern Sierra Transit Authority Community Routes",
    "Elk Grove City": "Elk Grove Transit Services",
    "Fairfield City": "Fairfield And Suisun Transit",
    "Folsom City": "Folsom Stage Line",
    "Glenn County": "Glenn Ride",
    "Guadalupe City": "Guadalupe Flyer",
    "Lassen County": "Lassen Rural Bus",
    "Marin County Transit District": "Marin Transit",
    "Madera County": "Madera Metro",
    "Mariposa County": "Mariposa Grove Shuttle",
    "Morro Bay City": "Morro Bay Transit",
    "Norwalk City": "Norwalk Transit System",
    "Roseville City": "Roseville Transit",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District Bus",
    "San Diego City": "San Diego Trolley",
    "San Francisco City": "Muni Bus",
    "Santa Rosa City": "Santa Rosa Citybus",
    "Shafter City": "Shafter Dial-A-Ride",
    "Sierra County": "Sierra Point Shuttle",
    "Simi Valley City": "Simi Valley Transit",
    "Sonoma Marin Area Rail Transit": "Sonoma-Marin Area Rail Transit",
    "Arvin City": "Arvin Transit",
    "Auburn City": "Auburn Transit",
    "County Of Los Angeles - Department Of Public Works": "Los Angeles County Transit Services",
    "County Of Sacramento Department Of Transportation": "Sacrt Bus",
    "Dinuba City": "Dinuba Connection",
    "Lassen Transit Service": "Lassen Rural Bus",
    "Needles City": "Needles Area Transit",
    # "County" belongs to the organization name, not the service name
    # (previously "Nevada Public Works": "County Nevada County Connects").
    "Nevada Public Works County": "Nevada County Connects",
    "Ojai City": "Ojai Trolley",
    "Palo Verde Valley Transit": "Palos Verdes Peninsula Transit Authority",
    "Placer County Public Works": "Placer County Transit",
    "Plumas County Transportation Commission": "Plumas Transit Systems",
    "Porterville City": "Porterville Transit",
    "Ridgecrest City": "Ridgecrest Transit",
    "Rio Vista City": "Rio Vista Delta Breeze",
    "Santa Maria City": "Santa Maria Regional Transit",
    "Siskiyou County": "Siskiyou Transit And General Express",
    "Taft City": "Taft Area Transit",
    "Tehama County Transit": "Tehama Rural Area Express",
    "Transportation Trinity County Department": "Trinity Transit",
    # Same split error as above
    # (previously "...For Merced": "County Merced The Bus").
    "Transit Joint Powers Authority For Merced County": "Merced The Bus",
    "Visalia City": "Visilia Transit",
    "Yolo County Transportation District": "Yolobus",
}

In [None]:
still_need_matches.organization_name = still_need_matches.organization_name.replace(
    to_map
)

In [None]:
still_need_matches.head()

In [None]:
# Concat
blackcat_cleaned = pd.concat([found_matches, still_need_matches], axis=0)

In [None]:
len(blackcat_cleaned)

In [None]:
blackcat_cleaned.organization_name = blackcat_cleaned.organization_name.replace(
    to_map
)

In [None]:
m3 = pd.merge(
    blackcat_cleaned,
    airtable5,
    how="left",
    left_on="organization_name",
    right_on="name",
)

In [None]:
m3.service_type = m3.service_type.fillna("no service info")

In [None]:
m3 = m3.drop(columns=["name"])

In [None]:
m3.service_type.value_counts()

In [None]:
m3 = m3.fillna("NA")

In [None]:
final_subset = [
    "funding_program",
    "organization_name",
    "first_name",
    "last_name",
    "email",
    "phone",
    "title",
    "service_type",
]

In [None]:
m3 = m3[final_subset]

In [None]:
def delete_repeated_element(df, col: str):
    """
    Remove repeated entries from a comma-separated string column.

    Each value of `col` is split on commas, the pieces are stripped of
    whitespace and de-duplicated, and the survivors are re-joined with
    ", ". The first occurrence of each piece keeps its position, so the
    output is the same on every run (a plain set() gave an arbitrary
    order that could change between runs).

    The column is overwritten on `df`, which is also returned. Every
    value in `col` must be a string.
    """

    def _dedupe(text: str) -> str:
        pieces = (piece.strip() for piece in text.split(","))
        # dict.fromkeys drops duplicates while preserving insertion order
        return ", ".join(dict.fromkeys(pieces))

    df[col] = df[col].apply(_dedupe).str.strip()
    return df

In [None]:
m3.service_type = m3.service_type.str.title()

In [None]:
m3 = delete_repeated_element(m3, "funding_program")

In [None]:
m3 = m3.drop_duplicates(subset = ['organization_name']).sort_values(["organization_name"]).reset_index(drop=True)

In [None]:
m3.shape, m3.organization_name.nunique()

In [None]:
m3.head()

In [None]:
m3.to_excel(f"{gcs_path}5311_SGR_Recipients.xlsx")

In [None]:
m3.organization_name.tolist()