## BlackCat Organizations
5/19
* Most recent records
* Transit agencies that have fixed-route services (more than the vendors contracted to  provide dial-a-ride / paratransit services—I recall there being a bunch of those in the  BlackCat export I got last year).
* 5311, SGR then + Clovis

In [1]:
import fuzzywuzzy
import pandas as pd
import siuba  # need this to do type hint in functions
from calitp_data_analysis.sql import to_snakecase
from calitp_data_analysis.tables import tbls
from fuzzywuzzy import process
from siuba import *


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas  # type: ignore


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
gcs_path = "gs://calitp-analytics-data/data-analyses/grant_misc/"

### Blackcat

In [4]:
blackcat = to_snakecase(
    pd.read_excel(f"{gcs_path}BlackCat_Grants_Projects_5_22_23.xlsx")
)

In [5]:
blackcat.sample()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,grant_encumbered_amount,local_encumbered_amount,total_encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
1484,2019,5310 Trad,CA-2020-244 | 0020000273-T,2019,"Pride Industries One, Inc.",BCG0001688,Purchase Replacement < 30 Ft Bus,111204,64AC19-01218,64960.0,78411.0,0.0,78411.0,78410.76,-13450.76,0.0,Open,,,


In [6]:
blackcat.shape

(3145, 20)

In [7]:
# Filter grant fiscal year
blackcat2 = blackcat[blackcat.grant_fiscal_year >= 2018].reset_index(drop=True)

In [8]:
blackcat2.shape, blackcat2.grant_fiscal_year.value_counts()

((2265, 20),
 2019    885
 2021    672
 2020    339
 2022    207
 2018    162
 Name: grant_fiscal_year, dtype: int64)

In [9]:
blackcat2.columns

Index(['grant_fiscal_year', 'funding_program', 'grant_number', 'project_year',
       'organization_name', 'upin', 'description', 'ali', 'contract_number',
       'allocationamount', 'grant_encumbered_amount',
       'local_encumbered_amount', 'total_encumbered_amount', 'expendedamount',
       'activebalance', 'closedoutbalance', 'project_status',
       'project_closed_by', 'project_closed_date', 'project_closed_time'],
      dtype='object')

In [10]:
grants_subset = [
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311",
    "5311(f) Round 2",
    "5339 (State)",
    "Section 5311(f)",
]

In [11]:
blackcat2 = blackcat2[blackcat2.funding_program.isin(grants_subset)]

In [12]:
# Subset to only organizations
organizations = (
    blackcat2[["organization_name", "grant_fiscal_year", "funding_program"]]
    .sort_values(by=["organization_name", "grant_fiscal_year"], ascending=[True, False])
    .drop_duplicates(subset=["organization_name", "funding_program"])
    .reset_index(drop=True)
)

In [13]:
len(organizations)

177

In [14]:
def summarize_rows(df, col_to_group: str, col_to_summarize: str) -> pd.DataFrame:
    """
    Puts all the elements in the column "col to summarize"
    onto one line and separates them by commas.
    """
    df = df.groupby(col_to_group)[col_to_summarize].apply(",".join).reset_index()
    return df

In [15]:
organizations = summarize_rows(
    organizations, ["organization_name", "grant_fiscal_year"], "funding_program"
)

In [16]:
organizations = (
    organizations.sort_values(
        by=["organization_name", "grant_fiscal_year"], ascending=[True, False]
    )
    .drop_duplicates(subset=["organization_name"])
    .reset_index(drop=True)
)

In [17]:
organizations.shape, organizations.organization_name.nunique()

((92, 3), 92)

In [18]:
organizations

Unnamed: 0,organization_name,grant_fiscal_year,funding_program
0,Alpine County Community Development,2022,Section 5311
1,Amador Transit,2022,"5339 (State),Section 5311"
2,Butte County Association of Governments/ Butte Regional Transit,2022,"Section 5311,Section 5311(f)"
3,Calaveras County Public Works,2018,5311(f) Cont
4,Calaveras Transit Agency,2022,Section 5311
5,City of Arcata,2019,Section 5311
6,City of Arvin,2022,Section 5311
7,City of Auburn,2022,Section 5311
8,City of California City,2022,Section 5311
9,City of Chowchilla,2022,Section 5311


### State of Good Repair

In [19]:
sgr = to_snakecase(pd.read_excel(f"{gcs_path}SGR Calsmart-user-list request.xls"))

In [20]:
sgr_subset = ["first_name", "last_name", "email", "phone", "title", "agency"]

In [21]:
sgr2 = sgr[sgr_subset]

In [22]:
sgr2 = sgr2.drop_duplicates("agency").reset_index(drop=True)

In [23]:
sgr2.agency.value_counts().head()

Alameda-Contra Costa Transit District       1
San Diego Association of Governments        1
Morongo Basin Transit Authority             1
Mountain Area Regional Transit Authority    1
Napa Valley Transportation Authority        1
Name: agency, dtype: int64

In [24]:
len(sgr2), len(sgr)

(195, 474)

In [25]:
sgr2["funding_program"] = "State of Good Repair"

In [26]:
sgr2.shape, sgr.agency.nunique()

((195, 7), 195)

### Merge BlackCat w/ SGR

In [27]:
def clean_punctuation(df, agency_col: str) -> pd.DataFrame:
    """
    Cleans up agency names. Assume anything after comma/()/
    ; are acronyms and delete them. Correct certain mispellings.
    Change agency names to title case. Clean whitespaces.
    """
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.replace("*", "")
        .str.replace("Agency", "")
        .str.strip()  # strip whitespaces again after getting rid of certain things
    )
    return df

In [28]:
def flip_county_city(df, agency_col: str):
    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb
    to_correct = df[
        (df[agency_col].str.contains("County")) | (df[agency_col].str.contains("City"))
    ]
    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop=True)
    to_correct["str_len"] = to_correct[agency_col].str.split().str.len()
    to_correct = to_correct[to_correct.str_len <= 5].reset_index(drop=True)
    to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(
        " Of ", 1, expand=True
    )
    to_correct["new_name"] = to_correct["name_pt2"] + " " + to_correct["name_pt1"]

    new_names_dictionary = dict(to_correct[[agency_col, "new_name"]].values)
    df["agency_corrected"] = df[agency_col].map(new_names_dictionary)
    df["agency_corrected"] = df["agency_corrected"].fillna(df[agency_col])

    df = df.drop(columns=[agency_col])
    df = df.rename(columns={"agency_corrected": agency_col})

    return df

In [29]:
def clean_organization_names(df, agency_col: str):
    df = clean_punctuation(df, agency_col)
    df = flip_county_city(df, agency_col)
    return df

In [30]:
organizations = clean_organization_names(organizations, "organization_name")

  df[agency_col]
  to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(


In [31]:
len(organizations)

92

In [32]:
sgr2 = clean_organization_names(sgr2, "agency")

  df[agency_col]
  to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(


In [33]:
def replace_matches_set_ratio(df, column, new_col_name, string_to_match, min_ratio):
    # Get a list of unique strings
    strings = df[column].unique()

    # Get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_set_ratio
    )

    # Only get matches with a  min ratio
    close_matches = [matches[0] for matches in matches if matches[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [34]:
def find_fuzzy_match(
    df1,
    df2,
    df1_fuzzy_column: str,
    df2_fuzzy_column: str,
    new_column: str,
    min_ratio: int,
):
    unique_values = df1[df1_fuzzy_column].unique().tolist()
    for i in unique_values:
        replace_matches_set_ratio(df2, df2_fuzzy_column, new_column, i, min_ratio)
    return df2

In [35]:
organizations = find_fuzzy_match(
    sgr, organizations, "agency", "organization_name", "fuzzy_match_agency", 95
)

In [36]:
organizations

Unnamed: 0,grant_fiscal_year,funding_program,organization_name,fuzzy_match_agency
0,2022,Section 5311,Alpine County Community Development,Alpine County
1,2022,"5339 (State),Section 5311",Amador Transit,Livermore-Amador Valley Transit Authority
2,2022,"Section 5311,Section 5311(f)",Butte County Association Of Governments Butte Regional Transit,Butte County Association of Governments
3,2018,5311(f) Cont,Calaveras County Public Works,Calaveras County
4,2022,Section 5311,Calaveras Transit,
5,2019,Section 5311,Arcata City,City of Arcata
6,2022,Section 5311,Arvin City,City of Arvin
7,2022,Section 5311,Auburn City,City of Auburn
8,2022,Section 5311,California City City,City of California City
9,2022,Section 5311,Chowchilla City,City of Chowchilla


In [37]:
fuzzy_to_del = [
    "Amador Transit",
    "Eastern Contra Costa Transit Authority",
    "Madera County",
]

In [38]:
for i in fuzzy_to_del:
    organizations.loc[
        organizations["organization_name"].eq(i), "fuzzy_match_agency"
    ] = None

In [39]:
organizations.fuzzy_match_agency = organizations.fuzzy_match_agency.fillna(
    organizations.organization_name
)

In [40]:
# organizations.sort_values(by = ['organization_name'])

In [41]:
# organizations = organizations.drop(columns = ['organization_name']).rename(columns = {'fuzzy_match_agency':'organization_name'})

In [42]:
m1 = pd.merge(
    organizations,
    sgr2,
    left_on=["fuzzy_match_agency"],
    right_on=["agency"],
    how="outer",
    indicator=True,
)

In [43]:
m1._merge.value_counts()

right_only    150
left_only      47
both           45
Name: _merge, dtype: int64

In [44]:
m1.funding_program_x = m1.funding_program_x.fillna("State of Good Repair")

In [45]:
m1.funding_program_y = m1.funding_program_y.fillna(m1.funding_program_x)

In [46]:
m1["funding_program"] = m1.funding_program_x + "," + m1.funding_program_y

In [47]:
m1.shape, m1.organization_name.nunique()

((242, 13), 92)

In [48]:
m1.organization_name = m1.organization_name.fillna(m1.agency)

In [49]:
m1.sort_values(by = ['organization_name', 'agency'])

Unnamed: 0,grant_fiscal_year,funding_program_x,organization_name,fuzzy_match_agency,first_name,last_name,email,phone,title,funding_program_y,agency,_merge,funding_program
92,,State of Good Repair,Alameda-Contra Costa Transit District,,Eve,Ng,grants@actransit.org,5108915405.0,Capital Planning and Grants Manager,State of Good Repair,Alameda-Contra Costa Transit District,right_only,"State of Good Repair,State of Good Repair"
0,2022.0,Section 5311,Alpine County Community Development,Alpine County,Ethan,Gray,egray@alpinecountyca.gov,5306942140.0,Community Development Deputy Director,State of Good Repair,Alpine County,both,"Section 5311,State of Good Repair"
93,,State of Good Repair,Alpine County Local Transportation Commission,,Scott,Maas,smaas@citlink.net,5302600991.0,Transportation Program Manager,State of Good Repair,Alpine County Local Transportation Commission,right_only,"State of Good Repair,State of Good Repair"
1,2022.0,"5339 (State),Section 5311",Amador Transit,Amador Transit,Patricia,Amarant,maggie@amadortransit.com,2092675079.0,General Manager,State of Good Repair,Amador Transit,both,"5339 (State),Section 5311,State of Good Repair"
94,,State of Good Repair,Antelope Valley Transit Authority,,Judy,Fry,jfry@avta.com,6617292234.0,Chief Financial Officer,State of Good Repair,Antelope Valley Transit Authority,right_only,"State of Good Repair,State of Good Repair"
100,,State of Good Repair,Arcadia City,,Jayme,Admin,supercali707@gmail.com,7076854324.0,Admin Tester,State of Good Repair,Arcadia City,right_only,"State of Good Repair,State of Good Repair"
101,,State of Good Repair,Arcata City,,Marcela,Jimenez,mjimenez@cityofarcata.org,5107349099.0,Engineering Aide,State of Good Repair,Arcata City,right_only,"State of Good Repair,State of Good Repair"
5,2019.0,Section 5311,Arcata City,City of Arcata,,,,,,Section 5311,,left_only,"Section 5311,Section 5311"
102,,State of Good Repair,Arvin City,,Jeff,Jones,jeffjones@arvin.org,6618543134.0,Finance Director,State of Good Repair,Arvin City,right_only,"State of Good Repair,State of Good Repair"
6,2022.0,Section 5311,Arvin City,City of Arvin,,,,,,Section 5311,,left_only,"Section 5311,Section 5311"


In [50]:
m1.funding_program = m1.funding_program.fillna(m1.funding_program_y)

In [51]:
cols_to_drop = [
    "funding_program_x",
    "funding_program_y",
    "fuzzy_match_agency",
    "agency",
    "grant_fiscal_year",
]

In [52]:
m1 = m1.drop(columns=cols_to_drop)

In [53]:
m1.organization_name.nunique(), m1.shape

(212, (242, 8))

In [54]:
m2 = summarize_rows(
    m1,
    ["organization_name"],
    "funding_program",
)

In [55]:
m2 = pd.merge(m2, m1.drop(columns = ['funding_program']), on = ['organization_name'], how = "left")

In [56]:
m2 = m2.sort_values(['organization_name']).drop_duplicates(subset = ['organization_name'])

In [57]:
m2.organization_name.nunique(), m2.shape

(212, (212, 8))

### Airtable
* Grab only fixed route providers.

In [58]:
airtable = tbls.external_airtable.california_transit__services() >> collect()

In [59]:
# airtable.columns

In [60]:
# ['name','service_type', 'service_operator_type', 'operator_organization_type']
airtable_subset = ["name", "service_type"]

In [61]:
airtable2 = airtable[airtable_subset]

In [62]:
airtable2.name.nunique()

994

In [63]:
airtable2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291736 entries, 0 to 291735
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          291735 non-null  object
 1   service_type  291736 non-null  object
dtypes: object(2)
memory usage: 4.5+ MB


In [64]:
airtable2.sort_values(["name"]).head()

Unnamed: 0,name,service_type
10281,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
106457,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
6882,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
98,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"
30649,Laguna Niguel Senior Mobility Program,"[on-demand, NEMT]"


In [65]:
airtable3 = airtable2.explode("service_type").reset_index(drop=True)

In [66]:
airtable3.head()

Unnamed: 0,name,service_type
0,Topanga Beach Bus,fixed-route
1,St Pauls PACE,NEMT
2,Dodge Ridge Ski Bus,deviated fixed-route
3,Dodge Ridge Ski Bus,reservations
4,SacRT GO,ADA paratransit


In [67]:
airtable3.service_type = airtable3.service_type.fillna("NA")

In [68]:
# fixed route only
#airtable4 = airtable3[airtable3.service_type.str.lower().str.contains("fixed")]

In [69]:
airtable5 = (
    airtable3.drop_duplicates(["name"]).sort_values(by=["name"]).reset_index(drop=True)
)

In [70]:
len(airtable5)

995

In [71]:
airtable5.name.nunique()

994

In [72]:
airtable5 = airtable5.fillna('NA')

In [73]:
airtable5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          995 non-null    object
 1   service_type  995 non-null    object
dtypes: object(2)
memory usage: 15.7+ KB


### Merge

In [74]:
airtable5 = clean_organization_names(airtable5, "name")

  df[agency_col]
  to_correct[["name_pt1", "name_pt2"]] = to_correct[agency_col].str.split(


In [75]:
m2 = find_fuzzy_match(airtable5, m2, "name", "organization_name", "fuzzy_agency", 95)

In [76]:
for i in [
    "Eastern Contra Costa Transit Authority",
    "Fresno County",
    "Livermore Amador Valley Transit Authority",
    "Eastern Sierra Transit Authority",
]:
    m2.loc[m2["organization_name"].eq(i), "fuzzy_agency"] = None

In [77]:
m2.loc[m2.fuzzy_agency.notna()][["organization_name", "fuzzy_agency"]].sort_values(
    ["organization_name"]
)

Unnamed: 0,organization_name,fuzzy_agency
0,Alameda-Contra Costa Transit District,Alameda-Contra Costa Transit District
1,Alpine County Community Development,Alpine County
2,Alpine County Local Transportation Commission,Alpine County
3,Amador Transit,Amador Transit Dial-A-Ride
4,Antelope Valley Transit Authority,Antelope Valley Transit Authority
5,Arcadia City,Arcadia City
6,Arcata City,Arcata City
8,Arvin City,Arvin City
10,Auburn City,Auburn City
18,California City City,California City Dial-A-Ride


In [78]:
# Reverse -> replace
"""
for i in airtable5.name.unique().tolist():
       replace_matches_set_ratio(
        organizations, "organization_name", "project_name_fuzzy", i, 95) """

'\nfor i in airtable5.name.unique().tolist():\n       replace_matches_set_ratio(\n        organizations, "organization_name", "project_name_fuzzy", i, 95) '

In [79]:
# Fuzzy matches that didn't work
# Use organization name
fuzzy_matches_to_filter = [
    "Eastern Contra Costa Transit Authority",
    "Livermore Amador Valley Transit Authority",
    "Calaveras Transit",
    "City Of Corcoran - Corcoran Area Transit",
]

In [80]:
# These are the fuzzy matches that worked.
found_matches = (
    m2[((~m2.fuzzy_agency.isna()) & ~(m2.fuzzy_agency.isin(fuzzy_matches_to_filter)))]
).reset_index(drop=True)

In [81]:
found_matches.shape

(61, 9)

In [82]:
still_need_matches = m2[
    ~m2.organization_name.isin(found_matches.organization_name.tolist())
]

* Calaveras Transit change to Calaveras Connect
* Arvin City  Arvin Transit
* Auburn City Auburn Transit
* County Of Los Angeles - Department Of Public Works, Los Angeles County Transit Services
* 34	County Of Sacramento Department Of Transportation Sacrt Bus
* County Of Shasta Department Of Public Works
* Dinuba City Dinuba Connection
* Lassen Transit Service Lassen Rural Bus
* Needles City  Needles Area Transit
* Nevada Public Works County Nevada County Connects
* Ojai City Ojai Trolley
* Palo Verde Valley Transit Palos Verdes Peninsula Transit Authority
* Placer County Public Works Placer County Transit
* Plumas County Transportation Commission Plumas Transit Systems
* Porterville City  Porterville Transit
* Ridgecrest City Ridgecrest Transit
* Rio Vista City Rio Vista Delta Breeze
* Santa Maria City Santa Maria Regional Transit
* Siskiyou County Siskiyou Transit And General Express
* Stanislaus County Public Works - Transit Division  Stanislaus Regional Transit Authority
* Taft City Taft Area Transit
* Tehama County Transit  Tehama Rural Area Express
* Transportation Trinity County Department Trinity Transit
* Transit Joint Powers Authority For Merced County Merced The Bus
* Visalia City Visilia Transit
* Yolo County Transportation District Yolobus


In [83]:
to_map = {
    "Tulare County": "Tuolumne County Transit",
    "Turlock City": "Turlock Transit",
    "Union City City": "Union City Transit",
    "Calaveras Transit": "Calaveras Connect",
    "Alameda-Contra Costa Transit District": "Ac Transit",
    "Arcadia City": "Arcadia Transit",
    "Banning City": "Banning Pass Transit",
    "Beaumont City": "Beaumont Pass Transit",
    "Calaveras Council Of Governments": "Calaveras Connect",
    "Camarillo City": "Camarillo Area Transit",
    "Commerce City": "Commerce Municipal Bus Lines",
    "Corona City": "Corona Cruiser",
    "Delano City": "Delano Area Rapid Transit",
    "Eastern Sierra Transit Authority": "Eastern Sierra Transit Authority Community Routes",
    "Elk Grove City": "Elk Grove Transit Services",
    "Fairfield City": "Fairfield And Suisun Transit",
    "Folsom City": "Folsom Stage Line",
    "Glenn County": "Glenn Ride",
    "Guadalupe City": "Guadalupe Flyer",
    "Lassen County": "Lassen Rural Bus",
    "Marin County Transit District": "Marin Transit",
    "Madera County": "Madera Metro",
    "Mariposa County": "Mariposa Grove Shuttle",
    "Morro Bay City": "Morro Bay Transit",
    "Norwalk City": "Norwalk Transit System",
    "Roseville City": "Roseville Transit",
    "Sacramento Regional Transit District": "Sacramento Regional Transit District Bus",
    "San Diego City": "San Diego Trolley",
    "San Francisco City": "Muni Bus",
    "Santa Rosa City": "Santa Rosa Citybus",
    "Shafter City": "Shafter Dial-A-Ride",
    "Sierra County": "Sierra Point Shuttle",
    "Simi Valley City": "Simi Valley Transit",
    "Sonoma Marin Area Rail Transit": "Sonoma-Marin Area Rail Transit",
    "Arvin City": "Arvin Transit",
    "Auburn City": "Auburn Transit",
    "County Of Los Angeles - Department Of Public Works": "Los Angeles County Transit Services",
    "County Of Sacramento Department Of Transportation": "Sacrt Bus",
    "Dinuba City": "Dinuba Connection",
    "Lassen Transit Service": "Lassen Rural Bus",
    "Needles City": "Needles Area Transit",
    "Nevada Public Works": "County Nevada County Connects",
    "Ojai City": "Ojai Trolley",
    "Palo Verde Valley Transit": "Palos Verdes Peninsula Transit Authority",
    "Placer County Public Works": "Placer County Transit",
    "Plumas County Transportation Commission": "Plumas Transit Systems",
    "Porterville City": "Porterville Transit",
    "Ridgecrest City": "Ridgecrest Transit",
    "Rio Vista City": "Rio Vista Delta Breeze",
    "Santa Maria City": "Santa Maria Regional Transit",
    "Siskiyou County": "Siskiyou Transit And General Express",
    "Taft City": "Taft Area Transit",
    "Tehama County Transit": "Tehama Rural Area Express",
    "Transportation Trinity County Department": "Trinity Transit",
    "Transit Joint Powers Authority For Merced": "County Merced The Bus",
    "Visalia City": "Visilia Transit",
    "Yolo County Transportation District": "Yolobus",
}

In [84]:
still_need_matches.organization_name = still_need_matches.organization_name.replace(
    to_map
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  still_need_matches.organization_name = still_need_matches.organization_name.replace(


In [85]:
# Concat
blackcat_cleaned = pd.concat([found_matches, still_need_matches], axis=0)

In [86]:
len(m1)

242

In [87]:
len(blackcat_cleaned)

212

In [88]:
m3 = pd.merge(
    blackcat_cleaned,
    airtable5,
    how="left",
    left_on="organization_name",
    right_on="name",
)

In [89]:
m3.service_type = m3.service_type.fillna("no service info")

In [90]:
m3 = m3.drop(columns=["name"])

In [91]:
m3.service_type.value_counts()

no service info         133
fixed-route              58
NA                       10
deviated fixed-route      8
on-demand                 3
ADA paratransit           3
seasonal                  1
Name: service_type, dtype: int64

In [92]:
m3 = m3.fillna("NA")

In [93]:
final_subset = [
    "funding_program",
    "organization_name",
    "first_name",
    "last_name",
    "email",
    "phone",
    "title",
    "service_type",
]

In [94]:
m3 = m3[final_subset]

In [95]:
def delete_repeated_element(df, col: str):
    df[col] = (
        df[col]
        .apply(lambda x: ", ".join(set([y.strip() for y in x.split(",")])))
        .str.strip()
    )
    return df

In [96]:
m3.service_type = m3.service_type.str.title()

In [97]:
m3 = delete_repeated_element(m3, "funding_program")

In [98]:
m3 = m3.drop_duplicates(subset = ['organization_name']).sort_values(["organization_name"]).reset_index(drop=True)

In [99]:
m3.shape, m3.organization_name.nunique()

((210, 8), 210)

In [101]:
m3.to_excel(f"{gcs_path}5311_SGR_Recipients_5_24_23.xlsx")