## How much funding did each agency receive by each fiscal year and grant, in the past five years? 
* For Hubspot/CRM
* [Research Request](https://github.com/cal-itp/data-analyses/issues/333)

In [1]:
import calitp.magics
import pandas as pd
from calitp import *
from calitp.tables import tbl
from siuba import *

# Formatting the notebook
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)



In [2]:
df = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx",
        sheet_name="Grant Projects",
    )
)

### Filter out what I want 

In [4]:
# Grants wanted
grants_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [5]:
# Filter for grants
df2 = df[df["funding_program"].isin(grants_wanted)]

In [6]:
# Replace the different variations of 5311/5310 with broader names
df2["funding_program_broad"] = df2["funding_program"].replace(
    {
        "5310 Exp": "5310",
        "5310 Trad": "5310",
        "Section 5311": "5311",
        "5311(f) Cont": "5311",
        "CMAQ (FTA 5311)": "5311",
        "Section 5311(f)": "5311",
        "5311(f) Round 2": "5311",
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["funding_program_broad"] = df2["funding_program"].replace(


In [7]:
# Keep only certain columns
wanted_columns = [
    "grant_fiscal_year",
    "funding_program_broad",
    "grant_number",
    "project_year",
    "organization_name",
    "allocationamount",
]

In [8]:
# New subset dataframe
df3 = df2[wanted_columns]

In [9]:
# Only want data for last five years
df4 = df3[df3["grant_fiscal_year"] > 2017]

In [10]:
# Get a list of the unique organizations
original_orgs = set(df4.organization_name.unique().tolist())

In [11]:
len(original_orgs)

180

In [12]:
df4.shape

(1329, 6)

In [13]:
df4 = utils.organization_cleaning(df4, "organization_name")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_wanted] = (


In [14]:
# Group by to summarize information.
agg1 = (
    df4.groupby(
        [
            "organization_name",
            "funding_program_broad",
            "grant_fiscal_year",
        ]
    )
    .agg({"allocationamount": "sum"})
    .reset_index()
)

In [15]:
agg1.shape

(532, 4)

In [16]:
agg1["organization_name"].nunique()

180

### Add the URL of each agency's website - Airtable

In [17]:
%%sql -o airtable_grants
SELECT 
  CAST(itp_id AS INT) AS itp_id,
  name,
  website
FROM cal-itp-data-infra.mart_transit_database.dim_organizations
WHERE itp_id IS NOT NULL
  AND itp_id > 0
    OR website IS NOT NULL
ORDER BY itp_id ASC

Unnamed: 0,itp_id,name,website
0,,Trinity County Transportation Commission,https://www.trinitycounty.org/Transportation-Commission
1,,Mono County Local Transportation Commission,https://monocounty.ca.gov/ltc
2,,Vivalon Inc.,https://vivalon.org/
3,,Stanislaus Council of Governments,http://www.stancog.org/
4,,Amador County Transportation Commission,actc-amador.org
5,,Tehama County Transportation Commission,https://tehamartpa.org/
6,,TripShot Inc.,https://tripshot.com
7,,Centum Adetel,https://www.adetelsolution.com/
8,,Friends Outside,http://www.friendsoutside.org/Programs-and-Services/Visitor-Centers
9,,City and County Association of Governments of San Mateo County,https://ccag.ca.gov/


In [18]:
len(airtable_grants)

528

In [19]:
# Some orgs that are different share the same Cal ITP ID
# EX: RIverside University Health System and Redwood Coast Seniors
airtable_grants["itp_id"].nunique()

370

In [20]:
airtable_grants2 = airtable_grants.drop_duplicates(subset=["itp_id"])

In [21]:
len(airtable_grants2)

371

In [22]:
airtable_grants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   itp_id   376 non-null    float64
 1   name     528 non-null    object 
 2   website  391 non-null    object 
dtypes: float64(1), object(2)
memory usage: 12.5+ KB


In [23]:
m1 = pd.merge(
    agg1,
    airtable_grants,
    how="outer",
    left_on=["organization_name"],
    right_on=["name"],
    indicator=True,
    validate="m:1",
)

In [24]:
m1["_merge"].value_counts()

right_only    432
both          339
left_only     193
Name: _merge, dtype: int64

In [25]:
m1 = m1.drop(columns=["_merge"])

In [26]:
# Change the Cal ITP ID to be integer to match m1's Cal ITP
m1["itp_id"] = m1["itp_id"].fillna(0).astype("int64")

In [27]:
len(m1)

964

In [28]:
m1["website"].nunique()

389

### Add URL with GTFS Schedule

In [29]:
gtfs_schedule = tbl.gtfs_schedule.agency() >> collect() >> distinct()

In [30]:
# Subset
gtfs_schedule2 = gtfs_schedule[
    ["calitp_itp_id", "agency_id", "agency_name", "agency_url"]
]

In [31]:
# There are duplicated Cal ITP IDs, delete duplicates
gtfs_schedule3 = gtfs_schedule2.drop_duplicates(subset=["calitp_itp_id"], keep="first")

In [32]:
# Check data types
gtfs_schedule3.dtypes, m1.dtypes

(calitp_itp_id     int64
 agency_id        object
 agency_name      object
 agency_url       object
 dtype: object,
 organization_name         object
 funding_program_broad     object
 grant_fiscal_year        float64
 allocationamount         float64
 itp_id                     int64
 name                      object
 website                   object
 dtype: object)

In [33]:
m2 = pd.merge(
    m1,
    gtfs_schedule3,
    how="left",
    left_on=["itp_id"],
    right_on=["calitp_itp_id"],
    indicator=True,
    validate="m:1",
)

In [34]:
len(m2)

964

### Clean up 

In [35]:
# Create a new col to populate websites from the two data sets
m2["website_use"] = m2["website"]

In [36]:
m2["website_use"] = m2["website_use"].fillna(m2["agency_url"])

In [37]:
m2_cols_to_keep = [
    "organization_name",
    "funding_program_broad",
    "grant_fiscal_year",
    "allocationamount",
    "website_use",
]

In [38]:
m3 = m2[m2_cols_to_keep]

In [39]:
# Did an outer join for airtable + black cat.
# Delete all the records that are only found in airtable
m4 = m3.dropna(subset=["organization_name", "allocationamount"])

In [40]:
m4["website_use"] = m4["website_use"].fillna("N/A")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m4["website_use"] = m4["website_use"].fillna("N/A")


### Aggregate Again

In [41]:
m4 = m4.rename(
    columns={
        "organization_name": "Organization",
        "website_use": "Website",
        "funding_program_broad": "Grant",
        "allocationamount": "Allocation",
        "grant_fiscal_year": "Grant Fiscal Year",
    }
)

In [42]:
# Group by to summarize information.
agg2 = m4.groupby(
    [
        "Organization",
        "Website",
        "Grant",
        "Grant Fiscal Year",
    ]
).agg({"Allocation": "sum"})

In [43]:
agg2.sample(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Allocation
Organization,Website,Grant,Grant Fiscal Year,Unnamed: 4_level_1
City of Wasco,,5311,2022.0,185124.0
Imperial County Transportation Commission,https://www.imperialctc.org/transit-services,5311,2022.0,538038.0
San Benito County Local Transportation Authority,http://www.sanbenitocountyexpress.org/,5311,2018.0,304997.0
City of Chowchilla,,5311,2022.0,94175.0


In [44]:

with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_hubspot.xlsx"
) as writer:
    agg2.to_excel(writer, sheet_name="5311_5310_Applicants", index=True)


## Airtable

In [45]:
airtable_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "5339 (National)",
    "5339 (State)",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [46]:
airtable = df[df["funding_program"].isin(airtable_wanted)]

In [47]:
airtable["funding_program"].value_counts()

5310 Trad          986
Section 5311       720
5310 Exp           166
Section 5311(f)    140
5339 (State)       129
5339 (National)     48
CMAQ (FTA 5311)     44
5311(f) Cont        41
5311(f) Round 2     27
Name: funding_program, dtype: int64

In [48]:
airtable = airtable[airtable["project_year"] > 2018]

In [49]:
airtable = airtable[["funding_program", "organization_name"]]

In [50]:
airtable.sample(3)

Unnamed: 0,funding_program,organization_name
1488,5310 Trad,"Pride Industries One, Inc."
1589,5310 Trad,United Cerebral Palsy of San Luis Obispo County
2521,Section 5311,San Joaquin Regional Transit District


In [51]:
df_5311 = airtable[(airtable.funding_program.str.contains("5311", case=False))]

In [52]:
df_5310 = airtable[(airtable.funding_program.str.contains("5310", case=False))]

In [53]:
df_5339 = airtable[(airtable.funding_program.str.contains("5339", case=False))]

In [54]:
for i in [df_5311, df_5310, df_5339]:
    print(i["funding_program"].value_counts())
    print(len(i)) 

Section 5311       416
Section 5311(f)    112
5311(f) Round 2     27
CMAQ (FTA 5311)     24
Name: funding_program, dtype: int64
579
5310 Trad    547
5310 Exp      88
Name: funding_program, dtype: int64
635
5339 (State)       98
5339 (National)    30
Name: funding_program, dtype: int64
128


In [55]:
m_5311_5310 = pd.merge(
    df_5311,
    df_5310,
    how="outer",
    on=["organization_name"],
    indicator=True,
)

In [56]:
m_5311_5310["_merge"].value_counts()

both          1038
right_only     534
left_only      374
Name: _merge, dtype: int64

In [57]:
len(m_5311_5310.drop_duplicates(subset=["organization_name"]))

177

In [58]:
m2_5311_5310 = m_5311_5310.drop_duplicates(subset=["organization_name"])

In [59]:
m2_5311_5310["organization_name"].nunique()

177

In [60]:
m2_5311_5310["5311_5310_overlap"] = m2_5311_5310["_merge"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m2_5311_5310["5311_5310_overlap"] = m2_5311_5310["_merge"]


In [61]:
m2_5311_5310 = m2_5311_5310[["5311_5310_overlap", "organization_name"]]

In [62]:
m2_5311_5310["5311_5310_overlap"] = m2_5311_5310["5311_5310_overlap"].replace(
    {"right_only": "5310 only", "left_only": "5311 only", "both": "Both 5311 and 5310"}
)

In [63]:
m2_5311_5310.sample(3)

Unnamed: 0,5311_5310_overlap,organization_name
848,Both 5311 and 5310,Sunline Transit Agency
1927,5310 only,Victor Valley Community Services Council
1033,5311 only,City of Ridgecrest


In [64]:
m2_5311_5310["5311_5310_overlap"].value_counts()

5310 only             88
5311 only             65
Both 5311 and 5310    24
Name: 5311_5310_overlap, dtype: int64

In [65]:
# Save 5310-5311 applicants
m2_5311_5310.to_parquet(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_applicants.parquet"
)

In [66]:
m3_all = pd.merge(
    m2_5311_5310,
    df_5339,
    how="outer",
    on=["organization_name"],
    indicator=True,
)

In [67]:
m4 = m3_all.drop_duplicates(subset=["organization_name"])

In [68]:
len(m4)

177

In [69]:
m4.sort_values(by=["organization_name"]).head()

Unnamed: 0,5311_5310_overlap,organization_name,funding_program,_merge
178,5310 only,ARC Imperial Valley,,left_only
217,5310 only,Alegria Community Living,,left_only
86,5311 only,Alpine County Community Development,,left_only
87,Both 5311 and 5310,Amador Transit,5339 (State),both
177,5310 only,"Angel View, Inc.",,left_only


In [70]:
m4["_merge"].value_counts()

left_only     152
both           25
right_only      0
Name: _merge, dtype: int64