## How much funding did each agency receive, by fiscal year and grant, over the past five years?
* For Hubspot/CRM
* [Research Request](https://github.com/cal-itp/data-analyses/issues/333)

In [None]:
import calitp.magics
import pandas as pd
from calitp import *
from calitp.tables import tbl
from siuba import *

# Notebook display settings
# Show up to 100 columns when a DataFrame is displayed
pd.options.display.max_columns = 100

# This will allow you to print all the rows in your data
pd.set_option("display.max_rows", None)

# This will prevent column contents from being truncated
pd.set_option("display.max_colwidth", None)

In [None]:
# Read the "Grant Projects" sheet of the grants workbook from GCS, then pass
# the frame through the to_snakecase helper (presumably snake-cases the
# column names; the later cells rely on names such as "funding_program").
df = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx",
    sheet_name="Grant Projects",
).pipe(to_snakecase)

In [None]:
def organization_cleaning(df, column_wanted: str):
    """Return a copy of ``df`` with the names in ``column_wanted`` simplified.

    Each string value is reduced to the text before its first comma, has
    every "/" removed, is cut at the first "(", and is then stripped of
    leading and trailing whitespace. Missing values stay missing.

    Args:
        df: DataFrame holding the column to clean. It is left unmodified.
        column_wanted: Name of the string column to clean.

    Returns:
        A new DataFrame in which ``column_wanted`` holds the cleaned values;
        all other columns are carried over unchanged.
    """
    cleaned = (
        df[column_wanted]
        .str.split(",")
        .str[0]
        # Literal removal; once slashes are gone there is nothing left to
        # split on "/", so no further "/" handling is needed.
        .str.replace("/", "", regex=False)
        .str.split("(")
        .str[0]
        # Strip last: cutting at "," or "(" tends to leave a trailing space
        # ("Agency (Transit)" -> "Agency "), which breaks exact-name joins.
        .str.strip()
    )
    # assign() builds a new frame, so a filtered slice passed in by the
    # caller is never written to in place.
    return df.assign(**{column_wanted: cleaned})

### Filter out what I want 

In [None]:
# Grants wanted: the 5310 and 5311 program labels, spelled the way they
# appear in the "funding_program" column that is filtered on below
grants_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [None]:
# Keep only the rows whose funding program is one of the grants wanted
df2 = df.loc[df["funding_program"].isin(grants_wanted)]

In [None]:
# Replace the different variations of 5311/5310 with broader names.
# assign() returns a new frame, so the new column is not written into a
# filtered slice of df (which is what triggers SettingWithCopyWarning).
df2 = df2.assign(
    funding_program_broad=df2["funding_program"].replace(
        {
            "5310 Exp": "5310",
            "5310 Trad": "5310",
            "Section 5311": "5311",
            "5311(f) Cont": "5311",
            "CMAQ (FTA 5311)": "5311",
            "Section 5311(f)": "5311",
            "5311(f) Round 2": "5311",
        }
    )
)

In [None]:
# Columns to carry forward: fiscal year, broad grant name, grant/project
# identifiers, the organization, and the allocated amount
wanted_columns = [
    "grant_fiscal_year",
    "funding_program_broad",
    "grant_number",
    "project_year",
    "organization_name",
    "allocationamount",
]

In [None]:
# Subset dataframe holding every row but only the wanted columns
df3 = df2.loc[:, wanted_columns]

In [None]:
# Only want data for the last five years (grant fiscal year 2018 onward).
# .copy() makes df4 an independent frame rather than a slice of df3, so
# later column writes on it are unambiguous and raise no
# SettingWithCopyWarning.
df4 = df3[df3["grant_fiscal_year"] > 2017].copy()

In [None]:
# Collect the distinct organization names (before cleaning) as a set
original_orgs = set(df4["organization_name"].unique())

In [None]:
len(original_orgs)

In [None]:
df4.shape

In [None]:
df4 = organization_cleaning(df4, "organization_name")

In [None]:
# Total allocation for every organization / broad grant / fiscal year
# combination, returned with the grouping keys as ordinary columns.
agg1 = df4.groupby(
    ["organization_name", "funding_program_broad", "grant_fiscal_year"],
    as_index=False,
)["allocationamount"].sum()

In [None]:
agg1.shape

In [None]:
agg1["organization_name"].nunique()

### Add the URL of each agency's website - Airtable

In [None]:
%%sql -o airtable_grants
-- Organization ITP IDs, names and websites from dim_organizations.
-- AND binds tighter than OR, so a row is kept when it has a positive
-- itp_id or when it has a website. The parentheses only make that
-- existing grouping explicit.
SELECT
  CAST(itp_id AS INT) AS itp_id,
  name,
  website
FROM cal-itp-data-infra.mart_transit_database.dim_organizations
WHERE (itp_id IS NOT NULL AND itp_id > 0)
   OR website IS NOT NULL
ORDER BY itp_id ASC

In [None]:
len(airtable_grants)

In [None]:
# Some different orgs share the same Cal-ITP ID
# EX: Riverside University Health System and Redwood Coast Seniors
airtable_grants["itp_id"].nunique()

In [None]:
# Keep one row (the first encountered) for each itp_id.
# NOTE(review): the merge further down joins agg1 to airtable_grants, not
# to this de-duplicated frame - confirm which of the two is intended.
airtable_grants2 = airtable_grants.drop_duplicates(subset=["itp_id"])

In [None]:
len(airtable_grants2)

In [None]:
airtable_grants.info()

In [None]:
# Outer-join the grant totals to the Airtable organizations on the
# organization name. validate="m:1" makes pandas raise if a name occurs
# more than once in airtable_grants, and indicator=True adds a "_merge"
# column recording which side each row came from.
m1 = agg1.merge(
    airtable_grants,
    left_on="organization_name",
    right_on="name",
    how="outer",
    validate="m:1",
    indicator=True,
)

In [None]:
m1["_merge"].value_counts()

In [None]:
m1 = m1.drop(columns=["_merge"])

In [None]:
# Fill missing Cal-ITP IDs with 0 and cast the column to int64 before it is
# joined to the GTFS schedule table's calitp_itp_id further down
# (presumably so the two key columns share a dtype - the dtypes are
# compared in a later cell).
m1["itp_id"] = m1["itp_id"].fillna(0).astype("int64")

In [None]:
len(m1)

In [None]:
m1["website"].nunique()

### Add URL with GTFS Schedule

In [None]:
gtfs_schedule = tbl.gtfs_schedule.agency() >> collect() >> distinct()

In [None]:
# Subset 
gtfs_schedule2 = gtfs_schedule[
    ["calitp_itp_id", "agency_id", "agency_name", "agency_url"]
]

In [None]:
# There are duplicated Cal ITP IDs, delete duplicates
gtfs_schedule3 = gtfs_schedule2.drop_duplicates(subset=["calitp_itp_id"], keep="first")

In [None]:
# Check data types
gtfs_schedule3.dtypes, m1.dtypes

In [None]:
# Left-join the GTFS schedule agencies onto m1 by Cal-ITP ID, keeping every
# m1 row. validate="m:1" makes pandas raise if gtfs_schedule3 repeats a
# calitp_itp_id, and indicator=True adds a "_merge" column.
m2 = m1.merge(
    gtfs_schedule3,
    left_on="itp_id",
    right_on="calitp_itp_id",
    how="left",
    validate="m:1",
    indicator=True,
)

In [None]:
len(m2)

### Clean up 

In [None]:
# Create a new col to populate websites from the two data sets
m2["website_use"] = m2["website"]

In [None]:
m2["website_use"] = m2["website_use"].fillna(m2["agency_url"])

In [None]:
m2_cols_to_keep = [
    "organization_name",
    "funding_program_broad",
    "grant_fiscal_year",
    "allocationamount",
    "website_use",
]

In [None]:
m3 = m2[m2_cols_to_keep]

In [None]:
# Did an outer join for airtable + black cat. 
# Delete all the records that are only found in airtable
m4 = m3.dropna(subset=["organization_name", "allocationamount"])

In [None]:
# Label organizations that have no website in either source as "N/A".
# assign() returns a new frame instead of writing a column onto a frame
# derived from a column slice, which avoids SettingWithCopyWarning.
m4 = m4.assign(website_use=m4["website_use"].fillna("N/A"))

### Aggregate Again

In [None]:
m4 = m4.rename(
    columns={
        "organization_name": "Organization",
        "website_use": "Website",
        "funding_program_broad": "Grant",
        "allocationamount": "Allocation",
        "grant_fiscal_year": "Grant Fiscal Year",
    }
)

In [None]:
# Sum the allocations per organization, website, grant and fiscal year.
# The grouping keys stay in the index of the result.
agg2 = m4.groupby(["Organization", "Website", "Grant", "Grant Fiscal Year"])[
    ["Allocation"]
].sum()

In [None]:
agg2.sample(4)

In [None]:
'''
with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/grants/5311_5310_hubspot.xlsx"
) as writer:
    agg2.to_excel(writer, sheet_name="5311_5310_Applicants", index=True)
''' 