## How much funding did each agency receive by each fiscal year and grant, in the past five years? 
* For Hubspot/CRM
* [Research Request](https://github.com/cal-itp/data-analyses/issues/333)

In [None]:
import calitp.magics
import pandas as pd
import utils 
from calitp import *
from calitp.tables import tbl
from siuba import *

# Notebook display settings: allow up to 100 columns, and lift the limits on
# the number of rows shown and on the width of each cell's text.
pd.set_option("display.max_columns", 100)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Read the "Grant Projects" sheet of the grants workbook from cloud storage and
# pass it through to_snakecase (presumably normalises the column names; the
# later cells index snake_case columns such as "funding_program").
df = to_snakecase(
    pd.read_excel(
        io="gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx",
        sheet_name="Grant Projects",
    )
)

### Filter for the grants, columns, and years of interest

In [None]:
# Funding programs to keep: the 5311 and 5310 variants, spelled exactly as
# they are matched against the funding_program column in the next cell.
grants_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [None]:
# Keep only the rows whose funding program is one of the wanted grants.
df2 = df.loc[df["funding_program"].isin(grants_wanted)]

In [None]:
# Collapse the different variations of 5311/5310 into two broad program names.
# df2 is a filtered slice of df, so the new column is built with assign()
# (which returns a new frame) rather than set in place; setting a column on a
# slice triggers pandas' SettingWithCopyWarning.
df2 = df2.assign(
    funding_program_broad=df2["funding_program"].replace(
        {
            "5310 Exp": "5310",
            "5310 Trad": "5310",
            "Section 5311": "5311",
            "5311(f) Cont": "5311",
            "CMAQ (FTA 5311)": "5311",
            "Section 5311(f)": "5311",
            "5311(f) Round 2": "5311",
        }
    )
)

In [None]:
# Columns to carry forward: fiscal year, broad grant name, grant number,
# project year, organization, and the allocation amount that gets summed later.
wanted_columns = [
    "grant_fiscal_year",
    "funding_program_broad",
    "grant_number",
    "project_year",
    "organization_name",
    "allocationamount",
]

In [None]:
# Narrow the filtered grants down to the wanted columns only.
df3 = df2.loc[:, wanted_columns]

In [None]:
# Last five years only: grant fiscal years strictly after 2017.
df4 = df3.loc[df3["grant_fiscal_year"].gt(2017)]

In [None]:
# Distinct organization names as they appear before any cleaning.
original_orgs = set(df4["organization_name"].unique())

In [None]:
# Number of distinct organization names before cleaning.
len(original_orgs)

In [None]:
# (rows, columns) of the filtered grants before organization names are cleaned.
df4.shape

In [None]:
# Run the project helper over the organization_name column.
# NOTE(review): presumably this standardizes organization names so they can be
# matched to other tables -- confirm against utils.organization_cleaning.
df4 = utils.organization_cleaning(df4, "organization_name")

In [None]:
# Total allocation per organization, broad grant, and grant fiscal year.
# as_index=False keeps the grouping keys as ordinary columns.
agg1 = df4.groupby(
    ["organization_name", "funding_program_broad", "grant_fiscal_year"],
    as_index=False,
).agg({"allocationamount": "sum"})

In [None]:
# (rows, columns) of the per-organization / grant / year summary.
agg1.shape

In [None]:
# Number of distinct organization names after cleaning.
agg1["organization_name"].nunique()

### Add the URL of each agency's website - Airtable

In [None]:
%%sql -o airtable_grants
-- AND binds tighter than OR: keep rows with a positive itp_id, plus any row that has a website.
SELECT
  CAST(itp_id AS INT) AS itp_id,
  name,
  website
FROM cal-itp-data-infra.mart_transit_database.dim_organizations
WHERE (itp_id IS NOT NULL AND itp_id > 0)
  OR website IS NOT NULL
ORDER BY itp_id ASC

In [None]:
# Number of organization rows returned by the query.
len(airtable_grants)

In [None]:
# Some distinct organizations share the same Cal-ITP ID,
# e.g. Riverside University Health System and Redwood Coast Seniors,
# so the number of unique IDs can be lower than the number of rows.
airtable_grants["itp_id"].nunique()

In [None]:
# Keep the first row for each itp_id.
# NOTE(review): the merge with agg1 further down joins airtable_grants, not
# airtable_grants2, so in the cells visible here this frame only feeds the
# length check that follows -- confirm that is intended.
airtable_grants2 = airtable_grants.drop_duplicates(subset=["itp_id"])

In [None]:
# Row count after dropping duplicate itp_ids.
len(airtable_grants2)

In [None]:
# Column dtypes and non-null counts of the organizations table.
airtable_grants.info()

In [None]:
# Outer-join the grant totals to the organizations table on the organization
# name. validate="m:1" raises if a name appears more than once on the right,
# and indicator=True records which side each row came from in "_merge".
m1 = agg1.merge(
    airtable_grants,
    left_on="organization_name",
    right_on="name",
    how="outer",
    validate="m:1",
    indicator=True,
)

In [None]:
# How many rows matched on both sides versus came from only one table.
m1["_merge"].value_counts()

In [None]:
# The indicator column has been inspected; remove it so the next merge can add its own.
m1 = m1.drop(columns="_merge")

In [None]:
# Cast the Cal-ITP ID to int64 (rows without an ID get 0) so it can serve as
# the key for the merge with the GTFS schedule table's calitp_itp_id.
m1["itp_id"] = m1["itp_id"].fillna(0).astype("int64")

In [None]:
# Row count after the outer join.
len(m1)

In [None]:
# Number of distinct websites picked up from the organizations table.
m1["website"].nunique()

### Add agency URLs from GTFS Schedule

In [None]:
# Pull the GTFS schedule agency table through the siuba pipeline
# (collect() presumably materializes it as a dataframe) and keep distinct rows.
gtfs_schedule = tbl.gtfs_schedule.agency() >> collect() >> distinct()

In [None]:
# Keep only the ID, name, and URL columns of the agency table.
gtfs_schedule2 = gtfs_schedule.loc[
    :, ["calitp_itp_id", "agency_id", "agency_name", "agency_url"]
]

In [None]:
# Cal-ITP IDs repeat in the agency table; keep the first row seen for each ID
# (the drop_duplicates default) so the table can sit on the "1" side of a merge.
gtfs_schedule3 = gtfs_schedule2.drop_duplicates(subset="calitp_itp_id")

In [None]:
# Check data types of both frames; the merge keys (calitp_itp_id and itp_id)
# need compatible dtypes.
gtfs_schedule3.dtypes, m1.dtypes

In [None]:
# Left-join the GTFS agency details onto the grants by Cal-ITP ID.
# validate="m:1" raises if an ID is duplicated on the GTFS side.
m2 = m1.merge(
    gtfs_schedule3,
    left_on="itp_id",
    right_on="calitp_itp_id",
    how="left",
    validate="m:1",
    indicator=True,
)

In [None]:
# Row count after the left join; with an m:1 join it should equal len(m1).
len(m2)

### Clean up 

In [None]:
# Create a new column that will hold the website chosen from the two data
# sets, starting from the organizations-table website.
m2["website_use"] = m2["website"]

In [None]:
# Where no website is present yet, fall back to the GTFS agency_url.
m2["website_use"] = m2["website_use"].fillna(m2["agency_url"])

In [None]:
# Columns for the final output: organization, grant, fiscal year, amount,
# and the combined website column.
m2_cols_to_keep = [
    "organization_name",
    "funding_program_broad",
    "grant_fiscal_year",
    "allocationamount",
    "website_use",
]

In [None]:
# Reduce the merged frame to the output columns.
m3 = m2.loc[:, m2_cols_to_keep]

In [None]:
# The airtable + Black Cat merge was an outer join, so rows found only in
# airtable have no organization_name / allocationamount. Drop those rows.
m4 = m3.dropna(subset=["organization_name", "allocationamount"])

In [None]:
# Label organizations with no website from either source as "N/A".
# m4 is derived from a column slice of m2, so the column is rebuilt with
# assign() (returns a new frame) instead of being set in place, which can
# trigger pandas' SettingWithCopyWarning.
m4 = m4.assign(website_use=m4["website_use"].fillna("N/A"))

### Aggregate Again

In [None]:
# Give the output columns presentation-friendly names.
m4 = m4.rename(
    columns={
        "organization_name": "Organization",
        "funding_program_broad": "Grant",
        "grant_fiscal_year": "Grant Fiscal Year",
        "allocationamount": "Allocation",
        "website_use": "Website",
    }
)

In [None]:
# Sum allocations for each organization / website / grant / fiscal year.
# The grouping keys stay in the index, giving a nested layout for display.
agg2 = m4.groupby(["Organization", "Website", "Grant", "Grant Fiscal Year"])[
    ["Allocation"]
].sum()

In [None]:
# Spot-check four random rows of the final summary.
agg2.sample(4)

In [None]:
"""
with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/5311-5310/5311_5310_hubspot.xlsx"
) as writer:
    agg2.to_excel(writer, sheet_name="5311_5310_Applicants", index=True)
"""

## Airtable TEST

In [None]:
# Funding programs wanted for this section: the same 5311/5310 variants as
# before, plus the two 5339 programs.
airtable_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "5339 (National)",
    "5339 (State)",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [None]:
# Keep only the rows whose funding program is in the wanted list.
airtable = df.loc[df["funding_program"].isin(airtable_wanted)]

In [None]:
# Check that every wanted grant is present, and how many rows each has.
airtable["funding_program"].value_counts()

In [None]:
# Keep projects with a project year later than 2018.
airtable = airtable.loc[airtable["project_year"].gt(2018)]

In [None]:
# Subset to the two columns needed: we only care whether an organization
# appeared under a grant at any point after 2018, so the year and other
# details are not required.
airtable = airtable[["funding_program", "organization_name"]]

In [None]:
# Spot-check three random rows.
airtable.sample(3)

In [None]:
# One frame per grant family, starting with every program whose name contains "5311".
df_5311 = airtable[airtable["funding_program"].str.contains("5311", case=False)]

In [None]:
# Programs whose name contains "5310".
df_5310 = airtable[airtable["funding_program"].str.contains("5310", case=False)]

In [None]:
# Programs whose name contains "5339".
df_5339 = airtable[airtable["funding_program"].str.contains("5339", case=False)]

In [None]:
# For each grant family, print the row count per program "flavor" and the total row count.
for grant_df in (df_5311, df_5310, df_5339):
    print(grant_df["funding_program"].value_counts())
    print(len(grant_df))

In [None]:
# First merge: which organizations received 5311 funds, 5310 funds, or both.
# Join on organization_name only. The funding_program labels of the two frames
# never coincide (one side holds 5311 variants, the other 5310 variants), so
# keeping that column in the join key would make a "both" match impossible.
# Each side is reduced to one row per organization first so the join is 1:1,
# and the suffixes keep each side's program label identifiable.
m_5311_5310 = pd.merge(
    df_5311.drop_duplicates(subset=["organization_name"]),
    df_5310.drop_duplicates(subset=["organization_name"]),
    how="outer",
    on=["organization_name"],
    suffixes=("_5311", "_5310"),
    indicator=True,
    validate="1:1",
)

In [None]:
# Check how many rows are left-only (5311), right-only (5310), or in both.
m_5311_5310["_merge"].value_counts()

In [None]:
# Spot-check ten random rows of the merge.
m_5311_5310.sample(10)

In [None]:
# Compare the row count before and after dropping duplicate organization
# names (a preview only; nothing is assigned here).
len(m_5311_5310), len(m_5311_5310.drop_duplicates(subset=["organization_name"]))

In [None]:
# Keep one row per organization, since the same agencies appear multiple times
# across the years. `subset` restricts the duplicate check to the named
# column(s); without it, drop_duplicates compares every column of the frame.
m2_5311_5310 = m_5311_5310.drop_duplicates(subset="organization_name")

In [None]:
# Give the merge indicator a clearer name.
m2_5311_5310 = m2_5311_5310.rename(columns={"_merge": "5311_5310_overlap"})

In [None]:
# Replace left_only / right_only / both with clearer labels.
# The merge indicator is a categorical column, so its categories are renamed
# directly; Series.replace on categorical data is deprecated in recent pandas
# in favour of cat.rename_categories. Labels missing from the column are ignored.
m2_5311_5310["5311_5310_overlap"] = m2_5311_5310[
    "5311_5310_overlap"
].cat.rename_categories(
    {"left_only": "5311 only", "right_only": "5310 only", "both": "Both 5311 and 5310"}
)

In [None]:
# Spot-check three random rows after relabelling.
m2_5311_5310.sample(3)

In [None]:
# Bring the 5339 recipients into the combined 5311/5310 frame, matching on
# organization name; "_merge" records which side each organization came from.
m3_all = m2_5311_5310.merge(
    df_5339,
    on="organization_name",
    how="outer",
    indicator=True,
)

In [None]:
# Once more, keep a single row per organization.
m4 = m3_all.drop_duplicates(subset="organization_name")

In [None]:
# Count organizations by which side of the 5339 merge they came from.
m4["_merge"].value_counts()

In [None]:
# Look at the organizations ordered by merge status (the _merge column).
m4.sort_values('_merge')

In [None]:
# Use a function to replace left_only and both 
# https://github.com/cal-itp/data-analyses/blob/main/grant_misc/A2_dla.ipynb
# df is the argument of the function
def recategorize(df):
    """Return a combined-source label for one row of a merged dataframe.

    Meant to be used as ``DataFrame.apply(recategorize, axis=1)``; despite its
    name, ``df`` is a single row (anything indexable by column name).

    Args:
        df: Row exposing ``'_merge'`` and, whenever ``'_merge'`` is
            ``'right_only'`` or ``'both'``, ``'BC_TIRCP_merge'``.

    Returns:
        One of 'BlackCat Only', 'TIRCP_Only', 'TIRCP and BlackCat',
        'DLA Only', 'TIRCP and DLA', 'BlackCat and DLA', or the catch-all
        "TIRCP, BlackCat and DLA" for every other combination.

    Raises:
        KeyError: If ``'_merge'`` is missing, or ``'BC_TIRCP_merge'`` is
            missing on a ``'right_only'`` / ``'both'`` row.
    """
    # The TIRCP-only label is accepted with an underscore or a space: the two
    # branches used different spellings, so whichever spelling the data carried
    # fell through to the catch-all label in one of them.
    tircp_only_labels = ('TIRCP_Only', 'TIRCP Only')
    merge_status = df['_merge']

    if merge_status == 'left_only':
        return 'DLA Only'

    if merge_status == 'right_only':
        source = df['BC_TIRCP_merge']
        if source == 'Black Cat Only':
            return 'BlackCat Only'
        if source in tircp_only_labels:
            return 'TIRCP_Only'
        if source == 'Both in TIRCP and BlackCat':
            return 'TIRCP and BlackCat'
    elif merge_status == 'both':
        source = df['BC_TIRCP_merge']
        if source in tircp_only_labels:
            return 'TIRCP and DLA'
        if source == 'Black Cat Only':
            return 'BlackCat and DLA'

    # Everything else, including unrecognised labels, keeps the original default.
    return "TIRCP, BlackCat and DLA"
    

In [None]:
# Apply a function along an axis of the DataFrame.
# Axis = 1 means the function receives each row of the df.
# Axis = 0 means the function receives each column of the df.
# NOTE(review): recategorize reads a 'BC_TIRCP_merge' column and returns
# DLA/TIRCP/BlackCat labels, but no cell visible here adds that column to m4 --
# adapt the function to the 5311/5310/5339 columns before running this.
m4['_merge'] = m4.apply(recategorize, axis = 1)

In [None]:
# Drop any columns you don't want.
# The list starts empty (a no-op) so the cell is valid Python and runs;
# add the names of the unwanted columns to it.
m4 = m4.drop(columns=[])