# 5311 and 5310 Applicants
* [Research Request](https://github.com/cal-itp/data-analyses/issues/333)

In [None]:
# Packages to import
# Pandas is the full name of the package but call it pd for short.
import calitp.magics
import pandas as pd
from calitp import *
from calitp import query_sql

# You only need to import these if you want to use something from the warehouse
from calitp.tables import tbl
from siuba import *

# Notebook display settings
# Show at most 100 columns when a dataframe is displayed.
pd.set_option("display.max_columns", 100)

# A max_rows of None lets every row of your data be printed.
pd.options.display.max_rows = None

# A max_colwidth of None keeps long cell values from being truncated.
pd.options.display.max_colwidth = None

## Load the Excel Sheet
* Can read the original Excel workbook by the specific sheet you want. 
* Save your sheet as a Pandas dataframe - it can be called anything, but usually it's <i>something_df</i>. 
    * Dataframe = basically just a table of data.
    * If you want to open multiple sheets, you'd assign them to different objects and different names. 
* "to_snakecase" changes the column names to all lowercase and replaces any spaces with underscores (_).

In [None]:
# Read the "Grant Projects" sheet of the workbook stored in GCS, then hand
# the resulting dataframe to to_snakecase to tidy up the column names.
df = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx",
    sheet_name="Grant Projects",
).pipe(to_snakecase)

In [None]:
"""
If there are columns that SHOULD be an integer but isn't: input them into the list
after this for loop. This strips empty $ and commas from columns, 
then changes them to the data type of int.
"""
for c in ["column_one", "column_two", "column_three"]:
    df[c] = df[c].str.replace("$", "").str.replace(",", "").astype(int)

### Beware of duplicate values
* Grants data might be manually entered by multiple people. As such, values can be inconsistent. 
* BART, Bay Area Rapid Transit, and Bay Area Rapid Transit (BART) are all the same agency. 
* However, if you are counting the number of unique agencies, these would be counted as 3 different agencies, which is inaccurate.


In [None]:
# Check out your agencies and see if there are any duplicates by
# sorting your column of agencies from A-Z and seeing only unique ones.
# NOTE(review): "column" looks like a placeholder name - swap in your own column.
df["column"].sort_values().unique()

In [None]:
# Check out the total number of unique values in the column
df["column"].nunique()

In [None]:
"""
If there are duplicate values, you can replace them with an existing one with a dictionary
If this cell is irrelevant,  go up to the top where it says "code" and change it to "markdown". 
You can also move the three quotation marks at the bottom of this cell to comment out the code.
If all the agencies are only listed once.
"""
df["column"] = df["column"].replace(
    {"old value 1": "correct value 1", "old value 2": "correct value 2"}
)

## Filter what you want
* You don't necessarily want all the years, all the programs, etc. 
* Filter out what you are interested in.

### Grants you want

In [None]:
# Paste whatever values you want between the brackets.
# The values need to be in quotes and separated by commas.
grants_wanted = [
    "Section 5311",
    "5310 Exp",
    "5310 Trad",
    "5311(f) Cont",
    "CMAQ (FTA 5311)",
    "Section 5311(f)",
    "5311(f) Round 2",
]

In [None]:
"""
Keep only the grants in my list and create a NEW variable.
It's best to create new variables when you make changes, so you can always reference
the original variable. 
"""
df2 = df[df["funding_program"].isin(grants_wanted)]

In [None]:
# Number of rows for each funding program that was kept
df2["funding_program"].value_counts()

In [None]:
# Collapse the detailed program names into two broad groups, "5310" and "5311".
# df2 is a filtered selection of df, so setting a column on it in place
# triggers pandas' SettingWithCopyWarning. assign() returns a new dataframe
# with the extra column instead.
df2 = df2.assign(
    funding_program_broad=df2["funding_program"].replace(
        {
            "5310 Exp": "5310",
            "5310 Trad": "5310",
            "Section 5311": "5311",
            "5311(f) Cont": "5311",
            "CMAQ (FTA 5311)": "5311",
            "Section 5311(f)": "5311",
            "5311(f) Round 2": "5311",
        }
    )
)

### Columns you want
* Drop irrelevant columns

In [None]:
# Copy and paste the columns you want to KEEP into this list below
wanted_columns = [
    "grant_fiscal_year",
    "funding_program_broad",
    "grant_number",
    "project_year",
    "organization_name",
    "allocationamount",
]

In [None]:
# Keep only the wanted columns (everything else is left out) and store the
# result in a new dataframe.
df3 = df2.loc[:, wanted_columns]

In [None]:
# (rows, columns) after the column selection
df3.shape

## How much funding did each agency receive by each fiscal year and grant, in the past five years? 
* For Hubspot/CRM

In [None]:
"""
Filter out years again with the original dataframe, df2
Because this  the request is for a different time frame
"""
df4 = df3[df3["grant_fiscal_year"] > 2017]

In [None]:
# Number of distinct organizations left after the fiscal-year filter
df4["organization_name"].nunique()

In [None]:
# (rows, columns) after the fiscal-year filter
df4.shape

#### Add the URL of each agency's website - GTFS agency info

In [None]:
"""
Query agency info from our warehouse
"""
agency_info = tbl.gtfs_schedule.agency() >> collect() >> distinct()

In [None]:
# keep only the columns I want
agency_info = agency_info[["calitp_itp_id", "agency_id", "agency_name", "agency_url"]]

In [None]:
# Keep one row per agency_name (the first one found). agency_name is the key
# used when this table is merged onto the grants, so it should not repeat.
agency_info2 = agency_info.drop_duplicates(
    subset=["agency_name"], keep = 'first'
)

In [None]:
# Preview the first three rows
agency_info2.head(3)

In [None]:
# Data type of each column
agency_info2.dtypes

In [None]:
# Number of rows left after removing repeated agency names
len(agency_info2)

In [None]:
# Left-join the GTFS agency info onto the grants by organization name, so
# every grant row is kept whether or not a matching agency is found.
# indicator=True adds a "_merge" column recording where each row matched.
m1 = df4.merge(
    agency_info2,
    how="left",
    left_on=["organization_name"],
    right_on=["agency_name"],
    indicator=True,
)

In [None]:
# How many grant rows found a matching agency ("both") and how many did not ("left_only")
m1["_merge"].value_counts()

In [None]:
# Remove the indicator column; a later merge on m1 uses indicator=True and
# creates its own "_merge" column.
m1 = m1.drop(columns = '_merge')

In [None]:
# Number of distinct agency URLs brought in by the merge
m1['agency_url'].nunique()

In [None]:
# (rows, columns) after the merge - compare with df4.shape to confirm no rows were added
m1.shape

#### Airtable Organization Data

In [None]:
# Load the organizations CSV export from GCS and tidy up its column names.
ntd_info = pd.read_csv(
    "gs://calitp-analytics-data/data-analyses/grants/organizations-All Organizations (1).csv"
).pipe(to_snakecase)

In [None]:
# Keep only the organization name, website and ITP id columns
ntd_info2 = ntd_info[['name','website', "itp_id"]]

In [None]:
# Preview the first two rows
ntd_info2.head(2)

In [None]:
# Number of distinct organization names; compare with the row count to spot repeated names
ntd_info2['name'].nunique()

In [None]:
# Missing ITP ids become 0 so the column can be stored as int64.
# ntd_info2 is a column selection of ntd_info, so setting a column on it in
# place triggers pandas' SettingWithCopyWarning. assign() returns a new
# dataframe with the converted column instead.
ntd_info2 = ntd_info2.assign(itp_id=ntd_info2["itp_id"].fillna(0).astype("int64"))

In [None]:
# Confirm itp_id is now int64
ntd_info2.dtypes

In [None]:
# Grant rows without a matching agency have no ITP id after the left merge;
# fill those with 0 so the column can be stored as int64.
m1['calitp_itp_id'] = m1['calitp_itp_id'].fillna(0).astype('int64')

In [None]:
# Confirm calitp_itp_id is now int64
m1.dtypes

In [None]:
# Disabled alternative, wrapped in quotation marks so it does not run:
# merge on the ITP id columns instead of on the organization name.
'''
m3 = pd.merge(
    m1,
    ntd_info2,
    how="left",
    left_on=["calitp_itp_id"],
    right_on=["itp_id"],
    indicator=True,
)
'''

In [None]:
# Left-join the organization info (name, website, ITP id) onto the grants by
# organization name; indicator=True adds a "_merge" column for checking matches.
# A name that appears more than once on the right side would duplicate grant
# rows and inflate the allocation totals summed later, so keep one row per
# name - the same approach used for agency_name before the first merge.
m2 = pd.merge(
    m1,
    ntd_info2.drop_duplicates(subset=["name"], keep="first"),
    how="left",
    left_on=["organization_name"],
    right_on=["name"],
    indicator=True,
)

In [None]:
# How many grant rows found a matching organization ("both") and how many did not ("left_only")
m2['_merge'].value_counts()

In [None]:
# Start the website_use column as a copy of the website column
m2['website_use'] = m2['website']

In [None]:
# Where the website is missing, fall back to the agency_url from the GTFS agency info
m2['website_use'] = m2['website_use'].fillna(m2['agency_url'])

In [None]:
# Number of distinct websites after combining both sources
m2['website_use'].nunique()

In [None]:
# Columns whose missing values will be filled in
m2_na_cols = ["calitp_itp_id", "agency_id", "agency_name", "website_use"]

In [None]:
# Replace any remaining missing values in those columns with the text "N/A"
m2[m2_na_cols] = m2[m2_na_cols].fillna("N/A")

In [None]:
# (rows, columns) after the second merge - compare with m1.shape to confirm no rows were added
m2.shape

In [None]:
# Reshape to long format: the id columns are repeated on every row, the
# allocationamount figures go into a "value" column, and a "variable" column
# names the column each figure came from.
m3 = m2.melt(
    id_vars=[
        "grant_fiscal_year",
        "organization_name",
        "funding_program_broad",
        "website_use",
        "calitp_itp_id",
    ],
    value_vars=["allocationamount"],
)

In [None]:
# Only one column was melted, so "variable" holds the same name on every row; remove it
m3 = m3.drop(columns = 'variable')

In [None]:
# Preview the first three rows of the reshaped data
m3.head(3)

### Summarize  dataframe
* This is exactly the same as creating a pivot table in Excel. 
* There are certain columns you want to group by (so each combination appears only once) and certain columns you want to get the sum/median/mean/etc. of.
* [Tutorial](https://docs.calitp.org/data-infra/analytics_new_analysts/data-analysis-intro.html#aggregating)

In [None]:
"""
Once you are happy with your analysis, assign it to a 
new variable such as agg1 = df3.groupby().
When you don't assign something to a variable, the results
aren't saved.
"""
agg1 = m2.groupby(
    [
        "organization_name",
        "website_use",
        "funding_program_broad",
        "grant_fiscal_year",
    ]
).agg({"allocationamount": "sum"})

In [None]:
# Preview the summary; the grouping columns form the index
agg1.head()

In [None]:
# Turn the grouping columns from the index back into ordinary columns
agg2 = agg1.reset_index()

In [None]:
# Column names after resetting the index
agg2.columns

In [None]:
# Preview the flattened summary
agg2.head()

## Save your work
* You can save all your hard work into a single Excel workbook to our [Google Cloud Storage](https://console.cloud.google.com/storage/browser/calitp-analytics-data/data-analyses/grants;tab=objects?project=cal-itp-data-infra&prefix=&forceOnObjectsSortingFiltering=false).

# This will be saved to our GCS bucket.
# The grouped summary agg1 is written to one sheet; index=True keeps its
# grouping columns (the index) in the output.
# NOTE(review): agg2, the flattened copy of agg1, is not the one written here - confirm that is intended.
with pd.ExcelWriter(
    "gs://calitp-analytics-data/data-analyses/grants/5311_5310_hubspot.xlsx"
) as writer:
    agg1.to_excel(writer, sheet_name="5311_5310_Applicants", index= True)