# 5311 and 5310 Applicants
* [Research Request](https://github.com/cal-itp/data-analyses/issues/333)

In [1]:
# Packages to import
# Pandas is the full name of the package but call it pd for short.
import pandas as pd
from calitp import *

# Notebook display settings
# Show up to 100 columns when a dataframe is displayed
pd.set_option("display.max_columns", 100)

# Show every row of a dataframe instead of a shortened preview
pd.set_option("display.max_rows", None)

# Show the full text of each cell instead of cutting long values off
pd.set_option("display.max_colwidth", None)

## Load the Excel Sheet
* Can read the original Excel workbook by the specific sheet you want. 
* Save your sheet as a Pandas dataframe - it can be called anything, but usually it's <i>something_df</i>. 
    * Dataframe = basically just a table of data. 
    * If you want to open multiple sheets, you'd assign them to different objects and different names. 
* "to_snakecase" changes the column names to all lowercase and replaces any spaces with underscores.

In [2]:
# Read the "Grant Projects" sheet of the workbook stored in GCS, then hand the
# resulting dataframe to to_snakecase.
df = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/grants/Grant+Projects_7_30_2022.xlsx",
    sheet_name="Grant Projects",
).pipe(to_snakecase)

# Alternative: read a copy of the workbook saved in this folder
# df = pd.read_excel("./Grant+Projects_7_30_2022.xlsx")

In [3]:
# Save your dataframe to the folder you are in
# df.to_excel("./Grant+Projects_7_30_2022.xlsx", index=False)

## Explore the data 
* Let's check out our data by answering questions such as
    * How many columns and rows does it have? 
    * How many missing values are there? 
    * What are the mean/median? 
* Any time you want to do something to your data, chain the function after the object.
    * In Excel, you'd do SUM(column you want)
    * In Pandas, you'd do df['column you want'].sum()
* [Resource](https://pandas.pydata.org/docs/user_guide/basics.html)    

In [4]:
# Check out the first five rows
# Any line with a pound symbol in front is a comment and won't be run as code
df.head()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
0,2011,Section 5311,CA-18-X047 | 0012000083,2016,City of Chowchilla,BCG0000228,Operating Assistance,300902,64BO17-00368,53221.0,114511.0,53221.0,0.0,0,Open,,,
1,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000283,Buy <30-Ft Bus For Expansion,111304,64BC17-00408,110663.0,110663.0,101352.02,9310.98,0,Open,,,
2,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000284,Purchase Replacement Van,111215,64BC17-00408,20643.0,44265.0,20643.0,0.0,0,Open,,,
3,2012,Section 5311,CA-18-X052 | 0012000304,2016,Madera County,BCG0000284,Purchase Replacement Van,111215,64BC17-00408,23622.0,44265.0,22868.3,753.7,0,Open,,,
4,2012,Section 5311,CA-18-X052 | 0012000304,2016,Madera County,BCG0000286,Purchase Expansion <30ft Bus,111304,64BC17-00480,22925.0,113319.0,22655.51,269.49,0,Open,,,


In [5]:
# Check out the last five rows (the counterpart of head(), which shows the first five)
df.tail()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
2760,2022,Section 5311(f),TBD | 0022000356-F,2022,Sunline Transit Agency,BCG0003870,Operating Assistance Sliding Scale - 5311(f) - Route10,300902,,257375.0,0.0,0.0,257375.0,0,Open,,,
2761,2022,Section 5311(f),TBD | 0022000356-F,2022,Trinity County Department of Transportation,BCG0003993,Operating Assistance Sliding Scale RED/LEW 22/23,300902,,173820.0,0.0,0.0,173820.0,0,Open,,,
2762,2022,Section 5311(f),TBD | 0022000356-F,2022,Trinity County Department of Transportation,BCG0003997,Operating Assistance Sliding Scale WC 22/23,300902,,152038.0,0.0,0.0,152038.0,0,Open,,,
2763,2022,Section 5311(f),TBD | 0022000356-F,2022,Yosemite Area Regional Transportation System,BCG0004056,Operating Assistance Sliding Scale,300902,,300000.0,0.0,0.0,300000.0,0,Open,,,
2764,2022,Section 5311(f),TBD | 0022000356-F,2022,Yurok Tribe Transit,BCG0004031,Operating Assistance Sliding Scale - Orleans to Willow Creek,300902,,116064.0,0.0,0.0,116064.0,0,Open,,,


In [6]:
# Check out how many rows and columns there are, the number of non-null values
# in each column, and the data type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2765 entries, 0 to 2764
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   grant_fiscal_year    2765 non-null   int64  
 1   funding_program      2765 non-null   object 
 2   grant_number         2765 non-null   object 
 3   project_year         2765 non-null   int64  
 4   organization_name    2765 non-null   object 
 5   upin                 2765 non-null   object 
 6   description          2765 non-null   object 
 7   ali                  2765 non-null   object 
 8   contract_number      2498 non-null   object 
 9   allocationamount     2765 non-null   float64
 10  encumbered_amount    2765 non-null   float64
 11  expendedamount       2765 non-null   float64
 12  activebalance        2765 non-null   float64
 13  closedoutbalance     2765 non-null   int64  
 14  project_status       2765 non-null   object 
 15  project_closed_by    0 non-null      f

In [7]:
# The data spans 2011 to 2022. Check out how many projects were funded by year.
# df["column 1"].value_counts()

In [8]:
# Not sure what a function does? Pass it to help() to read its documentation
help(sum)

Help on built-in function sum in module builtins:

sum(iterable, /, start=0)
    Return the sum of a 'start' value (default: 0) plus an iterable of numbers
    
    When the iterable is empty, return the start value.
    This function is intended specifically for use with numeric values and may
    reject non-numeric types.



In [9]:
# Get some basic stats (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,grant_fiscal_year,project_year,allocationamount,encumbered_amount,expendedamount,activebalance,closedoutbalance,project_closed_by,project_closed_date,project_closed_time
count,2765.0,2765.0,2765.0,2765.0,2765.0,2765.0,2765.0,0.0,0.0,0.0
mean,2018.832911,2018.893309,171868.5,213877.0,97967.23,73992.8,0.0,,,
std,1.73839,1.65946,271953.1,428568.8,177828.7,233518.1,0.0,,,
min,2011.0,2016.0,-1526830.0,0.0,0.0,-1526830.0,0.0,,,
25%,2017.0,2017.0,44259.0,56000.0,0.0,0.0,0.0,,,
50%,2019.0,2019.0,73000.0,81200.0,34673.12,3871.84,0.0,,,
75%,2020.0,2020.0,206903.0,243503.0,104553.0,64960.0,0.0,,,
max,2022.0,2022.0,4583490.0,10197510.0,1728208.0,3433762.0,0.0,,,


## Clean up
* [Tutorial](https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html)

### Data type is important. 
* If you have a column of monetary values presented as $139,293.92 and you want to find the mean, this won't work. 
* This column is considered an "object" column due to the dollar sign and comma - the same way as if you typed "caltrans".
    * You'll have to make sure it's a numeric type (integer or float) first.
* Based on df.info(), clean up other columns that aren't the right data type.

In [10]:
# If there are columns that SHOULD be integers but aren't, put their names in
# the list of the for loop below and uncomment the loop. It strips the $ signs
# and commas from each listed column, then changes the column to the int data type.
# regex=False makes "$" mean a plain dollar sign rather than a regex symbol.
# If the values have cents (e.g. 139,293.92), use .astype(float) instead of int.
#
# for c in ["column_one", "column_two", "column_three"]:
#     df[c] = (
#         df[c]
#         .str.replace("$", "", regex=False)
#         .str.replace(",", "", regex=False)
#         .astype(int)
#     )

'\nIf there are columns that SHOULD be an integer but isn\'t: input them into the list\nafter this for loop. This strips empty $ and commas in the columns you list, \nthen changes them to the data type of int.\n\nfor c in ["column_one", "column_two", "column_three"]:\n    df[c] = df[c].str.replace("$", "").str.replace(",", "").astype(int)\n'

### Beware of duplicate values
* Grants data might be manually entered by multiple people. As such, values can be inconsistent. 
* BART, Bay Area Rapid Transit, and Bay Area Rapid Transit (BART) are all the same agency. 
* However, if you are counting the number of unique agencies, these would be counted as 3 different agencies, which is inaccurate.


In [11]:
# Check out your agencies and see if there are any duplicates by
# sorting your column of agencies from A-Z and seeing only unique ones
# df["column"].sort_values().unique()

In [12]:
# Check out the total number of unique values
# df["column"].nunique()

In [13]:
# If there are duplicate values, you can replace them with an existing one
# using a dictionary: uncomment the lines below and fill in your own column
# name and values.
# If every agency is already listed only once, leave this cell commented out.
#
# df["column"] = df["column"].replace(
#     {"old value 1": "correct value 1", "old value 2": "correct value 2"}
# )

'\nIf there are duplicate values, you can replace them with an existing one with a dictionary\nIf this cell is irrelevant,  go up to the top where it says "code" and change it to "markdown". \nYou can also move the three quotation marks at the bottom of this cell to comment out the code.\nIf all the agencies are only listed once.\n\ndf["column"] = df["column"].replace(\n    {"old value 1": "correct value 1", "old value 2": "correct value 2"}\n)\n\n'

## Filter what you want
* You don't necessarily want all the years, all the programs, etc. 
* Filter out what you are interested in.

### Grants you want

In [14]:
# Build a list of the grants you are interested in. A list is handy because
# items are easy to add or delete. The line below displays every distinct
# funding program so the values are easy to grab.
df.funding_program.unique()

array(['Section 5311', '5310 Exp', '5310 Trad', '5311(f) Cont',
       '5339 (National)', '5339 (State)', 'CMAQ (FTA 5311)',
       'Section 5311(f)', 'Toll Credits', '5311(f) Round 2', 'CARES Act',
       'CARES Act (F)', 'ARPA', 'CRRSAA'], dtype=object)

In [15]:
# Paste whatever values you want between the square brackets.
# Each value needs to be in quotes, with a comma after it.
grants_wanted = [
    "Section 5311", "5310 Exp", "5310 Trad",
    "5311(f) Cont", "5339 (National)", "5339 (State)",
    "CMAQ (FTA 5311)", "Section 5311(f)", "5311(f) Round 2",
]

In [16]:
# Keep only the rows whose grant is in my list and store them in a NEW variable.
# Creating a new variable whenever you make changes means the original one is
# always there to refer back to.
df2 = df.loc[df["funding_program"].isin(grants_wanted)]

### Columns you want
* Drop irrelevant columns 

In [17]:
# Count the rows for each funding program that was kept
df2["funding_program"].value_counts()

5310 Trad          986
Section 5311       720
5310 Exp           166
Section 5311(f)    140
5339 (State)       129
5339 (National)     48
CMAQ (FTA 5311)     44
5311(f) Cont        41
5311(f) Round 2     27
Name: funding_program, dtype: int64

In [18]:
# List out all your columns so the names are easy to copy
df2.columns

Index(['grant_fiscal_year', 'funding_program', 'grant_number', 'project_year',
       'organization_name', 'upin', 'description', 'ali', 'contract_number',
       'allocationamount', 'encumbered_amount', 'expendedamount',
       'activebalance', 'closedoutbalance', 'project_status',
       'project_closed_by', 'project_closed_date', 'project_closed_time'],
      dtype='object')

In [19]:
# Preview the first five rows of the filtered dataframe
df2.head()

Unnamed: 0,grant_fiscal_year,funding_program,grant_number,project_year,organization_name,upin,description,ali,contract_number,allocationamount,encumbered_amount,expendedamount,activebalance,closedoutbalance,project_status,project_closed_by,project_closed_date,project_closed_time
0,2011,Section 5311,CA-18-X047 | 0012000083,2016,City of Chowchilla,BCG0000228,Operating Assistance,300902,64BO17-00368,53221.0,114511.0,53221.0,0.0,0,Open,,,
1,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000283,Buy <30-Ft Bus For Expansion,111304,64BC17-00408,110663.0,110663.0,101352.02,9310.98,0,Open,,,
2,2011,Section 5311,CA-18-X047 | 0012000083,2016,Madera County,BCG0000284,Purchase Replacement Van,111215,64BC17-00408,20643.0,44265.0,20643.0,0.0,0,Open,,,
3,2012,Section 5311,CA-18-X052 | 0012000304,2016,Madera County,BCG0000284,Purchase Replacement Van,111215,64BC17-00408,23622.0,44265.0,22868.3,753.7,0,Open,,,
4,2012,Section 5311,CA-18-X052 | 0012000304,2016,Madera County,BCG0000286,Purchase Expansion <30ft Bus,111304,64BC17-00480,22925.0,113319.0,22655.51,269.49,0,Open,,,


In [20]:
# Copy and paste the names of the irrelevant columns into the list below
unwanted_columns = [
    "grant_number", "upin", "description", "ali", "contract_number",
    "allocationamount", "encumbered_amount", "expendedamount",
    "activebalance", "closedoutbalance",
    "project_closed_by", "project_closed_date", "project_closed_time",
]

In [21]:
# Remove the unwanted columns - assign the result to a new dataframe if you wish
df2 = df2.drop(unwanted_columns, axis="columns")

In [22]:
# Check out your hard work with 5 random rows. Is this what you want?
# random_state fixes the random draw, so the same 5 rows show up on every run
df2.sample(5, random_state=0)

Unnamed: 0,grant_fiscal_year,funding_program,project_year,organization_name,project_status
1586,2019,5310 Trad,2019,United Cerebral Palsy of San Luis Obispo County,Open
63,2016,Section 5311,2016,Glenn County Transportation Commission,Open
1337,2019,5310 Trad,2019,Mobility Matters,Open
2598,2022,5339 (State),2022,Tulare County Regional Transportation Agency,Open
582,2017,5310 Trad,2017,"United Cerebral Palsy Association of Greater Sacramento, Inc.",Open


In [23]:
# Keep only the years you want. Check the data type of the column you are
# filtering on first: if the years are stored as text (object) the value needs
# quotes, but if they are integers no quotes are necessary.
df3 = df2[df2["project_year"].gt(2018)]

In [24]:
# Keep only the rows whose funding program contains "5311".
# case=False makes the search ignore upper/lower case, so a search for
# 'ac transit' would match both 'ac transit' and 'AC TRANSIT'.
df_5311 = df3[df3["funding_program"].str.contains("5311", case=False)]

In [25]:
# See which funding programs matched the "5311" search, and how many rows each has
df_5311["funding_program"].value_counts()

Section 5311       416
Section 5311(f)    112
5311(f) Round 2     27
CMAQ (FTA 5311)     24
Name: funding_program, dtype: int64

In [26]:
# Check out the length, aka the number of rows, after filtering
len(df_5311)

579

In [27]:
# Repeat same steps for 5310, make sure to cast this into a different dataframe

In [28]:
# Keep only the rows whose funding program contains "5310"
df_5310 = df3[df3["funding_program"].str.contains("5310", case=False)]

In [29]:
# See which funding programs matched the "5310" search, and how many rows each has
df_5310["funding_program"].value_counts()

5310 Trad    547
5310 Exp      88
Name: funding_program, dtype: int64

In [30]:
# Keep only the rows whose funding program contains "5339"
df_5339 = df3[df3["funding_program"].str.contains("5339", case=False)]

In [31]:
# See which funding programs matched the "5339" search, and how many rows each has
df_5339["funding_program"].value_counts()

5339 (State)       98
5339 (National)    30
Name: funding_program, dtype: int64

In [32]:
# Number of rows left after filtering by grant and by year
len(df3)

1342

In [33]:
# Add up the number of rows in the three grant-specific dataframes
len(df_5310) + len(df_5311) + len(df_5339)

1342

In [34]:
# Sanity check: the three grant-specific dataframes together should have
# exactly as many rows as df3. True means the totals match.
len(df3) == (len(df_5310) + len(df_5311) + len(df_5339))

True

In [35]:
# Collect the column names of df_5311 into a plain Python list
my_common_cols = list(df_5311.columns)

In [36]:
# Display the list of column names
my_common_cols

['grant_fiscal_year',
 'funding_program',
 'project_year',
 'organization_name',
 'project_status']

## Airtable

#### Merge the split dataframes together
<img src= "download.jfif"> 

In [37]:
# Put the funding programs I want into a list
airtable_wanted = [
    "Section 5311", "5310 Exp", "5310 Trad",
    "5311(f) Cont", "5339 (National)", "5339 (State)",
    "CMAQ (FTA 5311)", "Section 5311(f)", "5311(f) Round 2",
]

In [38]:
# Keep only the rows whose funding program is in my list
airtable = df.loc[df["funding_program"].isin(airtable_wanted)]

In [39]:
# Confirm that every grant in the list shows up, with its row count
airtable.funding_program.value_counts()

5310 Trad          986
Section 5311       720
5310 Exp           166
Section 5311(f)    140
5339 (State)       129
5339 (National)     48
CMAQ (FTA 5311)     44
5311(f) Cont        41
5311(f) Round 2     27
Name: funding_program, dtype: int64

In [40]:
# Keep only the projects with a project year later than 2018
airtable = airtable[airtable["project_year"].gt(2018)]

In [41]:
# Keep just the two columns we need. We only care whether an organization
# appeared under a grant at any point after 2018, so the year and the other
# columns are not needed.
airtable = airtable.loc[:, ["funding_program", "organization_name"]]

In [42]:
# Spot-check 3 random rows; random_state fixes the draw so every run shows the same rows
airtable.sample(3, random_state=0)

Unnamed: 0,funding_program,organization_name
1061,5310 Exp,"Desert Access and Mobility, Inc"
1107,5310 Exp,The Center for Independent Living
1526,5310 Trad,Self-Help for the Elderly


In [43]:
# Number of rows left after filtering by grant and by year
len(airtable)

1342

In [44]:
# Split into three dataframes, one per grant. First: funding programs containing "5311"
df_5311 = airtable[airtable["funding_program"].str.contains("5311", case=False)]

In [45]:
# Second: funding programs containing "5310"
df_5310 = airtable[airtable["funding_program"].str.contains("5310", case=False)]

In [46]:
# Third: funding programs containing "5339"
df_5339 = airtable[airtable["funding_program"].str.contains("5339", case=False)]

In [47]:
# Loop over the three dataframes and, for each one, print how many rows belong
# to each "flavor" of the grant program, followed by the total number of rows
for grant_df in (df_5311, df_5310, df_5339):
    print(grant_df["funding_program"].value_counts())
    print(len(grant_df))

Section 5311       416
Section 5311(f)    112
5311(f) Round 2     27
CMAQ (FTA 5311)     24
Name: funding_program, dtype: int64
579
5310 Trad    547
5310 Exp      88
Name: funding_program, dtype: int64
635
5339 (State)       98
5339 (National)    30
Name: funding_program, dtype: int64
128


In [48]:
# First merge: 5311 (left) with 5310 (right), matched on organization name.
# how="outer" keeps the organizations from both sides, and indicator=True adds
# a _merge column saying which side(s) each row came from.
m_5311_5310 = df_5311.merge(
    df_5310, how="outer", on="organization_name", indicator=True
)

In [49]:
# Check out the results: how many rows matched in both dataframes, and how many
# were only in the right (5310) or only in the left (5311) dataframe
m_5311_5310["_merge"].value_counts()

both          1038
right_only     534
left_only      374
Name: _merge, dtype: int64

In [50]:
# Compare the number of rows before and after dropping duplicate organization names.
# Nothing is saved here - this only shows how many rows would be left.
len(m_5311_5310), len(m_5311_5310.drop_duplicates(subset=["organization_name"]))

(1946, 177)

In [60]:
# Now actually drop the duplicate organization names, since the same agency
# shows up many times across the years.
# subset picks which column(s) are compared when looking for duplicates;
# without it, pandas compares all the columns of the dataframe.
m2_5311_5310 = m_5311_5310.drop_duplicates(subset="organization_name")

In [56]:
# Count the merge results again now that each organization appears only once
m2_5311_5310["_merge"].value_counts()

right_only    88
left_only     65
both          24
Name: _merge, dtype: int64

In [61]:
# Give the _merge column a name that says what it compares
m2_5311_5310 = m2_5311_5310.rename(
    {"_merge": "5311_5310_overlap"}, axis="columns"
)

In [62]:
# Swap left_only / right_only / both for labels that name the grants
m2_5311_5310["5311_5310_overlap"] = m2_5311_5310["5311_5310_overlap"].replace(
    to_replace={
        "left_only": "5311 only",
        "right_only": "5310 only",
        "both": "Both 5311 and 5310",
    }
)

In [64]:
# Sample a few rows; random_state fixes the draw so every run shows the same rows
m2_5311_5310.sample(40, random_state=0)

Unnamed: 0,funding_program_x,organization_name,funding_program_y,5311_5310_overlap
1487,,Lighthouse for the Blind and Visually Impaired,5310 Exp,5310 only
1148,Section 5311,County of Sacramento Department of Transportation,,5311 only
782,5311(f) Round 2,Redwood Coast Transit Authority,5310 Trad,Both 5311 and 5310
1872,,The Respite Inn,5310 Trad,5310 only
987,Section 5311,City of California City,,5311 only
978,Section 5311,City of Arcata,,5311 only
1722,,Loma Linda University Health,5310 Trad,5310 only
1921,,Valley Achievement Center,5310 Trad,5310 only
1341,Section 5311,Town of Truckee,5310 Exp,Both 5311 and 5310
1225,Section 5311,Marin County Transit District,,5311 only


In [65]:
# Second merge: the de-duplicated 5311 & 5310 results (left) with 5339 (right),
# again matched on organization name with a _merge indicator column
m3_all = m2_5311_5310.merge(
    df_5339, how="outer", on="organization_name", indicator=True
)

In [66]:
# Once more, keep a single row per organization
m4 = m3_all.drop_duplicates(subset="organization_name")

In [67]:
# Count how many organizations also appear in the 5339 dataframe (both),
# only in the 5311/5310 results (left_only), or only in 5339 (right_only)
m4["_merge"].value_counts()

left_only     152
both           25
right_only      0
Name: _merge, dtype: int64

In [69]:
# Look at each organization next to its two overlap indicators,
# sorted by the result of the 5339 merge (_merge)
m4[['organization_name','5311_5310_overlap','_merge']].sort_values('_merge')

Unnamed: 0,organization_name,5311_5310_overlap,_merge
174,Yurok Tribe Transit,5311 only,left_only
198,Metropolitan Transportation Commission,5310 only,left_only
199,Mobility Matters,5310 only,left_only
200,NCI Affiliates,5310 only,left_only
201,OPARC,5310 only,left_only
202,"Outreach & Escort, Inc.",5310 only,left_only
203,Peninsula Family Service,5310 only,left_only
204,Peninsula Jewish Community Center,5310 only,left_only
205,Peppermint Ridge,5310 only,left_only
197,Lutheran Social Services of Southern California,5310 only,left_only


In [None]:
# Use a function to turn the two merge indicators into one clear label
# Adapted from https://github.com/cal-itp/data-analyses/blob/main/grant_misc/A2_dla.ipynb
# df is the argument of the function: one row of the merged dataframe
def recategorize(df):
    """Return one label naming the grant programs an organization appears in.

    Parameters
    ----------
    df : one row of the merged dataframe (anything that supports
        ``row["column"]`` lookups works). It needs two entries:

        * ``_merge``: "left_only", "both" or "right_only", from merging the
          5311/5310 results (left) with the 5339 dataframe (right).
        * ``5311_5310_overlap``: "5311 only", "5310 only" or
          "Both 5311 and 5310". Not read when ``_merge`` is "right_only",
          because such a row has no 5311/5310 information.

    Returns
    -------
    str
        One of "5311 Only", "5310 Only", "5339 Only", "5311 and 5310",
        "5311 and 5339", "5310 and 5339", "5311, 5310 and 5339", or
        "Unknown" when the combination of values is not recognized.
    """
    merge_status = df["_merge"]

    # A right_only row came from the 5339 dataframe alone
    if merge_status == "right_only":
        return "5339 Only"

    # (5339 merge result, 5311/5310 overlap) -> final label
    labels = {
        ("left_only", "5311 only"): "5311 Only",
        ("left_only", "5310 only"): "5310 Only",
        ("left_only", "Both 5311 and 5310"): "5311 and 5310",
        ("both", "5311 only"): "5311 and 5339",
        ("both", "5310 only"): "5310 and 5339",
        ("both", "Both 5311 and 5310"): "5311, 5310 and 5339",
    }
    return labels.get((merge_status, df["5311_5310_overlap"]), "Unknown")
    

In [None]:
# Apply a function along an axis of the DataFrame.
# Axis = 1 means the function receives one row of the df at a time
# Axis = 0 means the function receives one column of the df at a time
# .assign() returns a new dataframe with the _merge column replaced, which
# avoids pandas' SettingWithCopyWarning from writing into a filtered dataframe.
# Run this cell only once: it overwrites _merge with the new labels.
m4 = m4.assign(_merge=m4.apply(recategorize, axis=1))

In [None]:
# Drop any columns you don't want.
# After dropping duplicate organizations, the three funding_program columns only
# hold the program from whichever row was kept, so they are removed here.
# Edit this list to suit your needs.
m4 = m4.drop(
    columns=["funding_program_x", "funding_program_y", "funding_program"]
)

In [None]:
# The merge indicator values are left_only, right_only and both. You can
# change them to something clearer like 'both 5310 and 5311', '5311 only', etc.
# Scroll back up to the 'duplicate values' section to see how to change
# values with a dictionary.
# Create a new copy of column _merge

## Save your work
* You can save all your hard work into a single Excel workbook to our [Google Cloud Storage](https://console.cloud.google.com/storage/browser/calitp-analytics-data/data-analyses/grants;tab=objects?project=cal-itp-data-infra&prefix=&forceOnObjectsSortingFiltering=false).

In [None]:
# Uncomment the lines below to save the results to our GCS bucket.
# with pd.ExcelWriter(
#     "gs://calitp-analytics-data/data-analyses/grants/5310-5311.xlsx"
# ) as writer:
#     m4.to_excel(writer, sheet_name="5310-5311-years", index=False)