# Consolidated application data

In [1]:
import re as re

import numpy as np
import pandas as pd

pd.options.display.max_columns = 50
pd.options.display.max_rows = 250
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

from itertools import chain
from collections import Counter
from itertools import combinations

from calitp import *
from siuba import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/consolidated_applications/"

In [2]:
data = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}Copy of Application_Review_Report_5_2_2022.xls")
)

In [3]:
data.shape

(346, 24)

In [4]:
data.isna().sum()

organization_name                            0
district                                     6
application_name                             0
year                                         0
application_status                           0
project_upin                                 0
project_category                             0
project_line_item__ali_                      0
project_description                          0
is_stimulus                                  0
consolidated_application                     0
total_expenses                               0
_5311_funds                                  7
_5311_f__funds                               7
_5311_cmaq_funds                             7
_5339_funds                                  7
federal_total                                0
other_fed_funds_total                        7
lctop__state__funds                          7
sb1__state_of_good_repair__state__funds      7
transit_development_act__state__funds        7
other_state_f

In [5]:
data.dtypes

organization_name                           object
district                                   float64
application_name                            object
year                                         int64
application_status                          object
project_upin                                object
project_category                            object
project_line_item__ali_                     object
project_description                         object
is_stimulus                                 object
consolidated_application                    object
total_expenses                               int64
_5311_funds                                float64
_5311_f__funds                             float64
_5311_cmaq_funds                           float64
_5339_funds                                float64
federal_total                                int64
other_fed_funds_total                      float64
lctop__state__funds                        float64
sb1__state_of_good_repair__stat

In [6]:
f"{data.organization_name.nunique()} unique organizations"

'121 unique organizations'

In [7]:
f"{data.project_upin.nunique()} different projects" 

'346 different projects'

In [8]:
data.consolidated_application.value_counts()

Yes    343
No       3
Name: consolidated_application, dtype: int64

In [9]:
data.application_status.value_counts()

Submitted        338
Not Submitted      8
Name: application_status, dtype: int64

In [10]:
data.head(1)

Unnamed: 0,organization_name,district,application_name,year,application_status,project_upin,project_category,project_line_item__ali_,project_description,is_stimulus,consolidated_application,total_expenses,_5311_funds,_5311_f__funds,_5311_cmaq_funds,_5339_funds,federal_total,other_fed_funds_total,lctop__state__funds,sb1__state_of_good_repair__state__funds,transit_development_act__state__funds,other_state_funds,state_total,local_total
0,Alameda-Contra Costa Transit District (AC Transit),4.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003649,OP,300901,Operating Assistance (LCTOP Only - Project # 1),No,Yes,27795000,0.0,0.0,0.0,0.0,0,0.0,6546867.0,0.0,0.0,0.0,6546867.0,"District operating funds: $21,248,133.00\nTotal Local: $21,248,133.00"


## Organizations

In [11]:
# Remove any acronyms
data["organization_name"] = data["organization_name"].str.replace(
    "\s+\(.*$", "", regex=True
)

In [12]:
# Replace Ventura, read in weirdly
data["organization_name"] = data["organization_name"].replace(
    {
        "Ventura County Transportation Commission\xa0": "Ventura County Transportation Commission"
    }
)

## Project Columns

In [13]:
## Change acronyms to full name. What does PL and CM mean?
data.project_category.unique()

array(['OP', 'CA', 'PL', 'CM'], dtype=object)

In [14]:
data["project_category"] = data["project_category"].replace(
    {
        "OP": "Operating",
        "CA": "Capital",
    }
)

### Project Descriptions
The descriptions are long and there are 200+ of them. The project category included in the data set is pretty vague, manipulate for something in between.

In [15]:
data["project_description"] = data["project_description"].str.lower()

In [16]:
data.project_description.nunique()

206

In [17]:
#data.project_description.unique()

In [18]:
data["short_description"] = data["project_description"].str.extract(
    "(operating|bus|construction|buses|planning|van|vessel|fares|ridership|vehicle|station|service|equipment|maintenance|surveillance|renovate|free|equip|operational)",
    expand=False,
)

In [19]:
data["short_description"] = data["short_description"].replace(
    {
        "operating": "operating assistance",
        "operational": "operating assistance",
        "free": "free fare program",
        "ridership": "ridership expansion",
        "fare": "free fare programs",
        "service": "service expansion",
        "buses": "purchasing vehicles",
        "bus": "purchasing vehicles",
        "van": "purchasing vehicles",
        "vessel": "purchasing vehicles",
        "vehicles": "purchasing vehicles",
        "vehicle": "purchasing vehicles",
        "planning": "transit planning",
        "station": "construction",
        "construction": "construction",
        "maintenance": "maintenance/renovation",
        "renovate": "maintenance/renovation",
        "equipment": "purchasing other tech",
        "equip": "purchasing other tech",
        "surveillance": "purchasing other tech",
    }
)

In [20]:
data["short_description"] = data["short_description"].fillna("other category")

In [21]:
data["short_description"]= data["short_description"].str.title()

In [22]:
data.loc[data['short_description'] == 'Other Category']

Unnamed: 0,organization_name,district,application_name,year,application_status,project_upin,project_category,project_line_item__ali_,project_description,is_stimulus,consolidated_application,total_expenses,_5311_funds,_5311_f__funds,_5311_cmaq_funds,_5339_funds,federal_total,other_fed_funds_total,lctop__state__funds,sb1__state_of_good_repair__state__funds,transit_development_act__state__funds,other_state_funds,state_total,local_total,short_description
156,Lake Transit Authority,1.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003695,Capital,114220,solar canopy project,No,Yes,463988,0.0,0.0,0.0,0.0,0,0.0,463988.0,0.0,0.0,0.0,463988.0,,Other Category
213,Napa Valley Transportation Authority,4.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003662,Operating,300902,fy22 5311f funds,Yes,Yes,497018,0.0,275000.0,0.0,0.0,275000,0.0,0.0,0.0,222018.0,0.0,222018.0,,Other Category
214,Napa Valley Transportation Authority,4.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003663,Operating,300902,fy22 5311 funds,Yes,Yes,533468,0.0,295168.0,0.0,0.0,295168,0.0,0.0,0.0,238300.28,0.0,238300.28,,Other Category
215,Napa Valley Transportation Authority,4.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003666,Operating,300902,fy23 5311 funds,Yes,Yes,544139,301072.0,0.0,0.0,0.0,301072,0.0,0.0,0.0,243066.8,0.0,243066.8,,Other Category
260,San Bernardino County Transportation Authority,8.0,Consolidated Application (Consolidated Application),2022,Submitted,BCG0003746,Operating,300901,san bernardino line double tracking - capital (lctop only),No,Yes,52455000,0.0,0.0,0.0,0.0,0,0.0,2000000.0,0.0,0.0,0.0,2000000.0,,Other Category
324,Tuolumne County Transit Agency,10.0,Consolidated Application (Consolidated Application),2022,Not Submitted,BCG0003792,Operating,300901,lctop,No,Yes,0,0.0,0.0,0.0,0.0,0,0.0,133640.0,0.0,0.0,0.0,133640.0,,Other Category


In [23]:
data.short_description.value_counts()

Operating Assistance      204
Purchasing Vehicles        89
Construction               16
Maintenance/Renovation      8
Purchasing Other Tech       7
Free Fare Program           7
Other Category              6
Service Expansion           5
Transit Planning            3
Ridership Expansion         1
Name: short_description, dtype: int64

In [24]:
#data[['project_description','short_description']].tail(250)

## Monetary Columns

<b> Local Total </b> 
* This column represents the different types of local funding a project can receive.
* Extract everything after the colons. 

In [25]:
data["local_total"] = data["local_total"].str.split(": ").str[-1]

In [26]:
data["local_total"] = (
    data["local_total"]
    .str.replace(",", "", regex=True)
    .str.replace("$", "", regex=True)
    .fillna(0)
    .astype("float")
)

In [27]:
monetary_cols = [
    "total_expenses",
    "_5311_funds",
    "_5311_f__funds",
    "_5311_cmaq_funds",
    "_5339_funds",
    "federal_total",
    "other_fed_funds_total",
    "lctop__state__funds",
    "sb1__state_of_good_repair__state__funds",
    "transit_development_act__state__funds",
    "other_state_funds",
    "state_total",
]

In [28]:
# Clean up monetary columns
data[monetary_cols] = (
    data[monetary_cols]
    .fillna(value=0)
    .apply(pd.to_numeric, errors="coerce")
    .astype("float")
)

In [29]:
data["total_state_federal_local_funding"] = (
    
    data["state_total"]
    + data["local_total"]
    + data["federal_total"]
    + data['other_state_funds']
    + data['other_fed_funds_total']
)

In [30]:
def funding_vs_expenses(df):
    if df["total_state_federal_local_funding"] == df["total_expenses"]:
        return "Fully funded"
    elif df["total_state_federal_local_funding"] > df["total_expenses"]:
        return "Funding exceeds total expenses"
    else:
        return "Not fully funded"

In [31]:
data["fully_funded"] = data.apply(funding_vs_expenses, axis=1)

In [32]:
data.fully_funded.value_counts()

Fully funded                      227
Funding exceeds total expenses     84
Not fully funded                   35
Name: fully_funded, dtype: int64

In [33]:
data[['project_upin', 'local_total']].sample(4)

Unnamed: 0,project_upin,local_total
166,BCG0003939,530121.0
75,BCG0003885,19275.0
23,BCG0003842,166026.0
279,BCG0003705,0.0


## Melt dataframe
* Every project has a unique upin number, use that as the merge column.

In [34]:
len(data)

346

In [35]:
data.project_upin.nunique()

346

In [36]:
data.columns

Index(['organization_name', 'district', 'application_name', 'year',
       'application_status', 'project_upin', 'project_category',
       'project_line_item__ali_', 'project_description', 'is_stimulus',
       'consolidated_application', 'total_expenses', '_5311_funds',
       '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds', 'federal_total',
       'other_fed_funds_total', 'lctop__state__funds',
       'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'state_total', 'local_total', 'short_description',
       'total_state_federal_local_funding', 'fully_funded'],
      dtype='object')

In [37]:
# Keeping only monetary columns I want to melt
monetary_subset = data[
    [
        "project_upin",
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
        "other_state_funds",
        "other_fed_funds_total",
        "local_total",
    ]
]

In [38]:
monetary_subset = pd.melt(
    monetary_subset,
    id_vars=["project_upin"],
    value_vars=[
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
         "other_state_funds",
        "other_fed_funds_total",
        "local_total",
    ],
    var_name="program_name",
    value_name="funding_received",
)

In [39]:
# Delete some irrelevant cols from original data set
data2 = data[
    [
        "total_expenses",
        "organization_name",
        "district",
        "year",
        "application_status",
        "project_upin",
        "project_category",
        "project_line_item__ali_",
        "project_description",
        "is_stimulus",
        "total_state_federal_local_funding",
        "fully_funded",
        "short_description",
    ]
]

In [40]:
# merge original dataframe with melted dataframe to get full information.
merge1 = pd.merge(monetary_subset, data2, on="project_upin", how="left")

In [41]:
merge1.program_name.unique()

array(['_5311_funds', '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds',
       'lctop__state__funds', 'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'other_fed_funds_total', 'local_total'], dtype=object)

In [42]:
merge1["program_name"] = merge1["program_name"].replace(
    {
        '_5311_funds':'5311 (Fed)',
        'lctop__state__funds': 'LCTOP (State)',
        'transit_development_act__state__funds':'Transit Development Act (State)',
        'other_state_funds':'Other State Funds',
        '_5339_funds': '5339 (Fed)',
        '_5311_f__funds': '5311(f) (Fed)',
        'sb1__state_of_good_repair__state__funds': 'SB1. State of Good Repair (State)',
        'other_fed_funds_total': 'Other Federal Funds',
        '_5311_cmaq_funds': '5311 CMAQ (Fed)',
        'local_total': 'Local Funds'
        
    }
)

In [43]:
# looking at new length of merge...
len(merge1)

3460

In [44]:
# double checking that project upin count is still the same
merge1.project_upin.nunique()

346

In [45]:
merge1.program_name.unique()

array(['5311 (Fed)', '5311(f) (Fed)', '5311 CMAQ (Fed)', '5339 (Fed)',
       'LCTOP (State)', 'SB1. State of Good Repair (State)',
       'Transit Development Act (State)', 'Other State Funds',
       'Other Federal Funds', 'Local Funds'], dtype=object)

In [46]:
# filter any zeroes in the funding received column, to make dataframe smaller
melt_df = merge1[merge1["funding_received"] > 0]

In [47]:
melt_df.shape

(641, 15)

In [48]:
melt_df.sample()

Unnamed: 0,project_upin,program_name,funding_received,total_expenses,organization_name,district,year,application_status,project_category,project_line_item__ali_,project_description,is_stimulus,total_state_federal_local_funding,fully_funded,short_description
26,BCG0003814,5311 (Fed),92579.0,344462.0,City of Chowchilla,6.0,2022,Submitted,Operating,300902,operating assistance sliding scale - ffy2023,No,344462.0,Fully funded,Operating Assistance


### Why do upins  suddenly disappear??? 
* Investigation: look at  project upin #s in original dataframe versus pivoted.
* When I pivotted the data, I only included the granular categories (5311/sb1/etc), not the totals. I also filtered out for any $0.
* Found: Projects in "main_list" either
    * Has 0.00 in federal_total, local_total, and state_total
    * Somehow has 0.00 in 5311/5311f/5311cmaq/5339 funds but has the federal_total populated.
    

In [49]:

melt_df.project_upin.nunique()

339

In [50]:
#Investigate
melted_upin = melt_df.project_upin.unique().tolist()

In [51]:
len(melted_upin)

339

In [52]:
data_upin = data.project_upin.unique().tolist()

In [53]:
len(data_upin)

346

In [54]:
main_list = np.setdiff1d(data_upin,melted_upin)
main_list

array(['BCG0003702', 'BCG0003954', 'BCG0003959', 'BCG0004002',
       'BCG0004004', 'BCG0004030', 'BCG0004032'], dtype='<U10')

In [55]:
missing_upin = data[data["project_upin"].isin(['BCG0003702', 'BCG0003954', 'BCG0003959', 'BCG0004002',
       'BCG0004004', 'BCG0004030', 'BCG0004032'])]

In [56]:
# Exporting a dataframe to csv
#missing_upin.to_csv("./dataframe.csv", index=False)

## Program Groups

In [57]:
#filter out local funds
group = melt_df.loc[melt_df["program_name"] != "Local Funds"]

In [58]:
group.program_name.unique()

array(['5311 (Fed)', '5311(f) (Fed)', '5311 CMAQ (Fed)', '5339 (Fed)',
       'LCTOP (State)', 'SB1. State of Good Repair (State)',
       'Transit Development Act (State)', 'Other State Funds',
       'Other Federal Funds'], dtype=object)

In [59]:
group.head(2)

Unnamed: 0,project_upin,program_name,funding_received,total_expenses,organization_name,district,year,application_status,project_category,project_line_item__ali_,project_description,is_stimulus,total_state_federal_local_funding,fully_funded,short_description
2,BCG0003876,5311 (Fed),311252.0,853394.0,Amador Transit,10.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,870724.0,Funding exceeds total expenses,Operating Assistance
3,BCG0003877,5311 (Fed),317477.0,916170.0,Amador Transit,10.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,932477.0,Funding exceeds total expenses,Operating Assistance


In [60]:
#grab all the different program names by project upin and put it in a new column
group["all_programs"] = group.groupby("project_upin")["program_name"].transform(
    lambda x: ",".join(x)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group["all_programs"] = group.groupby("project_upin")["program_name"].transform(


In [61]:
group = group[["project_upin", "all_programs"]].drop_duplicates()

In [62]:
group.shape

(335, 2)

In [63]:
group[['project_upin','all_programs']].head(2)

Unnamed: 0,project_upin,all_programs
2,BCG0003876,"5311 (Fed),LCTOP (State)"
3,BCG0003877,"5311 (Fed),LCTOP (State)"


In [64]:
#merge for other information
grouped_df = pd.merge(group, data, on="project_upin", how="left")

In [65]:
#keep only relevant cols
grouped_df = grouped_df[["project_upin", "organization_name", "project_description","all_programs", "year"]]

In [66]:
#count number of items in all programs
# https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
grouped_df["count_of_funding_programs_applied"] = (
    grouped_df["all_programs"]
    .str.split(",+")
    .str.len()
    .groupby(grouped_df.project_upin)
    .transform("sum")
)

In [67]:
#drop duplicates 
#grouped_df = grouped_df.drop_duplicates(subset=["organization_name", "all_programs", "year"])

In [68]:
grouped_df.sort_values('organization_name').head()

Unnamed: 0,project_upin,organization_name,project_description,all_programs,year,count_of_funding_programs_applied
221,BCG0003649,Alameda-Contra Costa Transit District,operating assistance (lctop only - project # 1),LCTOP (State),2022,1
222,BCG0003684,Alameda-Contra Costa Transit District,purchase misc communications equip (lctop only - project # 2),LCTOP (State),2022,1
184,BCG0003932,Amador Transit,purchase replacement < 30 ft bus,5339 (Fed),2022,1
183,BCG0003931,Amador Transit,purchase replacement < 30 ft bus,5339 (Fed),2022,1
182,BCG0003930,Amador Transit,purchase replacement < 30 ft bus,5339 (Fed),2022,1


## Export


with pd.ExcelWriter(f"{GCS_FILE_PATH}Con_App_Cleaned.xlsx") as writer:
    melt_df.to_excel(writer, sheet_name="pivoted_data", index=False)
    data.to_excel(writer, sheet_name="cleaned_unpivoted_data", index=False)
    grouped_df.to_excel(writer, sheet_name="combos_of_funding_programs", index=False)


## Some Analysis
### Looking at LCTOP

In [69]:
lctop_combos = grouped_df[(grouped_df.all_programs.str.contains("lctop", case=False))]

In [70]:
 lctop_combos = lctop_combos[(lctop_combos['count_of_funding_programs_applied'] > 1)]

In [71]:
lctop_combos = lctop_combos.groupby(['all_programs']).agg({'organization_name':'count'}).sort_values('organization_name', ascending = False).reset_index()

In [72]:
lctop_combos.loc["Grand_total"] = lctop_combos.sum(numeric_only=True)

In [79]:
lctop_combos = lctop_combos.rename(columns = {'all_programs':'program_combos', 'organization_name':'count of orgs who applied for this combo'})

In [80]:
lctop_combos

Unnamed: 0,program_combos,count of orgs who applied for this combo
0,"LCTOP (State),Other State Funds",5.0
1,"5311 (Fed),LCTOP (State)",3.0
2,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Other State Funds",2.0
3,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Transit Development Act (State),Other State Funds",2.0
4,"5311 (Fed),LCTOP (State),Transit Development Act (State)",2.0
5,"5311 (Fed),LCTOP (State),Transit Development Act (State),Other State Funds,Other Federal Funds",2.0
6,"LCTOP (State),SB1. State of Good Repair (State)",2.0
7,"5311 (Fed),LCTOP (State),Other Federal Funds",1.0
8,"5311 CMAQ (Fed),LCTOP (State),Other State Funds",1.0
9,"LCTOP (State),Other Federal Funds",1.0


### Looking at combos of programs orgs applied to

In [74]:
organizations_combo = grouped_df.groupby(['organization_name', 'all_programs']).agg({'count_of_funding_programs_applied':'max','all_programs':'count'})

In [75]:
organizations_combo = organizations_combo.rename(columns = {'count_of_funding_programs_applied':'# of programs','all_programs': 'how many times the org applied for this particular combo'})

In [76]:
organizations_combo

Unnamed: 0_level_0,Unnamed: 1_level_0,# of programs,how many times the org applied for this particular combo
organization_name,all_programs,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda-Contra Costa Transit District,LCTOP (State),1,2
Amador Transit,"5311 (Fed),LCTOP (State)",2,2
Amador Transit,5339 (Fed),1,4
Butte County Association of Governments/ Butte Regional Transit,"5311 (Fed),Transit Development Act (State)",2,2
Butte County Association of Governments/ Butte Regional Transit,"5311(f) (Fed),Transit Development Act (State)",2,1
Calaveras Transit Agency,"5311 (Fed),Transit Development Act (State)",2,2
Calaveras Transit Agency,LCTOP (State),1,1
City of Arcata,LCTOP (State),1,1
City of Arvin,"5311 (Fed),Transit Development Act (State)",2,2
City of Arvin,5339 (Fed),1,3
