# Consolidated application data

In [1]:
import re as re

import numpy as np
import pandas as pd

# Notebook-wide display settings: show up to 50 columns and 250 rows, never
# truncate cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 250)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", "{:.2f}".format)

from itertools import chain

from calitp import *
from siuba import *

# Base folder in Google Cloud Storage for this analysis. File names are
# appended by plain string interpolation, so the trailing slash must stay.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/consolidated_applications/"

In [2]:
# Read the application review report from GCS, then normalise its column
# headers with to_snakecase (presumably supplied by the calitp star import --
# verify).
application_report_path = GCS_FILE_PATH + "Copy of Application_Review_Report_5_2_2022.xls"
data = to_snakecase(pd.read_excel(application_report_path))

In [3]:
# (rows, columns) of the report as loaded; a baseline for the checks below.
data.shape

(346, 24)

In [4]:
# Number of missing values in each column.
data.isnull().sum()

organization_name                            0
district                                     6
application_name                             0
year                                         0
application_status                           0
project_upin                                 0
project_category                             0
project_line_item__ali_                      0
project_description                          0
is_stimulus                                  0
consolidated_application                     0
total_expenses                               0
_5311_funds                                  7
_5311_f__funds                               7
_5311_cmaq_funds                             7
_5339_funds                                  7
federal_total                                0
other_fed_funds_total                        7
lctop__state__funds                          7
sb1__state_of_good_repair__state__funds      7
transit_development_act__state__funds        7
other_state_f

In [5]:
# Inferred dtype of each column, to see which ones still need converting.
data.dtypes

organization_name                           object
district                                   float64
application_name                            object
year                                         int64
application_status                          object
project_upin                                object
project_category                            object
project_line_item__ali_                     object
project_description                         object
is_stimulus                                 object
consolidated_application                    object
total_expenses                               int64
_5311_funds                                float64
_5311_f__funds                             float64
_5311_cmaq_funds                           float64
_5339_funds                                float64
federal_total                                int64
other_fed_funds_total                      float64
lctop__state__funds                        float64
sb1__state_of_good_repair__stat

In [6]:
# Drop columns that this analysis does not use.
# errors="ignore" keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
data = data.drop(
    columns=["application_name", "consolidated_application"], errors="ignore"
)

## Organizations

In [7]:
# Remove trailing acronyms: cut everything from the first run of whitespace
# followed by "(" through the end of the name.
# The pattern is a raw string so \s and \( reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes, which emit a
# DeprecationWarning/SyntaxWarning on current Python versions.
data["organization_name"] = data["organization_name"].str.replace(
    r"\s+\(.*$", "", regex=True
)

In [8]:
# One Ventura entry is read in with a trailing non-breaking space (\xa0);
# map that exact value onto the clean spelling.
data["organization_name"] = data["organization_name"].replace(
    to_replace="Ventura County Transportation Commission\xa0",
    value="Ventura County Transportation Commission",
)

## Project Columns

In [9]:
# List the category codes present before spelling them out.
# TODO: find out what the PL and CM codes stand for.
data["project_category"].unique()

array(['OP', 'CA', 'PL', 'CM'], dtype=object)

In [10]:
# Spell out the two category codes handled here; any other code is left as-is.
data["project_category"] = data["project_category"].replace(
    ["OP", "CA"], ["Operating", "Capital"]
)

### Project Descriptions
There are more than 200 distinct project descriptions, many of them long, while the project category included in the data set is very broad. Derive a short description that sits between the two in level of detail.

In [11]:
# Lower-case the descriptions; the keyword pattern used later is all lower case.
data["project_description"] = data.project_description.str.lower()

In [12]:
# How many distinct descriptions are there after lower-casing?
data["project_description"].nunique()

206

In [13]:
# Eyeball the distinct descriptions to pick keywords for grouping.
data["project_description"].unique()

array(['operating assistance (lctop only - project # 1)',
       'purchase misc communications equip (lctop only - project # 2)',
       'operating assistance sliding scale',
       'purchase replacement < 30 ft bus',
       'lctop - purchase replacement van',
       'operating assistance sliding scale - ffy2022',
       'operating assistance sliding scale - ffy2023',
       'operating assistance - free fares',
       'operating assistance sliding scale fy 22',
       'operating assistance sliding scale fy 23',
       'buy 35-39 ft bus for expansion',
       'support equip/facilities-equipment (microgrid etc)',
       'operating assistance', 'operating assistance - (lctop only) #1',
       'operating assistance (lctop only)',
       'operating assistance sliding scale ffy2022',
       'operating assistance sliding scale ffy2023',
       'purchase replacement std 30-34\xa0ft bus (lctop only)',
       'construction - bus shelters (lctop only)',
       'operating assistance sliding scale 

In [14]:
# Keywords searched for in each description. str.extract keeps the first
# match found scanning left to right, and at any given position the
# alternatives are tried in the order listed, so the order here matters.
# No word boundaries are used, so a keyword also matches inside longer words.
description_keywords = [
    "operating",
    "bus",
    "construction",
    "buses",
    "planning",
    "van",
    "vessel",
    "fares",
    "ridership",
    "vehicle",
    "station",
    "service",
    "equipment",
    "maintenance",
    "surveillance",
    "renovate",
    "free",
    "equip",
    "operational",
]
data["short_description"] = data["project_description"].str.extract(
    "(" + "|".join(description_keywords) + ")", expand=False
)

In [15]:
# Collapse the extracted keywords into a handful of broader categories.
# The extraction pattern emits "fares" (plural), which the original mapping
# did not contain (it only had "fare"), so a "fares" match would have been
# left as a raw keyword. Both spellings are mapped now; keys the pattern
# cannot currently produce ("fare", "buses", "vehicles") are kept in case
# the pattern changes.
data["short_description"] = data["short_description"].replace(
    {
        "operating": "operating assistance",
        "operational": "operating assistance",
        "free": "ridership expansion",
        "ridership": "ridership expansion",
        "fare": "ridership expansion",
        "fares": "ridership expansion",
        "service": "service expansion",
        "buses": "purchasing vehicles",
        "bus": "purchasing vehicles",
        "van": "purchasing vehicles",
        "vessel": "purchasing vehicles",
        "vehicles": "purchasing vehicles",
        "vehicle": "purchasing vehicles",
        "planning": "transit planning",
        "station": "construction related",
        "construction": "construction related",
        "maintenance": "construction related",
        "renovate": "construction related",
        "equipment": "purchasing other tech",
        "equip": "purchasing other tech",
        "surveillance": "purchasing other tech",
    }
)

In [16]:
# Descriptions that matched no keyword are still missing; label them explicitly.
data["short_description"] = data["short_description"].where(
    data["short_description"].notna(), "other category"
)

In [17]:
# Number of projects in each derived category.
data["short_description"].value_counts()

operating assistance     204
purchasing vehicles       89
construction related      24
ridership expansion        8
purchasing other tech      7
other category             6
service expansion          5
transit planning           3
Name: short_description, dtype: int64

## Monetary Columns

<b> Local Total </b> 
* This column represents the different types of local funding a project can receive.
* Extract everything after the colons. 

In [18]:
# Keep only the text after the last ": " in each entry.
# NOTE(review): assumes raw values look like "<label>: <amount>" -- confirm
# against the source spreadsheet.
data["local_total"] = data["local_total"].str.split(": ").str.get(-1)

In [19]:
# Strip thousands separators and dollar signs, then convert to float.
# Both patterns are literal text, so regex=False is used. With regex=True,
# "$" is the end-of-string anchor on pandas >= 2.0 (older versions special-cased
# single-character patterns as literals), which leaves the dollar sign in
# place and makes the float conversion fail.
# Missing entries are treated as 0 local funding.
data["local_total"] = (
    data["local_total"]
    .str.replace(",", "", regex=False)
    .str.replace("$", "", regex=False)
    .fillna(0)
    .astype("float")
)

In [20]:
# Funding and expense columns that all receive the same numeric clean-up.
# "local_total" is not listed because it is parsed from text in its own cells.
monetary_cols = [
    "total_expenses",
    "_5311_funds",
    "_5311_f__funds",
    "_5311_cmaq_funds",
    "_5339_funds",
    "federal_total",
    "other_fed_funds_total",
    "lctop__state__funds",
    "sb1__state_of_good_repair__state__funds",
    "transit_development_act__state__funds",
    "other_state_funds",
    "state_total",
]

In [21]:
# Clean up monetary columns: force every column to numeric, then treat
# anything missing as 0 and store as float.
# Coercion runs BEFORE fillna so that values turned into NaN by
# errors="coerce" are also zero-filled; in the opposite order those NaNs
# would survive and propagate into every sum that uses these columns.
data[monetary_cols] = (
    data[monetary_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(value=0)
    .astype("float")
)

In [22]:
# Combined funding per project across the state, local and federal columns.
# NOTE(review): many rows end up with funding above total expenses; check
# whether state_total / federal_total already include the "other_*" columns,
# in which case adding them here counts them twice.
data["total_state_federal_local_funding"] = (
    data["other_state_funds"]
    .add(data["state_total"])
    .add(data["local_total"])
    .add(data["federal_total"])
    .add(data["other_fed_funds_total"])
)

In [23]:
def funding_vs_expenses(df, tolerance=0):
    """Classify one project by comparing its total funding with its expenses.

    Parameters
    ----------
    df : mapping-like row (e.g. a row passed by ``DataFrame.apply(axis=1)``)
        Must provide ``"total_state_federal_local_funding"`` and
        ``"total_expenses"``.
    tolerance : number, default 0
        Largest absolute difference still treated as "fully funded". The
        default of 0 requires exact equality; a small value such as 1 absorbs
        rounding differences between the two totals.

    Returns
    -------
    str
        ``"fully funded"``, ``"funding > total expenses"`` or
        ``"not fully funded"``. If either value is NaN every comparison is
        false and the result is ``"not fully funded"``.
    """
    funding = df["total_state_federal_local_funding"]
    expenses = df["total_expenses"]

    # The exact-equality check is kept alongside the tolerance check so that
    # equal infinities (whose difference is NaN) still count as fully funded.
    if funding == expenses or abs(funding - expenses) <= tolerance:
        return "fully funded"
    if funding > expenses:
        return "funding > total expenses"
    return "not fully funded"

In [24]:
# Label every project row with its funding status (function applied row by row).
data["fully_funded"] = data.apply(funding_vs_expenses, axis="columns")

In [25]:
# Number of projects in each funding status.
data["fully_funded"].value_counts()

fully funded                227
funding > total expenses     84
not fully funded             35
Name: fully_funded, dtype: int64

## Melt dataframe
* Every project has a unique UPIN, so `project_upin` is used as the merge column.

In [26]:
# Row count, to compare with the number of distinct UPINs.
data.shape[0]

346

In [27]:
# Distinct UPINs; equal to the row count when every project appears once.
data["project_upin"].nunique()

346

In [28]:
# Column names available at this point, for building the subsets below.
data.columns

Index(['organization_name', 'district', 'year', 'application_status',
       'project_upin', 'project_category', 'project_line_item__ali_',
       'project_description', 'is_stimulus', 'total_expenses', '_5311_funds',
       '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds', 'federal_total',
       'other_fed_funds_total', 'lctop__state__funds',
       'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'state_total', 'local_total', 'short_description',
       'total_state_federal_local_funding', 'fully_funded'],
      dtype='object')

In [29]:
# Project key plus the funding columns considered for melting.
# NOTE(review): "other_state_funds", "other_fed_funds_total" and "local_total"
# are selected here but are not among the melt's value_vars, so they do not
# appear in the melted result -- confirm that is intended.
monetary_subset = data.loc[
    :,
    [
        "project_upin",
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
        "other_state_funds",
        "other_fed_funds_total",
        "local_total",
    ],
]

In [30]:
# Reshape wide -> long: one row per (project, program) with the amount in
# "funding_received". Columns that are neither the id nor a listed value
# column are dropped by melt.
monetary_subset = monetary_subset.melt(
    id_vars=["project_upin"],
    value_vars=[
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
    ],
    var_name="program_name",
    value_name="funding_received",
)

In [31]:
# Project-level attributes to attach to the melted funding rows; the
# individual funding columns are left out.
data2 = data.loc[
    :,
    [
        "total_expenses",
        "organization_name",
        "district",
        "year",
        "application_status",
        "project_upin",
        "project_category",
        "project_line_item__ali_",
        "project_description",
        "is_stimulus",
        "total_state_federal_local_funding",
        "fully_funded",
        "short_description",
    ],
]

In [32]:
# Attach the project attributes to every melted funding row.
# validate="many_to_one" makes pandas raise MergeError if project_upin is
# ever duplicated in data2, instead of silently multiplying rows.
merge1 = pd.merge(
    monetary_subset,
    data2,
    on="project_upin",
    how="left",
    validate="many_to_one",
)

In [33]:
# Row count after the merge (one row per project per melted program column).
merge1.shape[0]

2422

In [34]:
# The number of distinct UPINs should be unchanged by the merge.
merge1["project_upin"].nunique()

346

In [35]:
# Keep only rows with strictly positive funding to shrink the frame; zero,
# negative and missing amounts are all dropped by this comparison.
merge2 = merge1.loc[merge1["funding_received"].gt(0)]

In [36]:
# (rows, columns) left after removing rows without positive funding.
merge2.shape

(422, 15)

In [37]:
# Preview the first ten rows of the filtered long-format table.
merge2.iloc[:10]

Unnamed: 0,project_upin,program_name,funding_received,total_expenses,organization_name,district,year,application_status,project_category,project_line_item__ali_,project_description,is_stimulus,total_state_federal_local_funding,fully_funded,short_description
2,BCG0003876,_5311_funds,311252.0,853394.0,Amador Transit,10.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,870724.0,funding > total expenses,operating assistance
3,BCG0003877,_5311_funds,317477.0,916170.0,Amador Transit,10.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,932477.0,funding > total expenses,operating assistance
8,BCG0003914,_5311_funds,995458.0,1886992.0,Butte County Association of Governments/ Butte Regional Transit,3.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,1886993.0,funding > total expenses,operating assistance
10,BCG0003916,_5311_funds,1015367.0,2117793.0,Butte County Association of Governments/ Butte Regional Transit,3.0,2022,Submitted,Operating,300902,operating assistance sliding scale,No,2117793.0,fully funded,operating assistance
12,BCG0004040,_5311_funds,501709.0,906758.0,Calaveras Transit Agency,10.0,2022,Submitted,Operating,300901,operating assistance sliding scale - ffy2022,No,1497643.0,funding > total expenses,operating assistance
13,BCG0004041,_5311_funds,493410.0,891759.0,Calaveras Transit Agency,10.0,2022,Submitted,Operating,300902,operating assistance sliding scale - ffy2023,No,1489344.0,funding > total expenses,operating assistance
15,BCG0003996,_5311_funds,151979.0,866979.0,City of Arvin,6.0,2022,Submitted,Operating,300902,operating assistance sliding scale fy 22,No,866979.0,fully funded,operating assistance
16,BCG0004009,_5311_funds,155019.0,885019.0,City of Arvin,6.0,2022,Submitted,Operating,300902,operating assistance sliding scale fy 23,No,885019.0,fully funded,operating assistance
20,BCG0003821,_5311_funds,78492.0,600240.0,City of Auburn,3.0,2022,Submitted,Operating,300901,operating assistance,No,600240.0,fully funded,operating assistance
23,BCG0003842,_5311_funds,97481.0,263507.0,City of California City,9.0,2022,Submitted,Operating,300902,operating assistance sliding scale ffy2022,No,263507.0,fully funded,operating assistance


## Basics
* The programs organizations applied for most often: 5311, LCTOP, and Transit Development Act state funds.
* The median organization applied to 2 different programs.

In [38]:
# Number of funded project rows per program.
merge2["program_name"].value_counts()

_5311_funds                                147
lctop__state__funds                        122
transit_development_act__state__funds       66
_5339_funds                                 40
_5311_f__funds                              27
sb1__state_of_good_repair__state__funds     13
_5311_cmaq_funds                             7
Name: program_name, dtype: int64

In [39]:
# Distinct programs per organization, largest first.
orgs_programs = (
    merge2.groupby(["organization_name"])
    .agg(number_of_programs_applied=("program_name", "nunique"))
    .sort_values("number_of_programs_applied", ascending=False)
)

In [40]:
# Median number of distinct programs per organization.
orgs_programs.number_of_programs_applied.median()

2.0

In [41]:
# Five organizations with the most distinct programs.
orgs_programs.sort_values(by="number_of_programs_applied", ascending=False).iloc[:5]

Unnamed: 0_level_0,number_of_programs_applied
organization_name,Unnamed: 1_level_1
Eastern Sierra Transit Authority,6
Plumas County Transportation Commission,6
Humboldt Transit Authority,5
Lake Transit Authority,4
Mountain Area Regional Transit Authority,4


In [42]:
# Total funding per organization and program.
merge2.groupby(["organization_name", "program_name"])[["funding_received"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,funding_received
organization_name,program_name,Unnamed: 2_level_1
Alameda-Contra Costa Transit District,lctop__state__funds,8178530.0
Amador Transit,_5311_funds,628729.0
Amador Transit,_5339_funds,522000.0
Amador Transit,lctop__state__funds,159472.0
Butte County Association of Governments/ Butte Regional Transit,_5311_f__funds,300000.0
Butte County Association of Governments/ Butte Regional Transit,_5311_funds,2010825.0
Butte County Association of Governments/ Butte Regional Transit,transit_development_act__state__funds,2269173.0
Calaveras Transit Agency,_5311_funds,995119.0
Calaveras Transit Agency,lctop__state__funds,111022.0
Calaveras Transit Agency,transit_development_act__state__funds,848634.0


## Export

In [43]:
# merge2.to_csv(f"./dataframe.csv", index=False)