# Lessons Learned

In [1]:
import re as re

import geopandas as gpd
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings: show up to 50 columns and 250 rows,
# never truncate cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 250)
pd.options.display.max_colwidth = None
pd.set_option("display.float_format", "{:.2f}".format)

from collections import Counter
from itertools import chain, combinations

import altair as alt
import branca
import folium
import shared_utils
import utilities
from calitp import *
from shared_utils import altair_utils, styleguide
from siuba import *

# Base GCS location (gs:// URI) for this analysis; file names are appended to it directly.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/consolidated_applications/"



In [2]:
# Excel workbook (under GCS_FILE_PATH) whose sheets are loaded in the next cell.
FILE_NAME = "Con_App_Cleaned.xlsx"

In [3]:
# Read the three worksheets with a single `read_excel` call so the workbook is
# fetched and parsed once instead of three times. Passing a list to
# `sheet_name` returns a dict of DataFrames keyed by sheet name.
con_app_sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=[
        "pivoted_data",
        "cleaned_unpivoted_data",
        "combos_of_funding_programs",
    ],
)

pivoted_data = con_app_sheets["pivoted_data"]
cleaned_unpivoted_data = con_app_sheets["cleaned_unpivoted_data"]
grouped_df = con_app_sheets["combos_of_funding_programs"]

In [4]:
# df_geometry = pd.read_parquet(f"{GCS_FILE_PATH}con_app_gdf.parquet", engine="auto")

In [5]:
# df_geometry = gpd.GeoDataFrame(df_geometry, crs="EPSG:2229")

## Basics
* 121 different organizations applied.
* 346 different applications - determined by project upin numbers.
* Only 3 applications were not submitted using the consolidated application.
* 8 applications were not submitted. 

In [6]:
# Number of distinct project UPINs in the cleaned application data.
cleaned_unpivoted_data["project_upin"].nunique()

346

In [7]:
# Row counts per value of the `consolidated_application` column.
cleaned_unpivoted_data["consolidated_application"].value_counts()

Yes    343
No       3
Name: consolidated_application, dtype: int64

In [8]:
# Row counts per value of the `application_status` column.
cleaned_unpivoted_data["application_status"].value_counts()

Submitted        338
Not Submitted      8
Name: application_status, dtype: int64

## Total Funds Requested: $659,171,700.79
* **Note (CHANGE):** just for state total and federal total.
* Only includes 'state total', 'other federal funds', 'federal total.'

In [9]:
# Sum the `total_state_fed_only` column and keep only the totals row.
# `DataFrame.append` was removed in pandas 2.0, so the totals row is attached
# with `pd.concat`; `ignore_index=True` gives it the same row label as before
# (one past the last data row).
totals = cleaned_unpivoted_data[["total_state_fed_only"]]
totals = pd.concat(
    [totals, totals.sum(numeric_only=True).to_frame().T],
    ignore_index=True,
).tail(1)
totals



Unnamed: 0,total_state_fed_only
346,659171700.79


## How much funding was requested - State v. Federal Grants?
<img src="bar_funding source_by_total.png">

In [10]:
# Keep only the state and federal total columns for the funding-source comparison.
funding_all = cleaned_unpivoted_data.loc[:, ["state_total", "federal_total"]]

In [11]:
# Reduce the frame to one row holding each column's sum.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` with
# `ignore_index=True` produces the same totals row with the same row label.
funding_all = pd.concat(
    [funding_all, funding_all.sum(numeric_only=True).to_frame().T],
    ignore_index=True,
).tail(1)



In [12]:
# Flip the single totals row into one row per funding column, with the old column names in `index`.
funding_all = funding_all.transpose().reset_index()

In [13]:
# Label the two columns for charting. The value column's current label is the
# totals-row label (the number of data rows), so it is looked up by position
# instead of being hard-coded; a hard-coded label would silently stop matching
# if the number of applications changed.
funding_all = funding_all.rename(
    columns={"index": "funding source", funding_all.columns[-1]: "total"}
)

In [14]:
#utilities.basic_bar_chart( funding_all, "funding source", "total", "funding source", "Total Funds Requested")

## What are the most popular funds, looking at amount requested?
<img src= "bar_amount requested_by_grant.png">

In [15]:
# Per-program funding columns (federal and state) used for the
# "amount requested by grant" breakdown.
federal_state_only = cleaned_unpivoted_data.loc[
    :,
    [
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "other_fed_funds_total",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
        "other_state_funds",
    ],
]

In [16]:
# Collapse the frame to one row per grant column with its summed amount.
# This replaces the old append/tail/transpose sequence, which relied on
# `DataFrame.append` (removed in pandas 2.0) and on a hard-coded row label
# (346) that only matched while the data had exactly that many rows.
# `reindex` keeps every original column as a row, even one that
# `numeric_only=True` would drop from the sum.
federal_state_only = (
    federal_state_only.sum(numeric_only=True)
    .reindex(federal_state_only.columns)
    .rename_axis("grant")
    .reset_index(name="amount requested")
)



In [17]:
# Largest requests first.
federal_state_only = federal_state_only.sort_values(
    "amount requested", ascending=False
)
# Round to whole dollars BEFORE casting to int. The previous order
# (`astype('int64').round(0)`) truncated the cents and made `round` a no-op.
federal_state_only["amount requested"] = (
    federal_state_only["amount requested"].round(0).astype("int64")
)
# Human-readable label in whole millions, e.g. "$247.0M".
# NOTE(review): amounts under $500k render as "$0.0M" - confirm that is acceptable.
federal_state_only["amount requested (millions)"] = (
    "$"
    + (federal_state_only["amount requested"].astype(float) / 1000000)
    .round(0)
    .astype(str)
    + "M"
)

In [18]:
# Grant column names listed in the same order as the sorted
# `federal_state_only` table (largest amount requested first).
LEGEND = [
    "lctop__state__funds",
    "other_state_funds",
    "transit_development_act__state__funds",
    "_5311_funds",
    "other_fed_funds_total",
    "_5339_funds",
    "_5311_f__funds",
    "_5311_cmaq_funds",
    "sb1__state_of_good_repair__state__funds",
]

In [19]:
#federal_state_only = utilities.fancy_bar_chart(federal_state_only, LEGEND, "grant", "amount requested", "amount requested (millions)", "Total Funds Requested")

In [20]:
# Display the per-grant totals table built in the cells above.
federal_state_only

Unnamed: 0,grant,amount requested,amount requested (millions)
5,lctop__state__funds,247227100,$247.0M
8,other_state_funds,214945174,$215.0M
7,transit_development_act__state__funds,77680811,$78.0M
0,_5311_funds,56572117,$57.0M
4,other_fed_funds_total,35164354,$35.0M
3,_5339_funds,14814799,$15.0M
1,_5311_f__funds,4726768,$5.0M
2,_5311_cmaq_funds,2151800,$2.0M
6,sb1__state_of_good_repair__state__funds,447150,$0.0M


## Are projects fully funded? 
<b> Methodology </b>
* Totaled out "local total", "federal total", "other federal funds", and "state funds".
* Compared the total above with "total expenses" column. 
<img src='bar_fully_funded_by_number of projects.png'>

In [21]:
# Summarise the `fully_funded` column with utilities.value_function and
# relabel its `values` column for the chart.
fully_funded = utilities.value_function(
    cleaned_unpivoted_data, "fully_funded"
).rename(columns={"values": "number of projects"})

In [22]:
#utilities.basic_bar_chart(fully_funded, "fully_funded", "number of projects", "fully_funded", "Projects by Fully Funded Status")

## What do organizations want to use their funding for?
* Most applications are for projects related to Operating Assistance. 
<img src= "bar_values_by_short_description.png">

In [23]:
# Summarise the `short_description` column.
# NOTE(review): utilities.value_function is defined outside this notebook; other cells read a
# `values` column from its result, which suggests per-category counts - confirm.
project_types = utilities.value_function(cleaned_unpivoted_data, "short_description")

In [24]:
#utilities.basic_bar_chart(project_types, "values", "short_description", "short_description", "Project Types")

## What are the most popular combinations of programs an organization applies for?
* There are only 335 rows in this data set, compared with 346 rows in the original dataframe.
    * This is because I had to pivot the data differently and filtered out any rows with $0 across all monetary fields, as well as rows with values populated <b>only</b> in the state total or federal total columns.


* Looking at the top ten most popular combinations of programs:
    * The largest share of applications applied only for LCTOP (30%), followed by 5311 only (21%).
    

In [25]:
# Summarise the `all_programs` column of the program-combination sheet.
# NOTE(review): utilities.value_function is defined outside this notebook; the next cell reads
# a `values` column from its result, which suggests per-combination counts - confirm.
combos = utilities.value_function(grouped_df, "all_programs")

In [26]:
# Share of each program combination, formatted as a whole-number percentage
# string (e.g. "30%"). `Total` is the sum of the `values` column.
Total = combos["values"].sum()
combos["percentage_of_total_applications"] = (
    combos["values"].div(Total).astype(float).map("{:.0%}".format)
)

In [27]:
# Show the first ten rows of the combinations table.
combos.head(10)

Unnamed: 0,all_programs,values,percentage_of_total_applications
0,LCTOP (State),99,30%
1,5311 (Fed),72,21%
2,"5311 (Fed),Transit Development Act (State)",39,12%
3,5339 (Fed),29,9%
4,5311(f) (Fed),15,4%
5,"5311 (Fed),Other State Funds",14,4%
6,"5339 (Fed),Other State Funds",11,3%
7,"5311(f) (Fed),Transit Development Act (State)",9,3%
8,5311 CMAQ (Fed),6,2%
9,"5311 (Fed),SB1. State of Good Repair (State),Transit Development Act (State)",5,1%


## Analyzing applications that include LCTOP and at least one other grant: 
* 23 total applications.
* 11 different combinations of applications including LCTOP and 1+ program. 
* LCTOP and Other State Funds is the most popular combination, followed by 5311 and LCTOP.

In [28]:
# Keep rows whose program list mentions LCTOP (case-insensitive) and that
# applied to more than one funding program.
lctop_combos = grouped_df[
    grouped_df["all_programs"].str.contains("lctop", case=False)
].loc[lambda rows: rows["count_of_funding_programs_applied"] > 1]

In [29]:
# Count distinct project UPINs per program combination, most frequent first.
lctop_combos = (
    lctop_combos.groupby("all_programs")
    .agg(project_upin=("project_upin", "nunique"))
    .sort_values(by="project_upin", ascending=False)
    .reset_index()
)

In [30]:
# Reader-friendly column headers for the displayed table.
lctop_combos = lctop_combos.rename(
    {
        "all_programs": "program combinations",
        "project_upin": "count of applications in this combination",
    },
    axis="columns",
)

In [31]:

# Render the table with an in-cell bar (colour #8CBCCB) drawn on the application-count column.
lctop_combos.style.bar(subset = ['count of applications in this combination'], color = '#8CBCCB') 

Unnamed: 0,program combinations,count of applications in this combination
0,"LCTOP (State),Other State Funds",5
1,"5311 (Fed),LCTOP (State)",3
2,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Other State Funds",2
3,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Transit Development Act (State),Other State Funds",2
4,"5311 (Fed),LCTOP (State),Transit Development Act (State)",2
5,"5311 (Fed),LCTOP (State),Transit Development Act (State),Other State Funds,Other Federal Funds",2
6,"LCTOP (State),SB1. State of Good Repair (State)",2
7,"5311 (Fed),LCTOP (State),Other Federal Funds",1
8,"5311 CMAQ (Fed),LCTOP (State),Other State Funds",1
9,"LCTOP (State),Other Federal Funds",1


## How many applications did organizations submit?
<b> Methodology </b> 
* <b> Step 1 </b>: Grouped by organization name and counted the number of project upin numbers. Each upin number is unique to each application.
    * Example: Amador Transit applied 6 separate times.
* <b> Step 2 </b>: Group again to see how many organizations applied for 1,2, etc. applications
    * 38 organizations submitted 1 application.
    * 7 organizations submitted 4 applications.
    
<img src = "bar_count of organizations_by_applications submitted.png">

In [32]:
# Step 1: number of application rows (project UPINs) per organization.
applications_per_org = (
    cleaned_unpivoted_data.groupby("organization_name")
    .agg(project_upin=("project_upin", "count"))
    .reset_index()
)

# Step 2: number of organizations for each application count.
apps_submitted = (
    applications_per_org.groupby("project_upin")
    .agg(organization_name=("organization_name", "count"))
    .reset_index()
    .rename(
        {
            "project_upin": "applications submitted",
            "organization_name": "count of organizations",
        },
        axis="columns",
    )
)

In [33]:
# Spell out the application counts as words. Only the counts listed here
# (1-9 and 13) are mapped; any other value is left unchanged.
apps_submitted["applications submitted"] = apps_submitted[
    "applications submitted"
].replace(
    dict(
        zip(
            (1, 2, 3, 4, 5, 6, 7, 8, 9, 13),
            (
                "one",
                "two",
                "three",
                "four",
                "five",
                "six",
                "seven",
                "eight",
                "nine",
                "thirteen",
            ),
        )
    )
)

In [34]:
#utilities.basic_bar_chart(apps_submitted, "count of organizations", "applications submitted", "applications submitted", "# of Organizations and the # of Applications They Submitted")

## What mix of funds did organizations apply for?
<b>Interpretation Examples</b>
* Amador Transit submitted 4 separate applications for 5339 only, and 2 separate applications asking for both LCTOP and 5311 funds.
* City of Escalon submitted 1 application for 5311 funds only.
    * They also submitted 2 applications that each applied to a total of 5 programs: 5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Transit Development Act (State),Other State Funds
* Eastern Sierra Transit Authority submitted 13 different applications. 
    * 4 applications were for 5311 only.
    * 1 application for 5311f and Transit Development Act
    * 1 for 5311f, Transit Development Act, and Other State Funds
    * 4 for 5339 only
    * 2 for LCTOP only
    * 1 for LCTOP and SB1 State of Good Repair.

In [35]:
# For every (organization, program combination) pair, record the maximum
# `count_of_funding_programs_applied` and the number of rows in the group.
# Note that `all_programs` is both a grouping key and an aggregated column here.
organizations_combo = grouped_df.groupby(["organization_name", "all_programs"]).agg(
    {"count_of_funding_programs_applied": "max", "all_programs": "count"}
)

In [36]:
# Descriptive headers for the two aggregated columns.
organizations_combo = organizations_combo.rename(
    {
        "count_of_funding_programs_applied": "# of programs in a particular application",
        "all_programs": "# of times the org submitted an application in this particular combination",
    },
    axis="columns",
)

In [37]:
# Preview the first five (organization, combination) rows.
organizations_combo.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,# of programs in a particular application,# of times the org submitted an application in this particular combination
organization_name,all_programs,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda-Contra Costa Transit District,LCTOP (State),1,2
Amador Transit,"5311 (Fed),LCTOP (State)",2,2
Amador Transit,5339 (Fed),1,4
Butte County Association of Governments/ Butte Regional Transit,"5311 (Fed),Transit Development Act (State)",2,2
Butte County Association of Governments/ Butte Regional Transit,"5311(f) (Fed),Transit Development Act (State)",2,1


## Which organizations applied? How much did they ask for?

In [38]:
# Sum the state and federal totals per organization.
organizations = cleaned_unpivoted_data.groupby(
    "organization_name", as_index=False
).agg(
    state_total=("state_total", "sum"),
    federal_total=("federal_total", "sum"),
)


In [39]:
# Combined request per organization (state + federal); keep only the name and total.
organizations = organizations.assign(
    total=organizations["state_total"] + organizations["federal_total"]
)[["organization_name", "total"]]

In [40]:
# Store the total as an integer (the cast drops any fractional part).
organizations = organizations.astype({"total": "int64"})

In [41]:
def color(value):
    """Return the CSS ``background-color`` rule for a funding total.

    Values are bucketed by three ascending cut points; each upper bound is
    exclusive, so every value lands in exactly one bucket:

    * ``value < 496292``            -> ``#E16B26``
    * ``496292 <= value < 1435835`` -> ``#EB9F3C``
    * ``1435835 <= value < 3393431``-> ``#f6e7e1``
    * otherwise                     -> ``#8CBCCB``

    The earlier version used strict inequalities on both sides of each range,
    so a value exactly equal to 496292, or between 1435835 and 1435836, fell
    through to the top-bucket colour.

    NOTE(review): the cut points look like quartiles of the totals - confirm.

    Parameters
    ----------
    value : int or float
        The amount to classify.

    Returns
    -------
    str
        A ``"background-color: <hex>"`` string suitable for ``Styler.applymap``.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    if value < 496292.00:
        shade = "#E16B26"
    elif value < 1435835:
        shade = "#EB9F3C"
    elif value < 3393431:
        shade = "#f6e7e1"
    else:
        shade = "#8CBCCB"
    return f"background-color: {shade}"

In [None]:
# Shade each cell of the `total` column with the background colour returned by `color`.
organizations.style.applymap(color, subset=['total'])

Unnamed: 0,organization_name,total
0,Alameda-Contra Costa Transit District,8178530
1,Amador Transit,1310201
2,Butte County Association of Governments/ Butte Regional Transit,4579998
3,Calaveras Transit Agency,1954775
4,City of Arcata,13595
5,City of Arvin,4653108
6,City of Auburn,197992
7,City of Banning,40000
8,City of Beaumont,62000
9,City of California City,196912


## Analyzing Districts
* Most applications were submitted by agencies in District 6. 

In [43]:
# Sizing and font options for map tooltips.
TOOLTIP_KWARGS = dict(
    min_width=50,
    max_width=100,
    font_size="12px",
)

In [44]:
# Column name -> display label used in map popups.
popup_dict = dict(
    DISTRICT="Caltrans District",
    project_upin="Total applications from organizations in this district",
    total_state_fed_only="Total funding requested",
)

In [45]:
# Column name -> display label used in map tooltips (same labels as the popups).
tooltip_dict = dict(
    DISTRICT="Caltrans District",
    project_upin="Total applications from organizations in this district",
    total_state_fed_only="Total funding requested",
)

In [46]:
# Step colormap over the range 0-100 built from three colours and the break values 0, 25 and 60.
# NOTE(review): `index` has the same number of entries as `colors`; how branca bins values in
# that case should be confirmed against the branca documentation.
color_scale_percentiles = branca.colormap.StepColormap(
    colors=["#F6BF16", "#E16B26", "#00896B"],
    index=[0, 25, 60],
    vmin=0,
    vmax=100,
)
# Bare expression so the notebook renders the colormap as the cell output.
color_scale_percentiles

In [47]:
# Local alias for the REGION_CENTROIDS object defined in shared_utils.map_utils.
# NOTE(review): relies on `import shared_utils` making the `map_utils` submodule available - confirm.
REGION_CENTROIDS = shared_utils.map_utils.REGION_CENTROIDS