# Analyzing

In [1]:
import re as re

import geopandas as gpd
import numpy as np
import pandas as pd

# Notebook-wide pandas display settings: show up to 50 columns and 250 rows,
# never truncate cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

from collections import Counter
from itertools import chain, combinations

from calitp import *
from siuba import *

# Base Google Cloud Storage folder for this analysis. File names are appended
# to it directly (f"{GCS_FILE_PATH}{FILE_NAME}"), so the trailing slash matters.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/consolidated_applications/"



In [2]:
# Excel workbook, located under GCS_FILE_PATH, whose sheets are read into DataFrames.
FILE_NAME = "Con_App_Cleaned.xlsx"

In [3]:
# Load the three sheets with a single read_excel call. Passing a list to
# sheet_name returns a dict of DataFrames keyed by sheet name, so the remote
# workbook is opened and parsed once instead of once per sheet.
_sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=[
        "pivoted_data",
        "cleaned_unpivoted_data",
        "combos_of_funding_programs",
    ],
)
pivoted_data = _sheets["pivoted_data"]
cleaned_unpivoted_data = _sheets["cleaned_unpivoted_data"]
grouped_df = _sheets["combos_of_funding_programs"]

## Districts
* Agencies in District 6 submitted more applications (53) than agencies in any other district.

In [4]:
# Tally rows per district name; value_counts() lists the most frequent first.
cleaned_unpivoted_data["full_district_name"].value_counts()

District 6: Fresno / Bakersfield               53
District 4: Bay Area / Oakland                 47
District 3: Marysville / Sacramento            42
District 2: Redding                            37
District 8: San Bernardino / Riverside         33
District 5: San Luis Obispo / Santa Barbara    32
District 1: Eureka                             30
District 10: Stockton                          28
District 9: Bishop                             19
District 7: Los Angeles                        15
District 11: San Diego                          8
District 12: Orange County                      2
Name: full_district_name, dtype: int64

## Project Types
* Most applications are for projects related to Operating Assistance. 

In [5]:
# Tally rows per project type (short_description), most frequent first.
cleaned_unpivoted_data["short_description"].value_counts()

Operating Assistance      204
Purchasing Vehicles        89
Construction               16
Maintenance/Renovation      8
Purchasing Other Tech       7
Free Fare Program           7
Other Category              6
Service Expansion           5
Transit Planning            3
Ridership Expansion         1
Name: short_description, dtype: int64

## Looking at all combos of programs.
* The most common case, about 30% of applications (99), is an application that applied only for LCTOP funds.
* There are only 335 rows in this data set, down from 346 rows. This is because I had to pivot the data differently and filter out any rows with $0 for the programs listed below, in order to count how many programs an application included.
    * '5311 (Fed)', '5311(f) (Fed)', '5311 CMAQ (Fed)', '5339 (Fed)', 'LCTOP (State)', 'SB1. State of Good Repair (State)','Transit Development Act (State)', 'Other State Funds','Other Federal Funds'

In [6]:
# Count how many applications used each combination of programs.
# Both output columns are named explicitly via rename_axis / reset_index(name=...)
# rather than by renaming the defaults value_counts() produces: pandas 2.0
# changed those defaults (the index now carries the original column name and
# the values are named "count"), so a rename keyed on "index"/"all_programs"
# would label the text column "total applications" on newer versions.
combos = (
    grouped_df["all_programs"]
    .value_counts()
    .rename_axis("combo of programs an organization applied for in one application")
    .reset_index(name="total applications")
)

In [20]:
# Number of applications summed across every program combination.
Total = combos["total applications"].sum()

In [8]:
# Share of all applications held by each combination. Despite the column
# name this is a fraction of Total (e.g. 0.30), not a value scaled to 100.
combos["percentage_of_total_applications"] = combos["total applications"].div(Total)

In [9]:
# Display every program combination with its application count and share of the total.
combos

Unnamed: 0,combo of programs an organization applied for in one application,total applications,percentage_of_total
0,LCTOP (State),99,0.3
1,5311 (Fed),72,0.21
2,"5311 (Fed),Transit Development Act (State)",39,0.12
3,5339 (Fed),29,0.09
4,5311(f) (Fed),15,0.04
5,"5311 (Fed),Other State Funds",14,0.04
6,"5339 (Fed),Other State Funds",11,0.03
7,"5311(f) (Fed),Transit Development Act (State)",9,0.03
8,5311 CMAQ (Fed),6,0.02
9,"5311 (Fed),SB1. State of Good Repair (State),Transit Development Act (State)",5,0.01


## 23 applications include LCTOP and at least one other program.
* 11 different program combinations include LCTOP and at least one other program.
* LCTOP and Other State Funds is the most popular combination, followed by 5311 and LCTOP.

In [10]:
# Keep only the applications whose program list mentions LCTOP (case-insensitive).
lctop_combos = grouped_df.loc[
    grouped_df["all_programs"].str.contains("lctop", case=False)
]

In [11]:
# Retain only applications that requested more than one funding program.
lctop_combos = lctop_combos.loc[
    lctop_combos["count_of_funding_programs_applied"].gt(1)
]

In [12]:
# For each program combination, count the non-null organization_name entries,
# order the combinations from most to least frequent, and move the
# combination back from the index into a regular column.
lctop_combos = (
    lctop_combos.groupby("all_programs")[["organization_name"]]
    .count()
    .sort_values(by="organization_name", ascending=False)
    .reset_index()
)

In [13]:
# Append a "Grand_total" row holding the sum of the numeric column(s).
# Any "Grand_total" row left by an earlier run of this cell is dropped before
# summing, so re-running the cell does not fold the previous total into the
# new one. Non-numeric columns receive no value in this row.
lctop_combos.loc["Grand_total"] = lctop_combos.drop(
    index="Grand_total", errors="ignore"
).sum(numeric_only=True)

In [14]:
# Swap the raw column names for reader-friendly labels before display.
lctop_combos = lctop_combos.rename(
    {
        "all_programs": "program_combos",
        "organization_name": "count of orgs who applied for this combo",
    },
    axis="columns",
)

In [15]:
# Display the count for each multi-program combination that includes LCTOP.
lctop_combos

Unnamed: 0,program_combos,count of orgs who applied for this combo
0,"LCTOP (State),Other State Funds",5.0
1,"5311 (Fed),LCTOP (State)",3.0
2,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Other State Funds",2.0
3,"5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Transit Development Act (State),Other State Funds",2.0
4,"5311 (Fed),LCTOP (State),Transit Development Act (State)",2.0
5,"5311 (Fed),LCTOP (State),Transit Development Act (State),Other State Funds,Other Federal Funds",2.0
6,"LCTOP (State),SB1. State of Good Repair (State)",2.0
7,"5311 (Fed),LCTOP (State),Other Federal Funds",1.0
8,"5311 CMAQ (Fed),LCTOP (State),Other State Funds",1.0
9,"LCTOP (State),Other Federal Funds",1.0


## How many applications organizations submitted on average
* Each organization submitted on average 2.86 separate applications.

In [22]:
# One row per organization with the count of its non-null project_upin values.
apps_submitted = (
    cleaned_unpivoted_data
    .groupby("organization_name")[["project_upin"]]
    .count()
)

In [27]:
# Mean of the per-organization project_upin counts.
apps_submitted["project_upin"].mean()

2.8595041322314048

## Looking at programs organizations applied to
* I consider each unique project_upin to be a unique application submitted by an agency. 

<b>Some Insights</b>
* Amador Transit submitted 4 separate applications only for 5339, and 2 separate applications asking for both LCTOP and 5311 funds.
* City of Escalon had 1 application only for 5311 funds.
    * They submitted 2 applications applying to a total of 5 programs: 5311 (Fed),LCTOP (State),SB1. State of Good Repair (State),Transit Development Act (State),Other State Funds
* Eastern Sierra Transit Authority submitted 13 different applications. 
    * 4 applications were for 5311 only.
    * 1 application for 5311f and Transit Development Act
    * 1 for 5311f, Transit Development Act, and Other State Funds
    * 4 for 5339 only
    * 2 for LCTOP only
    * 1 for LCTOP and SB1 State of Good Repair.

In [16]:
# For every (organization, program combination) pair, record the largest
# program count seen and how many rows carry that combination.
organizations_combo = (
    grouped_df
    .groupby(["organization_name", "all_programs"])
    .agg(
        {
            "count_of_funding_programs_applied": "max",
            "all_programs": "count",
        }
    )
)

In [17]:
# Replace the raw aggregate column names with descriptive labels for display.
organizations_combo = organizations_combo.rename(
    {
        "count_of_funding_programs_applied": "# of programs in a particular application",
        "all_programs": "# of times the org submitted an application with this particular combination",
    },
    axis="columns",
)

In [18]:
# Display, per organization and program combination, the number of programs
# in the application and how many times that combination was submitted.
organizations_combo

Unnamed: 0_level_0,Unnamed: 1_level_0,# of programs in a particular application,# of times the org submitted an application with this particular combination
organization_name,all_programs,Unnamed: 2_level_1,Unnamed: 3_level_1
Alameda-Contra Costa Transit District,LCTOP (State),1,2
Amador Transit,"5311 (Fed),LCTOP (State)",2,2
Amador Transit,5339 (Fed),1,4
Butte County Association of Governments/ Butte Regional Transit,"5311 (Fed),Transit Development Act (State)",2,2
Butte County Association of Governments/ Butte Regional Transit,"5311(f) (Fed),Transit Development Act (State)",2,1
Calaveras Transit Agency,"5311 (Fed),Transit Development Act (State)",2,2
Calaveras Transit Agency,LCTOP (State),1,1
City of Arcata,LCTOP (State),1,1
City of Arvin,"5311 (Fed),Transit Development Act (State)",2,2
City of Arvin,5339 (Fed),1,3


In [19]:
#test = grouped_df.loc[grouped_df["organization_name"] == "Eastern Sierra Transit Authority"]