In [1]:
import numpy as np
import pandas as pd
import TIRCP_functions
import shared_utils
import altair as alt
import altair_saver
from shared_utils import altair_utils 
from statistics import mode
from collections import Counter

pd.options.display.max_rows = 100
pd.set_option('display.max_colwidth', None)

pd.options.display.float_format = "{:0f}".format
pd.options.display.float_format = '{:,}'.format



In [2]:
df = TIRCP_functions.tableau()

In [3]:
df.columns

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Vendor_ID_#',
       'Project_Title', 'District', 'County', 'Key_Project_Elements',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator',
       'Technical_Assistance-CALTP_(Y/N)', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_Amount', 'Allocated_Amount', 'Unallocated_amt_project_sheet',
       'Percentge_Allocated', 'Expended_Amount', 'Other_Funds_Involved',
       'Award_Cycle', 'Local_Agency_Address', 'Local_Agency_City',
       'Local_Agency_Zip', 'Local_Agency_Contact', 'Local_Agency_Email',
       'Local_Agency_Phone_Number', 'Comments/Additional_Contacts', 'PPNO',
       'Expended_Percent', 'Allocated_Percent', 'Expended_Percent_Group',
       'Progress', 'Project_Category'],
      dtype='object')

In [4]:
df.head(1)

Unnamed: 0,Award_Year,Project_#,Local_Agency,Vendor_ID_#,Project_Title,District,County,Key_Project_Elements,Master_Agreement_Number,Master_Agreement_Expiration_Date,...,Local_Agency_Contact,Local_Agency_Email,Local_Agency_Phone_Number,Comments/Additional_Contacts,PPNO,Expended_Percent,Allocated_Percent,Expended_Percent_Group,Progress,Project_Category
0,2015,1,Antelope Valley Transit Authority (AVTA),TBD,Regional Transit Interconnectivity & Environmental Sustability,7,LA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,64AVTA2015MA,2024-04-01,...,Judy Fry,Jfry@avta.com,(611) 729-2234,,CP005,0.889815905011679,1.0,71-100,On Track,Medium


## 1. What % of projects are on time?

In [5]:
df1 = df.groupby(['Progress']).agg({'PPNO':'count'}).rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'}) 
df1['Percentage_of_Projects'] = ((100* df1['Count_of_Projects']/(df1['Count_of_Projects']).sum())).round(0)
sort1 = ['No Expenditures', 'Behind','On Track','Ahead'] 
df1 = df1.loc[sort1].reset_index()
df1

Unnamed: 0,Progress,Count_of_Projects,Percentage_of_Projects
0,No Expenditures,43,58.0
1,Behind,5,7.0
2,On Track,21,28.0
3,Ahead,5,7.0


In [6]:
chart_progress = TIRCP_functions.basic_bar_chart(df1, 'Progress','Percentage_of_Projects', 'Progress') 
chart_progress

## 2. Progress of bigger projects. HELP, how to re-arrange from small -> med -> large

In [7]:
df2 = df.groupby(['Project_Category', 'Progress']).agg({'PPNO':'count', 'TIRCP_Amount':'sum'}).rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'}) 
df2['Percentage_of_Projects'] = ((100* df2['Count_of_Projects']/(df2['Count_of_Projects']).sum())).round(0)
sort2 = ['Small', 'Medium','Large'] 
df2 = df2.loc[sort2].reset_index()
df2

Unnamed: 0,Project_Category,Progress,Count_of_Projects,Sum_of_TIRCP,Percentage_of_Projects
0,Small,Behind,1,1675000.0,1.0
1,Small,No Expenditures,11,49645000.0,15.0
2,Small,On Track,4,15301000.0,5.0
3,Medium,Ahead,3,79617000.0,4.0
4,Medium,Behind,2,19999000.0,3.0
5,Medium,No Expenditures,20,404971000.0,27.0
6,Medium,On Track,11,210604000.0,15.0
7,Large,Ahead,2,580840000.0,3.0
8,Large,Behind,2,469209000.0,3.0
9,Large,No Expenditures,10,2634906000.0,14.0


In [8]:
chart_project_size2 = TIRCP_functions.basic_bar_chart(df2, 'Project_Category','Count_of_Projects', 'Progress') 
chart_project_size2

In [9]:
chart_project_size3 = TIRCP_functions.basic_bar_chart(df2, 'Project_Category','Percentage_of_Projects', 'Progress') 
chart_project_size3

## 3. Which organizations received the most $?

In [10]:
df3 = df.groupby(['Local_Agency']).agg({'PPNO':'count', 'TIRCP_Amount':'sum'}).reset_index().rename(columns = {'PPNO':'Count_of_Projects','TIRCP_Amount':'TIRCP_Received'}) 
df3['Percentage_of_Projects'] = ((100* df3['Count_of_Projects']/(df3['Count_of_Projects']).sum())).round(0)
df3= df3.sort_values('TIRCP_Received').tail(5)
df3

Unnamed: 0,Local_Agency,Count_of_Projects,TIRCP_Received,Percentage_of_Projects
3,Bay Area Rapid Transit District (BART),2,425700000.0,3.0
27,San Joaquin Joint Powers Authority (SJJPA) & San Joaquin Regional Rail Commission (SJRRC),1,500500000.0,1.0
32,Santa Clara Valley Transportation Authority,2,750000000.0,3.0
37,Southern California Regional Rail Authority,2,916889000.0,3.0
15,Los Angeles County Metropolitan Transportation Authority,4,1236202000.0,5.0


In [11]:
org_most_money = TIRCP_functions.basic_bar_chart(df3, 'Local_Agency','TIRCP_Received','Local_Agency') 
org_most_money

## 4. Project Type
* [Picking out keywords](https://www.geeksforgeeks.org/python-program-for-most-frequent-word-in-strings-list/)
* [TIRCP](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* [2018 Projects](https://calsta.ca.gov/-/media/calsta-media/documents/2018-tircp-detailed-project-award-announcement.pdf)

<b> Goals of TIRCP </b>
1. Modernize California's transit. 
2. Reduce emissions of greenhouse gases
3. Expand and improve transit service to increase ridership
4. Integrate the rail service of the state’s various rail operations, including integration with the high‐speed rail system
5. Improve transit safety



In [12]:
#convert all key project elements to list to read through
project_elements_list = df['Key_Project_Elements'].unique().tolist()

In [14]:
#Modernizing transit
MODERNIZE = ['install','communications upgrade', 'construct tracks', 'infrastructure', 
               'extension', 'lengthens platforms','modernize', 'expanding', 'network integration']
#Reducing environmental footprint
ENVIRONMENT = ['electric','zero-emission', 'Reduce Emissions', 
               'battery','hydrogen fuel-cell',
              'clean','emissions','emission','greenhouse gas']
#Encouraging more ridership & improving riders' experiences
EXPANSION = ['passengers', 'increase use of transit', 'increase transit service', 
             'expand service','service expansion', 'expansion','increase frequencies', 
             'customer focused', 'frequency', 'parking spaces','on-time performance',
           'capacity-increasing', 'Increase ridership','mobility','limited access',
           ]
#Safety
SAFETY = ['safety','safe','overcrowding']

#Expanding transit & integrating different geographies/transit systems together
INTEGRATION = ['connecting','linking', 'seamless', 'service extension','connections', 'extend',
            'transit-only', 'expand services', 'extend','extended', 'expansion','expands','link','track']
#Purchasing vehicles
VEHICLES = ['bus','rail', 'light rail vehicles', 'trolley vehicles', 'locomotives', 
            'vanpool', 'electrification', 'streetcar', 'LRV', 'low-floor rail vehicles',
           'Zero Emission Multiple Unit (ZEMU) train','ZEMU', 'buses', 'powered automated people mover (APM)']


In [15]:
def categorize_project_descriptions(row):
    """
    This function takes a individual type of work description (row of a dataframe)
    and returns a dummy flag of 1 if it finds keyword present in
    project categories (active transportation, transit, bridge, etc).
    A description can contain multiple keywords across categories.
    """
    # Make lowercase
    description = row.Key_Project_Elements.lower()
    
    # Store a bunch of columns that will be flagged
    # A project can involve multiple things...also, not sure what's in the descriptions
    environment = 0
    expansion = 0
    safety = 0
    integration = 0
    expansion = 0
    vehicles = 0

    if any(word in description for word in ENVIRONMENT):
        environment = 1
    if any(word in description for word in EXPANSION):
        ridership = 1
    if any(word in description for word in MODERNIZE):
        modernize = 1
    if any(word in description for word in SAFETY):
        safety = 1
    if any(word in description for word in INTEGRATION):
        integration = 1
    if any(word in description for word in VEHICLES):
        vehicles = 1        
   
    return pd.Series(
        [environment, expansion, safety, integration, expansion, vehicles], 
        index=['environment', 'modernize', 'expansion', 'vehicles','safety','integration']
    )

In [16]:
project_categories = df.apply(categorize_project_descriptions, axis=1)

In [17]:
keyword_df = pd.concat([df, project_categories], axis=1)

### Most projects belonged in 2 categories, followed by 1. 

In [19]:
project_cols = list(project_categories.columns)

In [25]:
keyword_df = keyword_df.assign(
    project_categories = keyword_df[project_cols].sum(axis=1)
)

keyword_df.project_categories.value_counts()

2    29
1    25
3    15
0     4
4     1
Name: project_categories, dtype: int64

In [23]:
keyword_df[['Award_Year','Key_Project_Elements', 'environment', 'modernize', 'expansion', 'vehicles','safety','integration','project_categories']]

Unnamed: 0,Award_Year,Key_Project_Elements,environment,modernize,expansion,vehicles,safety,integration,project_categories
0,2015,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,1,0,0,0,0,1,2
1,2015,"Track and curve improvements between San Jose and Martinez for faster journeys benefiting Capitol Corridor, ACE, and San Joaquins passengers",0,0,0,1,0,0,1
2,2015,"Replace Blue Line signal system; install new track crossovers, new train controls at 15 locations, new LED signals and power switches, 19 turnouts, new track, overhead catenary, and a communications upgrade.",0,0,0,1,0,0,1
3,2015,LOSSAN and 12 transit agencies from San Luis Obispo to San Diego counties to use seamless ticketing to increase use of transit,0,0,0,1,0,0,1
4,2015,"Renovation and expansion of the Monterey maintenance and operations facility, Reduced Deadheads for 30 buses, increased transit service connecting East Salinas and the Salinas Intermodal Station",0,0,0,1,0,1,2
5,2015,Purchase five 40-foot CNG buses for BRT Route linking SARTC to Metrolink/Amtrak,0,0,0,1,0,1,2
6,2015,Refurbishment of Seven UTDC Light Rail Vehicles Used Throughout the System,0,0,0,0,0,1,1
7,2015,Bus Rapid Transit Route between Downtown San Diego and the Otay Mesa Crossing,0,0,0,0,0,1,1
8,2015,"Courthouse Trolley Station (Design), Courthouse Trolley Station (Construction), Purchase 8 trolley vehicles to expand service on the Blue and Orange lines",0,0,0,0,0,1,1
9,2015,Purchase 8 Zero-emission Light Rail Vehicles for MUNI Service Expansion,1,0,0,1,0,1,3


In [42]:
df7 = keyword_df.groupby(['environment', 'modernize', 'expansion', 'vehicles','safety','integration','project_categories']).agg({'PPNO':'count', 'TIRCP_Amount':'sum'}).reset_index().rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'})

In [47]:
df7 = df7.sort_values('project_categories', ascending = False)
df7

Unnamed: 0,environment,modernize,expansion,vehicles,safety,integration,project_categories,Count_of_Projects,Sum_of_TIRCP
8,1,0,1,1,0,1,4,1,13009000.0
7,1,0,0,1,0,1,3,15,516079000.0
3,0,0,0,1,0,1,2,15,2792518000.0
5,1,0,0,0,0,1,2,12,256352000.0
6,1,0,0,1,0,0,2,2,24060000.0
1,0,0,0,0,0,1,1,16,1117457000.0
2,0,0,0,1,0,0,1,8,897888000.0
4,1,0,0,0,0,0,1,1,200000.0
0,0,0,0,0,0,0,0,4,146931000.0


In [46]:
project_description = TIRCP_functions.basic_bar_chart(df7, 'Count_of_Projects','Sum_of_TIRCP','project_categories') 
project_description

## 5. Projects with the most TIRCP funding with 0 recorded expenditures

In [26]:
df4 = df.loc[df['Progress'] == 'No Expenditures'] 

In [27]:
df5 = df4.groupby(['Local_Agency','Project_Title','Award_Year']).agg({'TIRCP_Amount':'sum'}).reset_index().rename(columns = {'PPNO':'Count_of_Projects','TIRCP_Amount':'TIRCP_Received', 'Project_Title':'Projects'}) 

In [28]:
#Looking at projects with no expenditures 
df6 = df5.sort_values('TIRCP_Received', ascending = False).head(5)
df6

Unnamed: 0,Local_Agency,Projects,Award_Year,TIRCP_Received
13,Los Angeles County Metropolitan Transportation Authority,Los Angeles Region Transit System Integration and Modernization Program of Projects,2018,1088499000.0
36,Santa Clara Valley Transportation Authority,"VTA’s BART Silicon Valley Extension, Phase II",2018,730000000.0
2,Bay Area Rapid Transit District (BART),The Transbay Corridor Core Capacity Program: Vehicle Acquistion and Communications-Based Train Control System,2018,318600000.0
1,Bay Area Rapid Transit District (BART),The Transbaby Corridor Core Capacity Program: Vehicle Acquisition,2020,107100000.0
9,"LA County Metropolitan Transportation Authority, So Cal Regional Rail Authority (Metrolink)",Metrolink Antelope Valley Line Capital and Service Improvements,2020,107050000.0


In [29]:
projects_no_expenditures = TIRCP_functions.basic_bar_chart(df6, 'Projects','TIRCP_Received','Local_Agency') 
projects_no_expenditures

## 6. TIRCP funding by districts
* Change districts temporarily to all strings

In [30]:
df8 = df.groupby(['District']).agg({'PPNO':'count', 'TIRCP_Amount':'sum'}).rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'}).reset_index()

In [31]:
df8 = df8.sort_values('Sum_of_TIRCP', ascending = False)
df8

Unnamed: 0,District,Count_of_Projects,Sum_of_TIRCP
5,7,17,2458538000.0
2,4,14,868506000.0
10,VAR,8,690852000.0
7,10,4,410153000.0
8,11,6,175947000.0
1,3,5,128291000.0
9,12,3,58937000.0
6,8,3,54204000.0
3,5,3,32609000.0
4,6,2,15798000.0


In [32]:
District = TIRCP_functions.basic_scatter_chart(df8, 'District','Sum_of_TIRCP','District') 
District