In [1]:
import numpy as np
import pandas as pd
import TIRCP_functions
import shared_utils
import altair as alt
import altair_saver
from shared_utils import altair_utils 
from statistics import mode
from collections import Counter

pd.options.display.max_rows = 100
pd.set_option('display.max_colwidth', None)

pd.options.display.float_format = "{:0f}".format
pd.options.display.float_format = '{:,}'.format



In [2]:
df = TIRCP_functions.tableau()

In [3]:
df.columns

Index(['Award_Year', 'Project_#', 'Local_Agency', 'Vendor_ID_#',
       'Project_Title', 'District', 'County', 'Key_Project_Elements',
       'Master_Agreement_Number', 'Master_Agreement_Expiration_Date',
       'Project_Manager', 'Regional_Coordinator',
       'Technical_Assistance-CALTP_(Y/N)', 'Technical_Assistance-Fleet_(Y/N)',
       'Technical_Assistance-Network_Integration_(Y/N)',
       'Technical_Assistance-Priority_Population_(Y/N)', 'Total_Project_Cost',
       'TIRCP_Amount', 'Allocated_Amount', 'Unallocated_amt_project_sheet',
       'Percentge_Allocated', 'Expended_Amount', 'Other_Funds_Involved',
       'Award_Cycle', 'Local_Agency_Address', 'Local_Agency_City',
       'Local_Agency_Zip', 'Local_Agency_Contact', 'Local_Agency_Email',
       'Local_Agency_Phone_Number', 'Comments/Additional_Contacts', 'PPNO',
       'Expended_Percent', 'Allocated_Percent', 'Expended_Percent_Group',
       'Progress', 'Project_Category'],
      dtype='object')

## 1. What % of projects are on track? HELP, how to add % to graph?
* 28% of projects are on track by our measures. 
* However, 58% of projects have NO expenditure. 


In [4]:
df1 = df.groupby(['Progress']).agg({'PPNO':'count'}).rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'}) 
df1['Percentage'] = ((100* df1['Count_of_Projects']/(df1['Count_of_Projects']).sum())).round(0)

df1 = df1.reset_index().sort_values('Percentage')
df1

Unnamed: 0,Progress,Count_of_Projects,Percentage
0,Ahead,5,7.0
1,Behind,5,7.0
3,On Track,21,28.0
2,No Expenditures,43,58.0


In [5]:
chart_progress = TIRCP_functions.basic_bar_chart(df1, 'Progress','Percentage', 'Progress') 
chart_progress

## 2. How is the progress for medium vs small vs large projects? 
* Used describe() to find percentiles for TIRCP amount column. 
* Most projects are medium (36), followed by large (19), small (16), and $0 in TIRCP amount (3). 
* Most projects are "medium" with no expenditures. 

In [6]:
df.groupby('Project_Category').agg({'PPNO':'count'})

Unnamed: 0_level_0,PPNO
Project_Category,Unnamed: 1_level_1
$0 recorded for TIRCP,3
Large,19
Medium,36
Small,16


* Most small projects had no expenditures. NONE of the msall projects are on track.
* Most medium projects had no expenditures, followed by on track.
* Most large projects had no expenditures, followed by on track.


In [7]:
df2 = (df.groupby(['Project_Category', 'Progress'])
       .agg({'PPNO':'count', 'TIRCP_Amount':'sum'})
       .rename(columns = {'PPNO':'Number_of_Projects', 'TIRCP_Amount':'Sum_of_TIRCP'}))
df2['Percentage'] = ((100* df2['Number_of_Projects']/(df2['Number_of_Projects']).sum())).round(0)
sort2 = ['Small', 'Medium','Large'] 
df2 = df2.loc[sort2].reset_index()
df2

Unnamed: 0,Project_Category,Progress,Number_of_Projects,Sum_of_TIRCP,Percentage
0,Small,Behind,1,1675000.0,1.0
1,Small,No Expenditures,11,49645000.0,15.0
2,Small,On Track,4,15301000.0,5.0
3,Medium,Ahead,3,79617000.0,4.0
4,Medium,Behind,2,19999000.0,3.0
5,Medium,No Expenditures,20,404971000.0,27.0
6,Medium,On Track,11,210604000.0,15.0
7,Large,Ahead,2,580840000.0,3.0
8,Large,Behind,2,469209000.0,3.0
9,Large,No Expenditures,10,2634906000.0,14.0


In [8]:
chart_project_size4 = TIRCP_functions.basic_bar_chart(df2, 'Progress','Percentage', 'Project_Category') 
chart_project_size4

In [9]:
chart_project_size2 = TIRCP_functions.basic_bar_chart(df2, 'Project_Category','Number_of_Projects', 'Progress') 
chart_project_size2

## 3. Which organizations received the most $ across all the cycles?

* LA Metro received the most ($1.2 billion). 

In [10]:
df3 = (df.groupby(['Local_Agency', 'District'])
       .agg({'PPNO':'count', 'TIRCP_Amount':'sum'})
       .reset_index()
       .rename(columns = {'PPNO':'Number_of_Projects','TIRCP_Amount':'TIRCP_Received'}))
df3= df3.sort_values('TIRCP_Received').tail(5)
df3

Unnamed: 0,Local_Agency,District,Number_of_Projects,TIRCP_Received
3,Bay Area Rapid Transit District (BART),District 4: Bay Area / Oakland,1,318600000.0
28,San Joaquin Regional Rail Commission / San Joaquin Joint Powers Authority,District 10: Stockton,2,400200000.0
27,San Joaquin Joint Powers Authority (SJJPA) & San Joaquin Regional Rail Commission (SJRRC),Various,1,500500000.0
34,Southern California Regional Rail Authority,District 7: Los Angeles,2,916889000.0
16,Los Angeles County Metropolitan Transportation Authority,District 7: Los Angeles,4,1236202000.0


In [11]:
org_most_money = TIRCP_functions.basic_bar_chart(df3, 'Local_Agency','TIRCP_Received','Local_Agency') 
org_most_money

## 4. Project Type - looking at the key project elements column & searching for keywords within that column to figure out which of the goal(s) a particular project meets.
<b> Goals of TIRCP </b>
1. Modernize California's transit. 
2. Reduce emissions of greenhouse gases
3. Expand and improve transit service to increase ridership
4. Integrate the rail service of the state’s various rail operations, including integration with the high‐speed rail system
5. Improve transit safety

<b> Resources </b>

* [Picking out keywords](https://www.geeksforgeeks.org/python-program-for-most-frequent-word-in-strings-list/)
* [TIRCP](https://calsta.ca.gov/subject-areas/transit-intercity-rail-capital-prog)
* [2018 Projects](https://calsta.ca.gov/-/media/calsta-media/documents/2018-tircp-detailed-project-award-announcement.pdf)


In [12]:
#convert all key project elements to list to read through
project_elements_list = df['Key_Project_Elements'].unique().tolist()

In [13]:
project_elements_list

['Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses',
 'Track and curve improvements between San Jose and Martinez for faster journeys benefiting Capitol Corridor, ACE, and San Joaquins passengers',
 'Replace Blue Line signal system; install new track crossovers, new train controls at 15 locations, new LED signals and power switches, 19 turnouts, new track, overhead catenary, and a communications upgrade.',
 'LOSSAN and 12 transit agencies from San Luis Obispo to San Diego counties to use seamless ticketing to increase use of transit',
 'Renovation and expansion of the Monterey maintenance and operations facility, Reduced Deadheads for 30 buses, increased transit service connecting East Salinas and the Salinas Intermodal Station',
 'Purchase five 40-foot CNG buses for BRT Route linking SARTC to Metrolink/Amtrak',
 'Refurbishment of Seven UTDC Light Rail Vehicles Used Throughout the System',
 'Bus Rapid Transit Route between Downtown San Diego and the Otay

In [14]:
#1
ENVIRONMENT = ['electric','zero-emission', 'Reduce Emissions', 
               'battery','hydrogen fuel-cell',
              'clean','emissions','emission','greenhouse gas','Zero Emission Multiple Unit (ZEMU) train','ZEMU']
#2
EXPANSION = ['passengers', 'increase use of transit', 'increase transit service','transit service',
             'customer', 'articulated', 'BRT', 'commuter',
             'expand service','service expansion', 'expansion','increase frequencies', 
             'customer focused', 'frequency', 'parking spaces','on-time performance',
           'capacity-increasing', 'ridership','mobility','limited access','bus','rail', 'light rail vehicles', 'trolley vehicles', 'locomotives', 
            'vanpool', 'electrification', 'streetcar', 'LRV', 'low-floor rail vehicles', 'lrv', 'lrvs',
           'Zero Emission Multiple Unit (ZEMU) train','ZEMU', 'buses', 'powered automated people mover (APM)',
             'LRVs', 'more frequent service','bike',
           ]

#3
SAFETY = ['safety','safe','overcrowding']

#4
MODERNIZE = ['install','communications upgrade', 'construct tracks', 'infrastructure', 
               'extension', 'lengthens platforms','modernize', 'expanding', 'network integration','new']

#5
INTEGRATION = ['connecting','linking', 'seamless', 'service extension','connections', 'extend',
            'transit-only', 'expand services', 'extend','extended', 'expansion','expands','link','track']


In [15]:
def categorize_project_descriptions(row):
    """
    This function takes a individual type of work description (row of a dataframe)
    and returns a dummy flag of 1 if it finds keyword present in
    project categories (active transportation, transit, bridge, etc).
    A description can contain multiple keywords across categories.
    """
    # Make lowercase
    description = row.Key_Project_Elements.lower()
    
    #Flagging columns. If a project doesn't have any of the keywords, flag as 0. If it does, flag as 1.
    environment = 0
    expansion = 0
    safety = 0
    modernize = 0
    integration = 0
    
    if any(word in description for word in ENVIRONMENT):
        environment = 1
    if any(word in description for word in EXPANSION):
        expansion = 1
    if any(word in description for word in SAFETY):
        safety = 1
    if any(word in description for word in MODERNIZE):
        modernize = 1
    if any(word in description for word in INTEGRATION):
        integration = 1
   # Create new cols out of our categories.
    return pd.Series(
        [environment, expansion, safety, modernize, integration], 
        index=['environment', 'expansion', 'safety', 'modernize','integration']
    )

In [16]:
#New DF with the environment/expansion/etc flagged.
project_categories = df.apply(categorize_project_descriptions, axis=1)

In [17]:
#Concating the 2 dataframes together, original and the one just created
keyword_df = pd.concat([df, project_categories], axis=1)

### Most projects meet 2 goals, followed by 1. 
* 1 project met all five goals.
* Most projects met 2 goals, followed by 3. 

In [18]:
#List of all our new cols & get value_counts for how many projects meet 1 or 2 or etc goals.
project_cols = list(project_categories.columns)

keyword_df = keyword_df.assign(
    project_categories = keyword_df[project_cols].sum(axis=1)
)

keyword_df.project_categories.value_counts()

2    27
3    24
1    13
4     9
5     1
Name: project_categories, dtype: int64

In [19]:
#get only columns we are most interested in 
keyword_df2 = keyword_df[['Project_Title','PPNO','Award_Year', 'TIRCP_Amount','Key_Project_Elements', 'environment', 'expansion', 'safety', 'modernize','integration','project_categories']]

In [20]:
#duplicate a TIRCP amount so we can grab mean
keyword_df2['TIRCP_Amount_Mean'] = keyword_df2['TIRCP_Amount'] 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
keyword_df2 = (keyword_df2.groupby(['environment', 'expansion', 'safety',
                     'modernize','integration','project_categories'])
               .agg({'PPNO':'nunique','TIRCP_Amount':'sum','TIRCP_Amount_Mean':'mean'}).reset_index())
keyword_df2.sort_values('PPNO')

Unnamed: 0,environment,expansion,safety,modernize,integration,project_categories,PPNO,TIRCP_Amount,TIRCP_Amount_Mean
6,1,0,0,1,0,2,1,200000.0,200000.0
7,1,0,0,1,1,3,1,9060000.0,9060000.0
12,1,1,1,1,1,5,1,13009000.0,13009000.0
1,0,0,0,1,1,2,2,58494000.0,29247000.0
0,0,0,0,0,1,1,3,103099000.0,34366333.333333336
8,1,1,0,0,0,2,4,73934000.0,18483500.0
9,1,1,0,0,1,3,7,136293000.0,19470428.57142857
5,0,1,0,1,1,3,8,2423393000.0,302924125.0
10,1,1,0,1,0,3,8,182418000.0,22802250.0
2,0,1,0,0,0,1,9,236203000.0,23620300.0


### Looking at the most common goals the TIRCP projects met. 
* 10 projects met the goals of expansion & integration.
* 9 projects met 4/5 goals. 9 projects met expansion & modernize. 

In [22]:
keyword_projects_total = keyword_df2.sort_values('PPNO').tail(5).rename(columns = {'PPNO':'Total_Projects'}) 

In [23]:
Categories1 = ['Expansion, modernize, & integration', 
               'Environment, expansion, & modernize', 
               'Expansion & Modernize', 
               'Environment, expansion, modernize, & integration', 
               'Expansion & Integration']
keyword_projects_total['Goals_Met'] = Categories1

In [24]:
keyword_projects_total

Unnamed: 0,environment,expansion,safety,modernize,integration,project_categories,Total_Projects,TIRCP_Amount,TIRCP_Amount_Mean,Goals_Met
10,1,1,0,1,0,3,8,182418000.0,22802250.0,"Expansion, modernize, & integration"
2,0,1,0,0,0,1,9,236203000.0,23620300.0,"Environment, expansion, & modernize"
4,0,1,0,1,0,2,9,1028185000.0,102818500.0,Expansion & Modernize
11,1,1,0,1,1,4,9,394786000.0,43865111.11111111,"Environment, expansion, modernize, & integration"
3,0,1,0,0,1,2,10,1105420000.0,110542000.0,Expansion & Integration


In [25]:
Project_goals_total_projects = TIRCP_functions.basic_bar_chart(keyword_projects_total, 'Goals_Met','Total_Projects','Goals_Met') 
Project_goals_total_projects

### Looking at the top 5 projects by TIRCP received.
* The project type that receieved the most TIRCP funds meet the goals of expansion, modernize, and integration. 
* Followed by projects that met expansion & integration. 

In [26]:
keyword_df3 = keyword_df2.sort_values('TIRCP_Amount').tail(5)

In [27]:
Categories2 = ['Expansion, expansion, & modernize', 'Environment, expansion, modernize, & integration', 'Expansion & Modernize', 'Expansion & Integration', 'Expansion, Modernize & Integration']

In [28]:
keyword_df3['Project_Goals'] = Categories2
keyword_df3

Unnamed: 0,environment,expansion,safety,modernize,integration,project_categories,PPNO,TIRCP_Amount,TIRCP_Amount_Mean,Project_Goals
2,0,1,0,0,0,1,9,236203000.0,23620300.0,"Expansion, expansion, & modernize"
11,1,1,0,1,1,4,9,394786000.0,43865111.11111111,"Environment, expansion, modernize, & integration"
4,0,1,0,1,0,2,9,1028185000.0,102818500.0,Expansion & Modernize
3,0,1,0,0,1,2,10,1105420000.0,110542000.0,Expansion & Integration
5,0,1,0,1,1,3,8,2423393000.0,302924125.0,"Expansion, Modernize & Integration"


In [29]:
Project_Goals_TIRCP = TIRCP_functions.basic_bar_chart(keyword_df3, 'Project_Goals','TIRCP_Amount','Project_Goals') 
Project_Goals_TIRCP

### Simpler view to look at projects & how many of the goals it met


In [30]:
value_count_list = keyword_df[['environment', 'expansion', 'safety', 'modernize','integration']]

In [31]:
for c in value_count_list.columns:
    print (value_count_list[c].value_counts())

0    43
1    31
Name: environment, dtype: int64
1    67
0     7
Name: expansion, dtype: int64
0    73
1     1
Name: safety, dtype: int64
1    40
0    34
Name: modernize, dtype: int64
1    41
0    33
Name: integration, dtype: int64


In [32]:
#create a new df out of the results above.
data = [['environment', 31,43], ['expansion',65,9], ['safety', 1,73], ['modernize',40,34], ['integration',41,33]]

df_test = pd.DataFrame(data, columns = ['Goal', 'Projects_in_Category','Projects_NOT_in_Category'])

#percentage of projects that met a goal.
df_test['Percentage'] = ((100* df_test['Projects_in_Category']/(df_test['Projects_in_Category']).sum())).round(0)

#### Results
* 37% of projects had an expansion-related keyword in its keyword component column.
* 23% of projects had an integration-related keyword.
* Only 1 project had a safety related keyword.

In [33]:
df_test

Unnamed: 0,Goal,Projects_in_Category,Projects_NOT_in_Category,Percentage
0,environment,31,43,17.0
1,expansion,65,9,37.0
2,safety,1,73,1.0
3,modernize,40,34,22.0
4,integration,41,33,23.0


In [34]:
Project_Pie_Chart = alt.Chart(df_test).mark_arc().encode(
    theta=alt.Theta(field="Percentage", type="quantitative"),
    color=alt.Color(field="Goal", type="nominal",
                   scale=alt.Scale(range=altair_utils.CALITP_CATEGORY_BOLD_COLORS)
                   ),
)
Project_Pie_Chart

In [35]:
Project_Pie_Chart.save(f"./Charts/Project_Pie_Chart.png")

## 5. Top 5 projects that received the most TIRCP funding but recorded zero expenditures.
* Two projects by BART recorded 0 expenditures. 

In [36]:
df4 = df.loc[df['Progress'] == 'No Expenditures'] 

In [37]:
df5 = (df4.groupby(['Local_Agency','Project_Title',
                    'Award_Year'])
       .agg({'TIRCP_Amount':'sum'})
       .reset_index()
       .rename(columns = {'PPNO':'Count_of_Projects', 'TIRCP_Amount':'Total_TIRCP_Received_with_0_Expenditures', 'Project_Title':'Projects'}) 
      )

In [38]:
#Looking at projects with no expenditures 
df6 = df5.sort_values('Total_TIRCP_Received_with_0_Expenditures').tail(5)
df6

Unnamed: 0,Local_Agency,Projects,Award_Year,Total_TIRCP_Received_with_0_Expenditures
9,"LA County Metropolitan Transportation Authority, So Cal Regional Rail Authority (Metrolink)",Metrolink Antelope Valley Line Capital and Service Improvements,2020,107050000.0
1,Bay Area Rapid Transit District (BART),The Transbaby Corridor Core Capacity Program: Vehicle Acquisition,2020,107100000.0
2,Bay Area Rapid Transit District (BART),The Transbay Corridor Core Capacity Program: Vehicle Acquistion and Communications-Based Train Control System,2018,318600000.0
36,Santa Clara Valley Transportation Authority,"VTA’s BART Silicon Valley Extension, Phase II",2018,730000000.0
13,Los Angeles County Metropolitan Transportation Authority,Los Angeles Region Transit System Integration and Modernization Program of Projects,2018,1088499000.0


In [39]:
projects_no_expenditures = TIRCP_functions.basic_bar_chart(df6, 'Projects','Total_TIRCP_Received_with_0_Expenditures','Projects') 
projects_no_expenditures

## 6. TIRCP funding by districts - 
* 8 projects do not have a district tagged.
* District-1 - Eureka
* District-2 - Redding
* District-3 - Marysville / Sacramento
* District-4 - Bay Area / Oakland
* District-5 - San Luis Obispo / Santa Barbara
* District-6 - Fresno / Bakersfield
* District-7 - Los Angeles
* District-8 - San Bernardino / Riverside
* District-9 - Bishop
* District-10 - Stockton
* District-11 - San Diego
* District-12 - Orange County

In [40]:
df.District.isna().sum()

8

In [41]:
df8 = df.groupby(['District']).agg({'PPNO':'count', 'TIRCP_Amount':'sum'}).rename(columns = {'PPNO':'Total_Projects', 'TIRCP_Amount':'TIRCP_Received'}).reset_index()

In [42]:
df8 = df8.sort_values('TIRCP_Received', ascending = False)
df8

Unnamed: 0,District,Total_Projects,TIRCP_Received
8,District 7: Los Angeles,17,2458538000.0
5,District 4: Bay Area / Oakland,14,868506000.0
10,Various,8,690852000.0
0,District 10: Stockton,4,410153000.0
1,District 11: San Diego,6,175947000.0
4,District 3: Marysville / Sacramento,5,128291000.0
2,District 12: Orange County,3,58937000.0
9,District 8: San Bernardino / Riverside,3,54204000.0
6,District 5: San Luis Obispo / Santa Barbara,3,32609000.0
7,District 6: Fresno / Bakersfield,2,15798000.0


In [43]:
District = TIRCP_functions.basic_bar_chart(df8, 'District','TIRCP_Received','District') 
District