# First look/Clean Up

* Helpful Links
* [Priority Population/DAC](https://dot.ca.gov/programs/rail-and-mass-transportation/priority-populations-and-disadvantaged-communities)

In [72]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *
from shared_utils import geography_utils, utils
import difflib

#Formatting the nb 
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

In [73]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/lctop/"
FILE_NAME = "LCTOP_allyears.xlsx"

In [74]:
df1 = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="LCTOP_Projects"))

## Checking out the raw data

In [75]:
#Get percentages of how many null values per column
df1.isnull().sum() * 100 / len(df1)

count                                              0.70
#                                                  0.70
funding_year                                       0.70
_d                                                 0.70
distr_                                             0.70
project_id#                                        0.70
lead_agency                                        0.70
project_name                                       0.70
project_type                                       0.58
project_sub_type                                  27.42
project_sub_type_ii                                0.70
#2                                                99.53
project_description__short_                       11.79
project_location                                  28.12
start_date                                        12.02
completion_date                                   27.30
contact_name                                       1.05
contact_phone_#                                 

In [76]:
#Check out that values are what I expect.
value_count_cols = [
    "project_type","lead_agency","distr_", "project_name", "project_id#", "project_sub_type_ii"]

In [77]:
# https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
for column in df1[value_count_cols]:
    print("\n" + column)
    print(df1[column].value_counts())


project_type
Operations    487
Capital       364
851             1
Name: project_type, dtype: int64

lead_agency
Sacramento Regional Transit District                                       27
San Francisco Municipal Transportation Agency                              17
Placer County                                                              16
Orange County Transportation Authority                                     16
Madera County Transportation Commission                                    15
Victor Valley Transit Authority                                            14
Eastern Contra Costa Transit Authority                                     13
Ventura County Transportation Commission                                   12
Riverside Transit Agency                                                   12
Kings County Area Public Transit Agency                                    12
Alameda-Contra Costa Transit District                                      11
Yolo County Transportation D

In [78]:
#Find nunique values 
for column in df1[value_count_cols]:
    print("\n" + column)
    print(df1[column].nunique())


project_type
3

lead_agency
213

distr_
12

project_name
714

project_id#
851

project_sub_type_ii
12


## Clean Up
* Make sure columns are the right data type
* Maybe add a column that searches through benefits added cols and categorize those
* Re calculate the percentage cols: lctop % of total project funds 
* Re calc that total project request 993414+9913 equals PUC 99314
* Compare if my calcs match the data's calc
* Find which col represents allocated amounts


### Drop rows with a ton of N/As

In [79]:

df2 = df1.dropna(subset=['lead_agency', 'project_id#','project_name', 'distr_'])

In [112]:
df2.sample(3)

Unnamed: 0,funding_year,distr_,project_id#,lead_agency,project_name,project_type,project_sub_type,project_sub_type_ii,project_description__short_,project_location,start_date,completion_date,address,"city,_state,_zip_code",county,contributing_sponsor,puc_99313_funds,puc_99314_funds,total_project_request_99314_+_99313,total_lctop_funds,total_cci_funds,total_project_cost,lctop_%_of_total_project_funds,rollover_project,has_project_received_other_ggrf_funds,qm_tool__date_,project_life,ridership_increase,vmt_reduction,ghg_reduction__mtco2e_,date_ghg_reductions_begin,diesel_pm_reductions__lbs_,nox_reductions__lbs_,pm_2_5_reductions__lbs_,reactive_organic_gas_reduction__lbs_,fossil_fuel_use_reduction__transportation_,fossil_fuel_use_reduction__energy_,renewable_energy_generation__kwh_,governors_pillars,support_scoping_plan,co_benefits,calenviroscreen_version,agency_service_area_has_a_dac,does_project_benefit_an_ab_1550_dac,dac_data,ab_1550_low_income_community__household,ab_1550_1_2_mile_low_income_buffer_region,ab_1550_benefit_criteria_table,ab_1550_benefit_selected,identify_specific_ab_1550_group_needs_approach,description_of_ab_1550_community_need,ab_1550_benefit_criteria_met,description_of_ab_1550_criteria,qualifying_disadvantaged_community_benefit,funds_to_benefit_dac,qualifying_low_income_benefit_,funds_to_benefit_lichh_,qualifying_1_2_mile_low_income_buffer_,funds_to_benefit_lichh_w_n_1_2_mile_of_a_dac_,status,lon,lat
107,15-16,3.0,15-16-D03-013,Auburn,Bus Stop Enhancement,Capital,,New transit related amenities,"The purpose of the upgrades will comprise of modifying and upgrading bus stops along the cities current bus route in order to increase ridership. The Improvements will include, but are not limited to new built bus shelters, curb and gutter, grading drainage as well as signage. The City intends to use the 2016/17 LCTOP allocation to continue this work.",,2016-07-01 00:00:00,datetime64[ns],"1225 Lincoln Way \nAuburn CA, 95603",,,"99313:\nPlacer County $10,973",10973.0,315.0,11288.0,0.0,0.0,11288.0,1.0,,,datetime64[ns],0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,2.0,No,No,0.0,,,,,,,,,,0.0,,0.0,,0.0,,,
175,15-16,6.0,15-16-D06-083,Kings County Area Public Transit,Renewable Energy,Capital,,Renewable energy/fuel for transit service,FY 15/16 LCTOP funds will be rolled over with FY 16/17 Funds: Install solar panels at our transit maintenance facility and electric charging station to support electric buses,,2017-06-01 00:00:00,datetime64[ns],"610 W 7th Street\nHanford, CA 93230",,,"99313:\nKings County Association of Governments $120,415",120415.0,0.0,120415.0,0.0,0.0,250000.0,0.48,,,datetime64[ns],0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,2.0,Yes,Yes,1.0,,,,,"PHS 4: Reduce heat-related illnesses and increase thermal comfort (e.g., weatherization and solar energy con provide more efficient and affordable air conditioning; urban forestry can reduce heat-island effect).",,"TP 1E: Project creates or improves infrastructure or equipment that reduces air pollution at a station, stop or transit facility in a disadvantaged community.","Benefits Disadvantaged Communities by making capital improvement that increase service efficiency and reliability, and decrease greenhouse gas emissions.",,120415.0,,0.0,,0.0,,,
76,14-15,8.0,14-15-D08-77,Palo Verde Valley Transit,Bus Stop Improvements,Capital,,New transit related amenities,,,datetime64[ns],datetime64[ns],,,,,8562.0,323.0,8885.0,0.0,0.0,0.0,0.0,,,datetime64[ns],0.0,,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,,1.0,Yes,Yes,1.0,,,,,,,,,,8885.0,,0.0,,0.0,,,


### Some agencies have multiple spellings of its name, clean it up

In [81]:
#Many of the same agencies are spelled slightly different ways 
#Rewrite later 
#https://stackoverflow.com/questions/24554723/str-replace-for-multiple-value-replacement
df2['lead_agency'] = (df2['lead_agency']
                      .str.split("(")
                      .str[0]
                      .str.replace(")", "")
                      .str.replace("-","")
                      .str.replace("Publlic","Public")
                      .str.replace("Regional Transit Authority","")
                      .str.replace("Agency","")
                      .str.replace("Division","")
                      .str.split(",")
                      .str[0]
                      .str.strip()
                     )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [82]:
df2["lead_agency"] = df2["lead_agency"].replace(
    {
        'Stanislaus County Public Works   Transit': 'Stanislaus County Public Works Transit',
        'Stanislaus County Public Works  Transit':  'Stanislaus County Public Works Transit',
        'Stanislaus County Public WorksTransit':  'Stanislaus County Public Works Transit',
        'Victor ValleyTransit Authority': 'Victor Valley Transit Authority',
        'YubaSutter Transit Authority':'Yuba Sutter Transit Authority',
        'YubaSutter Transit':'Yuba Sutter Transit Authority',
    }
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [83]:
#A lot of the same agencies are spelled a few different ways 
df2['lead_agency'].nunique()

184

In [84]:
#df2['lead_agency'].sort_values().unique()

In [85]:
f'The original dataframe is {len(df1)}  rows long but after dropping some rows with missing values, the dataframe is {len(df2)} rows long.'

'The original dataframe is 857  rows long but after dropping some rows with missing values, the dataframe is 851 rows long.'

In [86]:
f"There are {df2['project_id#'].nunique()} unique project IDS"

'There are 851 unique project IDS'

In [87]:
#Drop some columns especially those with a very low percentage of populated values 
df2 = df2.drop(columns = ['count', '#','column3',
       'column4', 'column5','other_state_policies,_plans,_or_initiatives',
       'describe_policies,_plans,_or_initiatives','#2','_d','contact_name','contact_phone_#','contact_e_mail',
       'authorized_agent_name','authorized_agent_title','project_description__short_','project_sub_type'])

In [88]:
#Coerce date-time columns to the write type
date_columns = ['qm_tool__date_', 'completion_date','start_date']

for c in date_columns:
    df2[c] = df2[c].apply(pd.to_datetime, errors='coerce')
    

In [89]:
df2 = df2.fillna(df2.dtypes.replace({'float64': 0.0, 'object': 'None'}))

In [90]:
#Find nunique values 
for column in df2[value_count_cols]:
    print("\n" + column)
    print(df2[column].nunique())


project_type
2

lead_agency
184

distr_
12

project_name
714

project_id#
851

project_sub_type_ii
12


with pd.ExcelWriter(f"{GCS_FILE_PATH}LCTOP_cleaned.xlsx") as writer:
    df2.to_excel(writer, sheet_name="cleaned", index=False)

## Some initial insights

In [91]:
def millions(df, col_name: str): 
    df['Amt in M'] = (
    "$"
    + (df[col_name].astype(float) / 1000000)
    .round(0)
    .astype(str)
    + "M")
    return df 

In [92]:
#Clean column titles
def cols_cleanup(df):
    df.columns = (df.columns
                  .str.replace('[_]', ' ')
                  .str.title()
                  .str.strip()
                 )
    return df

In [93]:
#df2.loc[df2['description_of_ab_1550_community_need'] != "None"]

In [94]:
sum_cols = ['funds_to_benefit_dac','total_project_request_99314_+_99313',
           'total_cci_funds', 'total_project_cost','vmt_reduction',
       'ghg_reduction__mtco2e_', 
       'diesel_pm_reductions__lbs_', 'nox_reductions__lbs_',
       'pm_2_5_reductions__lbs_', 'reactive_organic_gas_reduction__lbs_',
       'fossil_fuel_use_reduction__transportation_',
       'fossil_fuel_use_reduction__energy_', 'renewable_energy_generation__kwh_']
nunique_cols = ['project_id#', 'lead_agency']

In [116]:
funding_year_summary = geography_utils.aggregate_by_geography(
    df2, 
    group_cols=['funding_year'],
    nunique_cols = nunique_cols,
    sum_cols = sum_cols)
funding_year_summary = cols_cleanup(funding_year_summary).sort_values('Project Id#')



In [117]:
funding_year_summary

Unnamed: 0,Funding Year,Diesel Pm Reductions Lbs,Fossil Fuel Use Reduction Energy,Ghg Reduction Mtco2E,Nox Reductions Lbs,Pm 2 5 Reductions Lbs,Reactive Organic Gas Reduction Lbs,Renewable Energy Generation Kwh,Total Cci Funds,Total Project Cost,Total Project Request 99314 + 99313,Vmt Reduction,Lead Agency,Project Id#
0,14-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24165593.0,0.0,83,95
2,16-17,0.0,0.0,4161658.31,0.0,0.0,0.0,0.0,138818320.0,2625839228.0,34539105.0,10181040666.0,105,126
1,15-16,0.0,0.0,10734.81,0.0,0.0,0.0,0.0,0.0,532800958.42,74700760.0,1193035.95,107,132
3,17-18,72745.82,3110295.0,3615714.54,17961394.95,607338.5,1143040.39,0.0,266660391.74,2526653147.52,96864564.5,7682500431.24,109,152
5,19-20,1428119.23,-576698452.72,2431934.0,11556745.3,1408618.81,4032264.9,3360000.0,313451971.0,2889873923.16,146054354.0,3096126163.97,111,166
4,18-19,343172.65,0.0,3062602.71,11417960.94,469422.6,1476528.45,0.0,303783831.0,4137512637.19,146949406.0,4593429702.34,120,180


In [115]:
district_summary = geography_utils.aggregate_by_geography(
    df2, 
    group_cols= ['distr_'],
    nunique_cols = nunique_cols,
    sum_cols = sum_cols,)
district_summary = cols_cleanup(district_summary).sort_values('Project Id#')
district_summary



Unnamed: 0,Distr,Diesel Pm Reductions Lbs,Fossil Fuel Use Reduction Energy,Ghg Reduction Mtco2E,Nox Reductions Lbs,Pm 2 5 Reductions Lbs,Reactive Organic Gas Reduction Lbs,Renewable Energy Generation Kwh,Total Cci Funds,Total Project Cost,Total Project Request 99314 + 99313,Vmt Reduction,Lead Agency,Project Id#
8,9.0,21.53,-61553.5,444.14,277.04,24.34,86.55,0.0,700198.0,935496.0,362380.0,386370.79,4,15
11,12.0,154.4,-6388189.46,39130.42,10306.38,1674.16,1325.24,0.0,26117353.0,48956604.0,25616337.0,67892585.06,1,16
10,11.0,1369.59,-7248795.34,126157.69,228802.58,1602.73,19125.79,0.0,154731450.0,306428906.0,32311113.0,260716705.43,5,23
0,1.0,103.36,-141055.15,18488.67,1301.34,47.65,225.51,0.0,2715911.0,4180995.0,2301045.0,15128514.51,7,32
1,2.0,139.31,-258294.7,3280.63,2732.16,60.49,295.87,0.0,2962149.0,3951858.0,2403382.0,3959178.92,11,37
9,10.0,7528.09,-139669.98,82335.34,156497.16,6160.99,11867.28,0.0,14635343.0,30664161.16,13235978.5,234371667.56,25,64
4,5.0,7601.57,-6142167.96,46010.35,294845.01,7229.38,13245.5,3360000.0,25210033.0,41062474.0,13469015.0,23483066.77,13,68
7,8.0,2289.36,-5313608.14,110781.08,1576223.73,48751.96,32425.84,0.0,128309885.0,299223477.92,33023402.0,1087298714.26,12,82
6,7.0,4088.65,-76712805.83,329208.21,372677.41,7514.35,39588.94,0.0,142793139.0,409086897.76,164605685.0,370945504.51,22,95
5,6.0,612.94,-2333163.16,36642.95,17759.43,411.95,1649.78,0.0,26075253.74,51279701.74,19597846.0,41941021.1,31,126


In [118]:
project_type_summary = geography_utils.aggregate_by_geography(
    df2, 
    group_cols= ['project_sub_type_ii'],
    nunique_cols = nunique_cols,
    sum_cols = sum_cols,)
project_type_summary = cols_cleanup(project_type_summary).sort_values('Lead Agency')
project_type_summary.sort_values('Project Id#')



Unnamed: 0,Project Sub Type Ii,Diesel Pm Reductions Lbs,Fossil Fuel Use Reduction Energy,Ghg Reduction Mtco2E,Nox Reductions Lbs,Pm 2 5 Reductions Lbs,Reactive Organic Gas Reduction Lbs,Renewable Energy Generation Kwh,Total Cci Funds,Total Project Cost,Total Project Request 99314 + 99313,Vmt Reduction,Lead Agency,Project Id#
11,Vehicles upgrades to support new expanded/enhanced transit service,859.88,0.0,12803.34,37990.82,586.47,2126.01,0.0,3143244.0,3504000.0,3143244.0,21802983.65,1,1
9,Alternative transportation services,19.89,0.0,1110.97,331.6,9.99,66.17,0.0,106174.74,256174.74,90977.0,3224984.0,2,2
10,New Transit related amenities,3.25,0.0,60.87,37.57,3.0,8.02,0.0,248027.0,357764.0,126065.0,140883.79,3,3
7,Renewable energy/fuel for transit service,0.0,0.0,13800.06,750825.94,2830.66,21082.87,0.0,2094541.0,3469886.0,2040465.0,12271180.0,9,9
5,Infrastructure to support new expanded/enhanced transit service,10890.61,-9878941.98,1301255.19,134260.97,5069.71,27306.69,3360000.0,90034547.0,564914743.0,19488007.0,4269784558.71,10,16
3,Network/fare integration,141.43,0.0,2030247.88,1067.91,22.3,228.45,0.0,10942584.0,85352573.0,11422334.0,5496118120.35,12,24
6,New vehicles for new expanded/enhanced transit service,486037.7,-8125021.57,363567.19,7026295.15,445024.89,1116212.43,0.0,69625271.0,441429147.0,16491919.5,774848041.6,19,29
4,Infrastructure to support zero-emission vehicle(s),1265107.83,-452383553.46,2249649.01,10881736.22,1241590.99,4046528.51,0.0,90934033.0,2067255898.0,49612423.0,2570415354.34,25,34
8,New zero-emission vehicles,22940.69,-43045091.03,4743018.64,21518762.42,759207.72,1309086.95,0.0,411156960.0,6497621200.98,102732493.0,5189093150.28,58,123
2,New transit related amenities,50872.82,-59379516.69,2247707.23,604755.18,24038.63,115757.76,0.0,121015245.0,2401306189.0,42217131.0,6389424102.78,67,141


## Geodataframe

In [99]:
district_list = list(df2['distr_'].unique())

In [100]:
df2[['lon', 'lat']] = df1['project_location'].str.split(' ', 1, expand=True)

In [101]:
geo_list = ['lon','lat']

In [102]:

for c in geo_list:
    df2[c] = df2[c].apply(pd.to_numeric, errors='coerce')
    

In [103]:
'''
for c in geo_list:
    df2[c] = (df2[c]
              .str.replace(",", "")
              .str.replace(";"," ")
             )
'''

'\nfor c in geo_list:\n    df2[c] = (df2[c]\n              .str.replace(",", "")\n              .str.replace(";"," ")\n             )\n'

In [104]:
#Subset for geodataframe
df3 = df2[['funding_year', 'distr_','project_name', 'project_type',
       'project_sub_type', 'project_sub_type_ii',
       'project_description__short_','lon',
       'lat','total_lctop_funds',
       'total_cci_funds', 'total_project_cost',]]

In [105]:
df3 = df3[(df3['lon'] != 0.00) & (df3['lat'] != 0.00)]

In [106]:
gdf1 = geography_utils.create_point_geometry(df3, 'lon','lat')

In [107]:
#gdf1.to_file(f"./test_gdf.geojson", driver="GeoJSON")