# First look/Clean Up

* Helpful Links
* [Priority Population/DAC](https://dot.ca.gov/programs/rail-and-mass-transportation/priority-populations-and-disadvantaged-communities)

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from siuba import *
from calitp import *
from shared_utils import geography_utils, utils
import difflib
from calitp.storage import get_fs
fs = get_fs()
import os

# Notebook display settings: show up to 100 columns, every row, full cell
# contents, and floats formatted with two decimal places.
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

In [None]:
# GCS folder that holds the LCTOP workbook and receives the exported files.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/lctop/"
# GeoJSON URL read with geopandas to build the district geodataframe.
Caltrans_shape = "https://gis.data.ca.gov/datasets/0144574f750f4ccc88749004aca6eb0c_0.geojson?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"
# Excel workbook inside GCS_FILE_PATH.
FILE_NAME = "LCTOP_allyears.xlsx"

In [None]:
# Read the "LCTOP_Projects" sheet of the workbook and pass it through to_snakecase
# (the rest of the notebook refers to the columns by their snake_case names).
df1 = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="LCTOP_Projects"))

## Checking out the raw data

In [None]:
df1.shape

In [None]:
df1.info()

In [None]:
# Percentage of missing values in each column
null_counts = df1.isnull().sum()
null_counts * 100 / len(df1)

In [None]:
# Columns whose values get spot-checked to confirm they look as expected
value_count_cols = [
    "project_type",
    "lead_agency",
    "distr_",
    "project_name",
    "project_id#",
    "project_sub_type_ii",
]

In [None]:
# https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
'''
for column in df1[value_count_cols]:
    print("\n" + column)
    print(df1[column].value_counts())
    
''' 

In [None]:
# Print the number of distinct values in each column of interest (raw data)
for col_name in df1[value_count_cols]:
    print(f"\n{col_name}")
    print(df1[col_name].nunique())

### Some of the monetary columns aren't filled - double check 
* Recalculate the percentage column: LCTOP % of total project funds
* Recalculate the total project request (PUC 99314 + PUC 99313) and check that it equals the provided total
    * Yes, they match.

In [None]:
# Dollar-amount (and percentage) columns that get cross-checked against each other
monetary_cols = [
    "puc_99313_funds",
    "puc_99314_funds",
    "total_project_request_99314_+_99313",
    "total_lctop_funds",
    "total_cci_funds",
    "total_project_cost",
    "lctop_%_of_total_project_funds",
]

In [None]:
# Work on the monetary columns only, with missing amounts replaced by 0
subset = df1[monetary_cols].fillna(0)

In [None]:
# https://stackoverflow.com/questions/42405572/how-to-compare-two-columns-of-the-same-dataframe
# Flag, row by row, whether the total project request (PUC 99314 + 99313)
# equals the total LCTOP funds
request_matches_lctop = (
    subset['total_project_request_99314_+_99313'] == subset['total_lctop_funds']
)
subset['total_project_comp_to_total_LCTOP_funds'] = np.where(
    request_matches_lctop, 'same', 'different'
)

In [None]:
# Compare whether total LCTOP funds equal total CCI funds
lctop_matches_cci = subset['total_lctop_funds'] == subset['total_cci_funds']
subset['total_lctop_cci'] = np.where(lctop_matches_cci, 'same', 'different')

In [None]:
# Recompute the total project request from its two PUC components and check
# it against the total that was provided in the workbook
subset['99314_and_99313'] = subset['puc_99313_funds'] + subset['puc_99314_funds']
recalc_matches_provided = (
    subset['99314_and_99313'] == subset['total_project_request_99314_+_99313']
)
subset['my_calc_vs_og_cal'] = np.where(recalc_matches_provided, 'same', 'different')

In [None]:
# Compare the provided total project request with total CCI funds
request_matches_cci = (
    subset['total_project_request_99314_+_99313'] == subset['total_cci_funds']
)
subset['CCI_99313_99314'] = np.where(request_matches_cci, 'same', 'different')

In [None]:
# Tally the 'same' / 'different' flags produced by each comparison
value_count_cols_2 = [
    'total_project_comp_to_total_LCTOP_funds',
    'total_lctop_cci',
    'my_calc_vs_og_cal',
    'CCI_99313_99314',
]
for flag_col in subset[value_count_cols_2]:
    print(f"\n{flag_col}")
    print(subset[flag_col].value_counts())

In [None]:
subset.sample(5)

In [None]:
#subset.loc[subset['my_calc_vs_og_cal'] == 'different'] 

## Clean Up
* Make sure columns are the right data type
* Find which col represents allocated amounts


### Drop rows with a ton of N/As

In [None]:

# Drop rows that are missing any of the identifying fields.
# .copy() gives df2 its own data, so the column assignments made to df2
# afterwards do not raise SettingWithCopyWarning or depend on df1.
df2 = df1.dropna(subset=['lead_agency', 'project_id#','project_name', 'distr_']).copy()

In [None]:
f'The original dataframe is {len(df1)}  rows long but after dropping some rows with missing values, the dataframe is {len(df2)} rows long.'

In [None]:
df2.head(1)

In [None]:
f"There are {df2['project_id#'].nunique()} unique project IDS"

### Change Agency Names
* Some agencies have multiple spellings of their names; clean them up.

In [None]:
# Many of the same agencies are spelled slightly different ways, so normalize
# the names: keep the text before any "(", strip non-letter characters, fix
# known typos and remove generic suffixes.
# Rewrite later
# https://stackoverflow.com/questions/24554723/str-replace-for-multiple-value-replacement
df2['lead_agency'] = (df2['lead_agency']
                      .str.split("(")
                      .str[0]
                      # This pattern is a regular expression: say so explicitly
                      # (pandas >= 2.0 treats patterns as literals by default)
                      # and use a raw string so "\s" is not an invalid escape.
                      .str.replace(r'[^A-Za-z\s]+', '', regex=True)
                      .str.replace("Publlic", "Public", regex=False)
                      .str.replace("Regional Transit Authority", "", regex=False)
                      .str.replace("Agency", "", regex=False)
                      .str.replace("Commision", "Commission", regex=False)
                      .str.replace("Division", "", regex=False)
                      # NOTE(review): commas were already removed by the regex
                      # above, so this split no longer changes anything.
                      .str.split(",")
                      .str[0]
                      # Removing words can leave runs of spaces inside a name;
                      # collapse them so identical agencies compare equal.
                      .str.replace(r'\s+', ' ', regex=True)
                      .str.strip()
                     )

In [None]:
# Map the remaining spelling variants onto a single canonical agency name.
df2["lead_agency"] = df2["lead_agency"].replace(
    {
        'Stanislaus County Public Works   Transit': 'Stanislaus County Public Works Transit',
        'Stanislaus County Public Works  Transit':  'Stanislaus County Public Works Transit',
        'Stanislaus County Public WorksTransit':  'Stanislaus County Public Works Transit',
        'Victor ValleyTransit Authority': 'Victor Valley Transit Authority',
        'YubaSutter Transit Authority':'Yuba Sutter Transit Authority',
        'YubaSutter Transit':'Yuba Sutter Transit Authority',
        # Previously mapped to itself (double space kept), which was a no-op
        'Plumas County Transportation  Commission': 'Plumas County Transportation Commission',
        'Modoc Transportation':'Modoc County Transportation Commission',
        'Los Angeles County Metoropolitan Transportation Authority': 'Los Angeles County Metropolitan Transportation Authority',
        # The target was misspelled "Agnecy"; also fold any rows that already
        # carry that misspelling into the corrected name.
        'Calaveras Transit': 'Calaveras Transit Agency',
        'Calaveras Transit Agnecy': 'Calaveras Transit Agency',
    }
)

In [None]:

df2['lead_agency'].nunique()

In [None]:
#df2['lead_agency'].sort_values().unique()

### Change additional values
* Some values are spelled inconsistently (YES/yes/no/No, etc.), so lowercase all the characters

In [None]:
# Text columns holding yes/no style values (plus the project status) that need
# their spelling normalized. Each column is listed once: a repeated name would
# be cleaned twice and yield duplicate columns when selecting df2[boolean_cols].
boolean_cols = [
    'agency_service_area_has_a_dac',
    'does_project_benefit_an_ab_1550_dac',
    'status',
    'qualifying_1_2_mile_low_income_buffer_',
    'ab_1550_low_income_community__household',
]

In [None]:
# Trim and lowercase every value, and spell "close" as "closed".
for i in boolean_cols:
    df2[i] = (df2[i]
               .str.strip()
               .str.lower()
               # Only add the "d" where it is missing; this replaces the old
               # two-step 'close' -> 'closed' -> fix 'closedd' workaround.
               .str.replace(r'close(?!d)', 'closed', regex=True)
                        )

### Change Counties 

In [None]:
# Remove the word "County" and the whitespace it leaves behind
# (e.g. a trailing space after the county name).
df2['county'] = (df2['county']
                 .str.replace('County', '', regex=False)
                 .str.strip()
                )


### Change data types

In [None]:
# Remove columns that are not needed, especially those with a very low
# percentage of populated values
df2 = df2.drop(
    [
        'count',
        '#',
        'column3',
        'column4',
        'column5',
        'other_state_policies,_plans,_or_initiatives',
        'describe_policies,_plans,_or_initiatives',
        '#2',
        '_d',
        'contact_name',
        'contact_phone_#',
        'contact_e_mail',
        'authorized_agent_name',
        'authorized_agent_title',
        'project_description__short_',
        'project_sub_type',
    ],
    axis="columns",
)

In [None]:
# Parse the date columns value by value; anything unparseable becomes NaT
date_columns = ['qm_tool__date_', 'completion_date','start_date']

for date_col in date_columns:
    df2[date_col] = df2[date_col].apply(
        lambda value: pd.to_datetime(value, errors='coerce')
    )
    

In [None]:
# Use 2100-01-01 as the placeholder for dates that are missing
missing_date = pd.to_datetime('2100-01-01')
for date_col in date_columns:
    filled_dates = df2[date_col].fillna(missing_date)
    df2[date_col] = filled_dates.apply(pd.to_datetime)

In [None]:
# Coerce columns from object to float; values that cannot be parsed become NaN
float_columns = ['ridership_increase','fossil_fuel_use_reduction__transportation_']

for c in float_columns:
    # pd.to_numeric is vectorized: convert the whole column in one call
    # instead of applying it to every element separately.
    df2[c] = pd.to_numeric(df2[c], errors='coerce')
    

In [None]:
# Fill remaining nulls according to each column's dtype: 0.0 for float64
# columns and the string 'None' for object columns.
# The literal 'None' is what the project_location filter later in the notebook checks for.
df2 = df2.fillna(df2.dtypes.replace({'float64': 0.0, 'object': 'None'}))

### Check negative values

In [None]:
# Reduction / generation columns that are checked for negative values
env_cols = [
    "vmt_reduction",
    "ghg_reduction__mtco2e_",
    "diesel_pm_reductions__lbs_",
    "nox_reductions__lbs_",
    "pm_2_5_reductions__lbs_",
    "reactive_organic_gas_reduction__lbs_",
    "fossil_fuel_use_reduction__transportation_",
    "fossil_fuel_use_reduction__energy_",
    "renewable_energy_generation__kwh_",
]

In [None]:
subset_env = df2[env_cols]

In [None]:
subset_env.lt(0).sum()

In [None]:
# Print the number of distinct values in each column of interest (cleaned data)
for col_name in df2[value_count_cols]:
    print(f"\n{col_name}")
    print(df2[col_name].nunique())

### Save it

In [None]:
'''
with pd.ExcelWriter(f"{GCS_FILE_PATH}LCTOP_cleaned.xlsx") as writer:
    df2.to_excel(writer, sheet_name="cleaned", index=False)
 '''

## Initial insights

### [Disadvantaged Communities](https://dot.ca.gov/programs/rail-and-mass-transportation/priority-populations-and-disadvantaged-communities)
* Most projects benefit DAC.
* Most agencies have a service area with DAC.

In [None]:
# Count how often each value occurs in every column listed in boolean_cols
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html
result = {
    col_name: df2[col_name].value_counts() for col_name in df2[boolean_cols]
}

In [None]:
# One column per checked field, one row per distinct value; values that never
# occur in a field count as 0.
counts_df = pd.DataFrame.from_dict(result).fillna(0)

# Convert counts to shares of each column's total. The totals come from
# counts_df: the previous version referenced result_df inside its own
# definition, which raises NameError on a fresh kernel.
result_df = (counts_df
             .div(counts_df.sum(axis=0), axis=1)
             # The distinct values live in the index (there is no 'index'
             # column to rename), so label the index itself.
             .rename_axis('Boolean Values')
            )



In [None]:
result_df.round(2).style.background_gradient()

### Summary Tables

In [None]:
#Clean column titles
def cols_cleanup(df):
    """
    Turn snake_case column names into display titles.

    Underscores become spaces, every word is title-cased and surrounding
    whitespace is stripped (e.g. 'funding_year' -> 'Funding Year').

    df: pandas.DataFrame whose column labels are strings.

    The columns are renamed in place and the same dataframe is returned.
    """
    df.columns = (df.columns
                  # Literal underscore; regex=False keeps this correct no
                  # matter what the pandas default for `regex` is.
                  .str.replace('_', ' ', regex=False)
                  .str.title()
                  .str.strip()
                 )
    return df

In [None]:
# Columns that are summed in the summary tables
sum_cols = [
    "funds_to_benefit_dac",
    "total_project_request_99314_+_99313",
    "total_project_cost",
    "vmt_reduction",
    "ghg_reduction__mtco2e_",
    "diesel_pm_reductions__lbs_",
    "nox_reductions__lbs_",
    "pm_2_5_reductions__lbs_",
    "reactive_organic_gas_reduction__lbs_",
    "fossil_fuel_use_reduction__transportation_",
    "ridership_increase",
    "fossil_fuel_use_reduction__energy_",
    "renewable_energy_generation__kwh_",
]
# Columns for which the number of distinct values is reported
nunique_cols = ["project_id#", "lead_agency"]

In [None]:
# Summary by funding year: distinct projects/agencies plus the summed metrics
funding_year_summary = cols_cleanup(
    geography_utils.aggregate_by_geography(
        df2,
        group_cols=['funding_year'],
        nunique_cols=nunique_cols,
        sum_cols=sum_cols,
    )
)

# Order by year and give the count columns readable headers
funding_year_summary = funding_year_summary.sort_values('Funding Year').rename(
    columns={'Lead Agency': '# of Agencies', 'Project Id#': '# of Projects'}
)

In [None]:
funding_year_summary

In [None]:
# Summary by district: distinct projects/agencies plus the summed metrics
district_summary = cols_cleanup(
    geography_utils.aggregate_by_geography(
        df2,
        group_cols=['distr_'],
        nunique_cols=nunique_cols,
        sum_cols=sum_cols,
    )
)

# Order by district and give the count columns readable headers
district_summary = district_summary.sort_values('Distr').rename(
    columns={'Lead Agency': '# of Agencies', 'Project Id#': '# of Projects'}
)
district_summary

In [None]:
# Summary by project sub type: distinct projects/agencies plus the summed metrics
project_type_summary = cols_cleanup(
    geography_utils.aggregate_by_geography(
        df2,
        group_cols=['project_sub_type_ii'],
        nunique_cols=nunique_cols,
        sum_cols=sum_cols,
    )
)

# NOTE(review): this table is ordered by the agency count ('Lead Agency'),
# not by the grouping column like the other summaries; confirm that is intended.
project_type_summary = project_type_summary.sort_values('Lead Agency').rename(
    columns={'Lead Agency': '# of Agencies', 'Project Id#': '# of Projects'}
)
project_type_summary

## Geodataframe

In [None]:
# Read the shapes from the Caltrans_shape URL and reproject them to EPSG:4326
district_shapes = gpd.read_file(Caltrans_shape)
geojson = district_shapes.to_crs(epsg=4326)
# Keep only the columns of interest
geojson = geojson[["DISTRICT", "Shape_Length", "Shape_Area", "geometry"]]

In [None]:
# Attach the district summary to the district shapes; districts present in
# only one of the two tables are dropped (inner join).
# NOTE(review): assumes "DISTRICT" and "Distr" hold comparable values/dtypes; confirm.
gdf1 = geojson.merge(
    district_summary,
    how="inner",
    left_on="DISTRICT",
    right_on="Distr",
)
  

In [None]:
# Map of the number of projects per district, drawn without axes
projects_ax = gdf1.plot(figsize=(12, 6), column="# of Projects", legend=True)
projects_ax.axis("off")

In [None]:
# Map of the total project request per district, drawn without axes
request_ax = gdf1.plot(
    figsize=(12, 6), column="Total Project Request 99314 + 99313", legend=True
)
request_ax.axis("off")

### Lat Lon

In [None]:
# save to GCS
def geojson_gcs_export(gdf, GCS_FILE_PATH, FILE_NAME):
    """
    Save a geodataframe as a local GeoJSON file, upload it to the GCS
    bucket, then delete the local file.

    gdf: geopandas.GeoDataFrame
    GCS_FILE_PATH: str. Destination folder, including the trailing slash.
        Ex: gs://calitp-analytics-data/data-analyses/my-folder/
    FILE_NAME: str. Filename without the ".geojson" extension.

    Uses the module-level `fs` filesystem object for the upload.
    """
    local_path = f"./{FILE_NAME}.geojson"
    gdf.to_file(local_path, driver="GeoJSON")
    try:
        fs.put(local_path, f"{GCS_FILE_PATH}{FILE_NAME}.geojson")
    finally:
        # Clean up the temporary file even when the upload fails
        os.remove(local_path)

In [None]:
# Export the merged district geodataframe to the LCTOP folder in GCS
geojson_gcs_export(
    gdf=gdf1,
    GCS_FILE_PATH="gs://calitp-analytics-data/data-analyses/lctop/",
    FILE_NAME="lctop_geojson",
)

In [None]:
# Keep only the rows that have a project location ('None' is the placeholder
# used for missing text values). .copy() makes df3 independent of df2 so new
# columns can be added to it without SettingWithCopyWarning.
df3 = df2.loc[df2["project_location"] != 'None'].copy()

In [None]:
# Split the location text on its first space into two columns.
# Read it from df3 itself (not df1), and pass `n` by keyword: pandas >= 2.0
# no longer accepts it positionally.
# NOTE(review): assumes the first token is the longitude; confirm against the data.
df3[['lon', 'lat']] = df3['project_location'].str.split(' ', n=1, expand=True)

In [None]:
# Strip separator characters from the coordinate strings.
# The previous version had a string literal broken across several lines
# (a SyntaxError); NOTE(review): it is assumed the intent was to delete ";".
geo_list = ['lon','lat']
for c in geo_list:
    df3[c] = (df3[c]
              .str.replace(",", "", regex=False)
              .str.replace(";", "", regex=False)
              .str.strip()
             )

In [None]:
df3.head(2)

In [None]:
#gdf2 = geography_utils.create_point_geometry(df3, 'lon','lat')

In [None]:
# Read the organizations CSV export from GCS and pass it through to_snakecase.
# NOTE(review): the folder in this path is "5311 " with a trailing space; confirm it matches the bucket.
airtable = to_snakecase(pd.read_csv('gs://calitp-analytics-data/data-analyses/5311 /organizations-AllOrganizations_1.csv'))


In [None]:
airtable.head(2)