# First look

* [Priority Population/DAC](https://dot.ca.gov/programs/rail-and-mass-transportation/priority-populations-and-disadvantaged-communities)

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp import *
from calitp.storage import get_fs
from shared_utils import geography_utils, utils

fs = get_fs()
import os

# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

# Scripts
import A1_data_prep as data_prep

In [None]:
# GCS folder for this analysis and the name of the Excel workbook read below.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/lctop/"
FILE_NAME = "LCTOP_allyears.xlsx"

In [None]:
# Original df: the "LCTOP_Projects" sheet of the workbook, with its column
# names passed through to_snakecase.
df1 = pd.read_excel(
    GCS_FILE_PATH + FILE_NAME, sheet_name="LCTOP_Projects"
).pipe(to_snakecase)

In [None]:
# Cleaned df, as returned by clean_lctop() in the A1_data_prep script.
df2 = data_prep.clean_lctop()

In [None]:
# district_fy_summary() returns two objects; the first is used further down for
# district-level totals.
# NOTE(review): gdf_dist is presumably a GeoDataFrame — confirm in A1_data_prep.
district_summary, gdf_dist = data_prep.district_fy_summary()

## Checking out the raw data

In [None]:
# (rows, columns) of the raw df
df1.shape

In [None]:
# Percentage of null values in each column of the raw df
df1.isnull().sum().mul(100).div(len(df1))

In [None]:
# Columns whose values are spot-checked below (value counts / number of unique
# values) to confirm they look as expected.
value_count_cols = [
    "project_type",
    "lead_agency",
    "distr_",
    "project_name",
    "project_id#",
    "project_sub_type_ii",
]

In [None]:
# Disabled: prints the full value_counts for each column. The loop is wrapped
# in a string literal so that it does not run.
# https://stackoverflow.com/questions/32589829/how-to-get-value-counts-for-multiple-columns-at-once-in-pandas-dataframe
"""
for column in df1[value_count_cols]:
    print("\n" + column)
    print(df1[column].value_counts())
    
"""

In [None]:
# Number of distinct values in each spot-check column of the raw df
for column in df1[value_count_cols].columns:
    print(f"\n{column}")
    print(df1[column].nunique())

In [None]:
#Check that there aren't multiple spellings of the same agency
#df1['lead_agency'].sort_values().unique()

### Double checking monetary columns


In [None]:
# Funding-related columns of the raw df that are cross-checked against each
# other in the cells below.
monetary_cols = [
    "puc_99313_funds",
    "puc_99314_funds",
    "total_project_request_99314_+_99313",
    "total_lctop_funds",
    "total_cci_funds",
    "total_project_cost",
    "lctop_%_of_total_project_funds",
]

In [None]:
# Work on the monetary columns only, treating missing amounts as 0
subset = df1.loc[:, monetary_cols].fillna(0)

In [None]:
# https://stackoverflow.com/questions/42405572/how-to-compare-two-columns-of-the-same-dataframe
# Flag whether the provided 99314 + 99313 project request total matches the
# total LCTOP funds on each row
subset["total_project_comp_to_total_LCTOP_funds"] = np.where(
    subset["total_project_request_99314_+_99313"].eq(subset["total_lctop_funds"]),
    "same",
    "different",
)

In [None]:
# Compare if total LCTOP funds equal total CCI funds
subset["total_lctop_cci"] = np.where(
    subset["total_lctop_funds"].eq(subset["total_cci_funds"]),
    "same",
    "different",
)

In [None]:
# Recompute puc_99313_funds + puc_99314_funds and check the result against the
# total_project_request_99314_+_99313 value supplied in the raw data.
# The recomputed value comes from floating-point addition, so exact equality
# can report rounding noise as "different". Compare within an absolute
# tolerance instead.
# NOTE(review): atol=0.005 assumes amounts are in dollars (half a cent) —
# confirm the units and adjust if needed.
subset["99314_and_99313"] = subset["puc_99313_funds"] + subset["puc_99314_funds"]
subset["my_calc_vs_og_cal"] = np.where(
    np.isclose(
        subset["99314_and_99313"],
        subset["total_project_request_99314_+_99313"],
        rtol=0,
        atol=0.005,
    ),
    "same",
    "different",
)

In [None]:
# Flag whether the provided 99314 + 99313 project request total matches the
# total CCI funds on each row
subset["CCI_99313_99314"] = np.where(
    subset["total_project_request_99314_+_99313"].eq(subset["total_cci_funds"]),
    "same",
    "different",
)

In [None]:
# Tally how many rows came out "same" vs "different" for each comparison column
value_count_cols_2 = [
    "total_project_comp_to_total_LCTOP_funds",
    "total_lctop_cci",
    "my_calc_vs_og_cal",
    "CCI_99313_99314",
]
for column in subset[value_count_cols_2].columns:
    print(f"\n{column}")
    print(subset[column].value_counts())

In [None]:
# subset.loc[subset['my_calc_vs_og_cal'] == 'different']

### Check negative values

In [None]:
# Reduction/generation columns of the raw df that are checked for negative
# values below.
env_cols = [
    "vmt_reduction",
    "ghg_reduction__mtco2e_",
    "diesel_pm_reductions__lbs_",
    "nox_reductions__lbs_",
    "pm_2_5_reductions__lbs_",
    "reactive_organic_gas_reduction__lbs_",
    "fossil_fuel_use_reduction__transportation_",
    "fossil_fuel_use_reduction__energy_",
    "renewable_energy_generation__kwh_",
]

In [None]:
# Raw df restricted to the environmental columns
subset_env = df1.loc[:, env_cols]

In [None]:
# Number of negative entries per environmental column
(subset_env < 0).sum()

In [None]:
# Repeat the nunique check on the cleaned df. Loop over the column list itself
# so this cell reads only df2, instead of selecting the columns from the raw
# df1 and then looking them up in df2.
for column in value_count_cols:
    print("\n" + column)
    print(df2[column].nunique())

### Other steps
* Double check that agencies are only spelled in ONE way, not in multiple ways.
* Change any yes/no open/close columns to lower and strip spaces, so there aren't different options.
* Removed county.
* Make sure all cols are the same data type

In [None]:
#Check values manually 
#df2['lead_agency'].sort_values().unique()

## Initial insights

### How are projects benefitting DAC, are they finished or in progress, how many null values are present, etc?
* [Disadvantaged Communities](https://dot.ca.gov/programs/rail-and-mass-transportation/priority-populations-and-disadvantaged-communities)
* Most projects benefit DAC.
* Most agencies have a service area with DAC.

In [None]:
# Value counts for every boolean column of the cleaned df, keyed by column name
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html
# NOTE(review): boolean_cols is not defined in the visible part of this
# notebook — confirm where it comes from before running this cell.
result = {col: df2[col].value_counts() for col in df2[boolean_cols]}

In [None]:
# One column per boolean column, one row per distinct value; missing
# combinations count as 0.
result_df = pd.DataFrame.from_dict(result)
result_df = result_df.fillna(0).rename(columns={"index": "Boolean Values"})

# Turn counts into shares by dividing every column by its own total
result_df = result_df / result_df.sum()

In [None]:
# Show the shares rounded to 2 decimals, shaded with a background gradient
result_df.round(2).style.background_gradient()

### Summary Tables

In [None]:
# Columns passed as sum_cols to aggregate_by_geography in the summaries below
sum_cols = [
    "funds_to_benefit_dac",
    "total_project_request_99314_+_99313",
    "total_project_cost",
    "vmt_reduction",
    "ghg_reduction__mtco2e_",
    "diesel_pm_reductions__lbs_",
    "nox_reductions__lbs_",
    "pm_2_5_reductions__lbs_",
    "reactive_organic_gas_reduction__lbs_",
    "fossil_fuel_use_reduction__transportation_",
    "ridership_increase",
    "fossil_fuel_use_reduction__energy_",
    "renewable_energy_generation__kwh_",
]
# Columns passed as nunique_cols (later renamed "# of Projects" / "# of Agencies")
nunique_cols = ["project_id#", "lead_agency"]

In [None]:
# Aggregate the cleaned df by funding year
funding_year_summary = geography_utils.aggregate_by_geography(
    df2,
    group_cols=["funding_year"],
    nunique_cols=nunique_cols,
    sum_cols=sum_cols,
)

# Tidy the column names, order by year, and label the nunique columns
funding_year_summary = cols_cleanup(funding_year_summary)
funding_year_summary = funding_year_summary.sort_values("Funding Year")
funding_year_summary = funding_year_summary.rename(
    columns={"Lead Agency": "# of Agencies", "Project Id#": "# of Projects"}
)

In [None]:
# Display the summary by funding year
funding_year_summary

In [None]:
# Aggregate the cleaned df by project sub type
project_type_summary = geography_utils.aggregate_by_geography(
    df2,
    group_cols=["project_sub_type_ii"],
    nunique_cols=nunique_cols,
    sum_cols=sum_cols,
)

# Tidy the column names, order by the agency count, and label the nunique columns
project_type_summary = cols_cleanup(project_type_summary)
project_type_summary = project_type_summary.sort_values("Lead Agency")
project_type_summary = project_type_summary.rename(
    columns={"Lead Agency": "# of Agencies", "Project Id#": "# of Projects"}
)

In [None]:
# Total of the per-project-type project counts
project_type_summary['# of Projects'].sum()

In [None]:
# Total funds to benefit DAC across project types
project_type_summary['Funds To Benefit Dac'].sum()

In [None]:
# Same total from the district summary
# NOTE(review): presumably a cross-check against the project-type total — confirm.
district_summary['Funds To Benefit Dac'].sum()

In [None]:
# Total of the per-district project counts
district_summary['# of Projects'].sum()

## Lat Lon

In [None]:
# Get only values w/ lon and lat. Take an explicit copy so that the columns
# added to df3 afterwards are written to an independent frame rather than to a
# slice of df2 (avoids pandas' SettingWithCopyWarning).
df3 = df2.loc[df2["project_location"] != "None"].copy()

In [None]:
# Keep only the first entry of a ";"-separated location string. Read from
# df3's own column so the result cannot be misaligned with the raw df1, whose
# index is not guaranteed to match the cleaned/filtered frame.
df3["project_location_2"] = df3["project_location"].str.split(";").str[0]

In [None]:
# Split the location string on its first comma into two numeric columns.
# Pieces that cannot be parsed as numbers (or are missing) become 0.
# `n` is passed by keyword because pandas 2.0 made it keyword-only in
# Series.str.split; the positional form raises a TypeError there.
df3[["lon", "lat"]] = (
    df3["project_location_2"]
    .str.split(",", n=1, expand=True)
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
)

In [None]:
# Drop rows where either coordinate is 0, then report how many rows remain
df3 = df3[df3["lon"].ne(0) & df3["lat"].ne(0)]
len(df3)

In [None]:
# NOTE(review): gdf2 is not defined anywhere in the visible part of this
# notebook (only gdf_dist is) — confirm which object was meant.
type(gdf2)