# Exploring 10 Year Non Shopp Data
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [41]:
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *
from shared_utils import portfolio_utils

In [42]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [43]:
import _utils

In [44]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

## Investment Plan

In [45]:
# Open first sheet.
df_investment_plan = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan Project List 11-7-22.xlsx")
)



### Look at null values for each column

### Some Cleaning

In [46]:
# Clean up monetary columns
df_investment_plan["total_project_cost__$1,000_"] = df_investment_plan[
    "total_project_cost__$1,000_"
].replace(
    {
        "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
        "$85,738 (Cycle 2)": 85738,
        "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
        "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
        "$-": 0,
        "TBD": 0,
    }
)

In [47]:
# Fill in total project cost with fake number and cast to float64
df_investment_plan["total_project_cost__$1,000_"] = (
    df_investment_plan["total_project_cost__$1,000_"].astype("float64").fillna(888888.0)
)

In [48]:
# Coerce dates to right format
date_cols = [
    "pid_approval_date",
    "rtl_date",
    "con_start_date",
]
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(pd.to_datetime, errors="coerce")

In [49]:
# Fill na based on types
df_investment_plan = df_investment_plan.fillna(
    df_investment_plan.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [50]:
# Titlecase string columns
string_cols = [
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "urban_rural",
    "current_phase",
    "project_name",
    "potential_funding_program_s_",
]

In [51]:
for c in string_cols:
    df_investment_plan[c] = df_investment_plan[c].str.title().str.strip()

In [52]:
# Correct some curent phase values that read in weirdly
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    {
        "ps&e\nr/w": "ps&e",
        "other\nplan/concept": "other\plan\concept",
        "pid\nplan/concept": "pid\plan\concept",
        "ps&e\nrow": "ps&e\row",
    }
)

# Upper case phases & counties
uppercase_cols = ["current_phase", "county"]
for c in uppercase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.upper().str.strip()

In [53]:
# Clean up counties
def count_by_commas(df, col_to_count: str, new_col_name:str): 
    df[new_col_name] = (
    df[col_to_count]
    .apply(lambda x: len(x.split(",")))
    .astype("int64")) 
    return df 

In [54]:
# Count number of counties this project crosses
df_investment_plan = count_by_commas(df_investment_plan, 'county','number_of_counties')

In [55]:
# New column that summarizes whether a project is in one or more counties
def various_counties(row):
    if row["number_of_counties"] == 1:
        return "One County"
    else:
        return "Multiple Counties"

In [56]:
df_investment_plan['various_or_single_county'] = df_investment_plan.apply(lambda x: various_counties(x), axis=1)

In [57]:
df_investment_plan.head(1)

Unnamed: 0,ea,ppno,ct_project_id,district,county,route,beg_pm,end_pm,urban_rural,project_name,project_description,hq_priority,district_priority,previous_caltrans_nominations,primary_mode,secondary_mode_s_,shs_capacity_increase_detail,potential_funding_program_s_,notes,lead_agency,current_phase,pid_approval_date,target_pa_ed,rtl_date,con_start_date,funding_need_phase_s_,"total_project_cost__$1,000_","pa_ed_cost__$1,000_","ps_e_cost__$1,000_","row_cost__$1,000_","con_cost__$1,000_","non_infrastructure___plan_cost__$1,000_","total_unfunded_need__$1,000_",previous_funding_request,previous_funding_request_phase,last_scored,mode_shift__csis_,vmt__csis_,dac_local_community_needs__csis_,public_engagement__csis_,safety__csis_,zev__csis_,climate_resiliency__csis_,natural_resources_and_ecosystems__csis_,infill_development_and_land_use__csis_,csis_total_score__out_of_45_,csis_alignment,benefits_to_dac_and_advancing_equity__atp_,community_need__atp_,safety__atp_,public_participation__atp_,community_feedback__atp_,continued_engagement__atp_,context_sensitive_and_innovation__atp_,transformative__atp_,atp_total_score__out_of_100_,atp_alignment,access_alignment,_2023,_2024,_2025,_2026,_2027,_2028,_2029,_2030,_2031,_2032,_2033,number_of_counties,various_or_single_county
0,0C660,1095,113000023,1.0,DN,,24.4,R27.80,Rural,Crescent City Ada,"Traffic calming, non-motorized and multi-modal improvements. Enhance safety for pedestrians and non-motorized vehicles",0.0,0.0,SHOPP,Complete Streets,Bike/Pedestrian,,At Infra Invest\nAtp\nRaise,PID Completed,Caltrans,CLOSEOUT,datetime64[ns],,datetime64[ns],datetime64[ns],,4100.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Not Well-Aligned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Not Well-Aligned,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,One County


### Correct Projects that are in the wrong district

In [58]:
# Correct the LA project that is coded in district 8 mistakenly
# df_investment_plan[(df_investment_plan["full_county_name"] == 'Los Angeles') & (df_investment_plan['district'] == 8)]['project_name']

In [59]:
df_investment_plan.loc[(df_investment_plan["project_name"] == "Polb Terminal Island Wye Track Realignment"), "district"] = "7"

In [60]:
# Correct Kern from bishop 9 to fresno 6
# df_investment_plan[(df_investment_plan["full_county_name"] == 'Kern') & (df_investment_plan['district'] == 9)]['project_name'].unique()

In [61]:
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Sr 58 Truck Climbing Lanes Segment 2'), "district"] = "6"
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Mojave To Boron Freeway'), "district"] = "6"
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Sr 58/California City Blvd. Extension'), "district"] = "6"

In [62]:
df_investment_plan.groupby(['county', 'district']).agg({'ppno':'nunique'})

Unnamed: 0_level_0,Unnamed: 1_level_0,ppno
county,district,Unnamed: 2_level_1
ALA,4.0,2
ALA,75.0,1
"ALA, CC, SOL, YOL, SAC",75.0,1
"ALA, SCL",75.0,1
AMA,10.0,1
BUT,3.0,2
CAL,10.0,2
CC,4.0,2
CC,75.0,1
DN,1.0,2


### Duplicated PPNO?

In [63]:
# Check that each row represents a different project
# PPNO and Project ID have a lot less unique value b/c 70% and 56% of each col are not filled in
for i in ["ppno", "ct_project_id", "project_name"]:
    print(f"{i} : {df_investment_plan[i].nunique()}")

ppno : 204
ct_project_id : 300
project_name : 742


In [64]:
# One project is included twice
df_investment_plan["project_name"].value_counts().head()

Delano To Pixely 6-Lane                                                       2
Crescent City Ada                                                             1
Rte 55 Btw Sr 73 And I-5, Managed Lanes Project                               1
Reconnecting Grand Avenue "Complete Streets To Transit" And Safety Project    1
I-5 Btw Rte 73 And Sd Co, Managed Lanes Project                               1
Name: project_name, dtype: int64

In [65]:
# Check and make sure there aren't duplicate PPNO-Project Name combos
duplicate_ppno = (
    df_investment_plan[~df_investment_plan["ppno"].isin([0, "None"])]
    .groupby("ppno")
    .agg({"project_name": "count"})
    .sort_values("project_name")
    .reset_index()
)

In [66]:
# Filter out for ppno with more than 2 project names.
# Save those projects into a list
duplicate_ppno = duplicate_ppno[duplicate_ppno["project_name"] > 1]

In [67]:
duplicate_ppno_list = duplicate_ppno["ppno"].to_list()

In [68]:
# Check out these rows in our df
# are these duplicates?
len(df_investment_plan[df_investment_plan["ppno"].isin(duplicate_ppno_list)])

24

In [69]:
duplicate_projects_df = df_investment_plan[
    df_investment_plan["ppno"].isin(duplicate_ppno_list)
]

In [82]:
duplicate_projects_df[
    ["ppno", "project_name", "current_phase", "ea", "project_description"]
]

Unnamed: 0,ppno,project_name,current_phase,ea,project_description
159,2830,Us 101 Avila Beach Dr. Interchange,PS&E,1G480,"Construct operational improvements and a park-and-ride lot. A roundabout is proposed to be constructed at the intersection of the US 101 southbound ramps, Avila Beach Dr., and Shell Beach Rd. Operational and-or additional safety enhancements will be considered for the US 101 northbound ramps-Monte Rd. intersection."
162,2830,Us 101/ Avila Beach Rd I/C Improvement And Park-And-Ride Lot,PS&E\nR/W,1G480,Interchange operational improvements for US 101 SB ramps at Avila Beach Dr. at Shell Beach Rd. Construct Park-and-Ride Lot at southwest corner of Avila Beach Dr. and Shell Beach Rd near Pismo Beach
178,6288,South Fresno I/C Project,PA&ED,0H240,Modify interchanges
182,7036,Sr 180 West Extension,CANDIDATE,0Y230,"In Fresno County, near Mendota from I-5 to Route 33, construct 2-lane conventional highway on new alignment. This project would propose to construct new alignments and covert existing County roadways to a State Route."
184,7036,180 West Extension,PID,0Y230,Construct highway
189,6288,South Fresno State Route 99 Corridor Project,PA&ED,0H240,"In Fresno County, in and near Fresno, from 0.4 mile south of American Avenue to 0.4 mile north of North Avenue. Modify interchanges."
193,6694,Grapevine I/C,PA&ED,0R100,Relocate I/C
194,3386E,Sr 46 Gap Closure Segment 4C,PS&E/ROW,44256,"From Brown Material Rd to Farnsworth Ave, 2C to 4E"
203,6694,Grapevine Interchange,NONE,0R100,"In Kern County, near Grapevine from 0.7 mile south of the Grapevine UC to 0.5 mile south of the California Aqueduct, relocate interchange.\nThis project is mitigation to the Grapevine Specific and Community Plan EIR."
206,3386E,Sr-46 Expressway - Segment 4C (Kern)\nSr 46 Gap Closure Segment 4C,CON,44256,"44256 In and near Lost Hills, from 1 mile west of Browns Material Road to California Aquaduct. Convert from 2-lane conventional highway to 4-lane expressway.\n\nFrom Brown Material Road to Farnsworth Avenue, convert 2-lane conventional highway to a 4-lane expressway. Possibly drop if it is fully funded."


### Add Fake Values

In [74]:
df_investment_plan2 = df_investment_plan.copy()

In [80]:
df_investment_plan2.columns

Index(['ea', 'ppno', 'ct_project_id', 'district', 'county', 'route', 'beg_pm',
       'end_pm', 'urban_rural', 'project_name', 'project_description',
       'hq_priority', 'district_priority', 'previous_caltrans_nominations',
       'primary_mode', 'secondary_mode_s_', 'shs_capacity_increase_detail',
       'potential_funding_program_s_', 'notes', 'lead_agency', 'current_phase',
       'pid_approval_date', 'target_pa_ed', 'rtl_date', 'con_start_date',
       'funding_need_phase_s_', 'total_project_cost__$1,000_',
       'pa_ed_cost__$1,000_', 'ps_e_cost__$1,000_', 'row_cost__$1,000_',
       'con_cost__$1,000_', 'non_infrastructure___plan_cost__$1,000_',
       'total_unfunded_need__$1,000_', 'previous_funding_request',
       'previous_funding_request_phase', 'last_scored', 'mode_shift__csis_',
       'vmt__csis_', 'dac_local_community_needs__csis_',
       'public_engagement__csis_', 'safety__csis_', 'zev__csis_',
       'climate_resiliency__csis_', 'natural_resources_and_ecosystems_

#### Create fake metrics

In [75]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(365)

In [76]:
# Fake columns
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_trasnportation",
]

In [77]:
# Add fake metric columns
for i in fake_columns:
    df_investment_plan2[i] = np.round(
        np.random.uniform(0.0, 50.0, size=(len(df_investment_plan), 1)), 2
    )

#### Create fake current funds

In [78]:
# Create a column for fake funds requested - there only seems to be a total project cost col?
# df_investment_plan2["current_fake_fund_requested"] = np.round(
    # np.random.uniform(100000, 500000.0, size=(len(df_investment_plan), 1)), 2
# )

#### Create fake benefit score

In [83]:
# Create a fake benefit score
df_investment_plan2["fake_benefit_score"] = (
    (df_investment_plan2[fake_columns].sum(axis=1))
    / (df_investment_plan2['total_unfunded_need__$1,000_'])
) * 10000

In [84]:
# Create a fake statewide project rank
df_investment_plan2["statewide_rank"] = df_investment_plan2["fake_benefit_score"].rank(
    ascending=False
)

### Portfolio Prep

#### Add Agency Names

In [85]:
# Add agency names
agency_names = portfolio_utils.add_agency_name()

In [86]:
df_investment_plan2 = pd.merge(
    df_investment_plan2,
    agency_names,
    left_on="lead_agency",
    right_on="calitp_agency_name",
    how="left",
)

#### Add Full County Names & Remap Districts to avoid 75/74

In [87]:
# Map full county names & districts 
# To avoid certain counties being under 75/74/etc
df_county = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}full_counties.xlsx")
)

df_county = (df_county
             .rename(columns = {'county':'full_county_name',
                               'district':'district_not_original_df'})
            )

In [88]:
df_investment_plan2 = pd.merge(
    df_investment_plan2,
    df_county,
    left_on="county",
    right_on="abbrev_",
    how="left",
    indicator = True
)

In [89]:
df_investment_plan2["full_county_name"] = df_investment_plan2["full_county_name"].fillna("Various")

In [90]:
# Fill in nan values of district_not_original_df with original district values
df_investment_plan2['district_not_original_df'] = df_investment_plan2['district_not_original_df'].fillna(df_investment_plan2["district"])

In [102]:
# Double check results
# df_investment_plan2[['full_county_name','district','district_not_original_df']].sort_values('full_county_name').drop_duplicates()

#### Rename Districts to mimic portfolio

In [92]:
df_investment_plan2["district_not_original_df"] = df_investment_plan2["district_not_original_df"].astype("int64")

In [93]:
# Add official Caltrans District names
district_dictionary = {
    7: "07 - Los Angeles",
    4: "04 - Oakland",
    2: "02 - Redding",
    9: "09 - Bishop",
    10: "10 - Stockton",
    11: "11 - San Diego",
    3: "03 - Marysville",
    12: "12 - Irvine",
    8: "08 - San Bernardino",
    5: "05 - San Luis Obispo",
    6: "06 - Fresno",
    1: "01 - Eureka",
    75: "75 - HQ",
    74: "74 - HQ",
    0: "None",
}

In [94]:
df_investment_plan2["district_full_name"] = df_investment_plan2["district_not_original_df"].replace(
    district_dictionary
)

In [103]:
# Check and make sure counties - district relationship makes sense. 
# df_investment_plan2.groupby(['full_county_name', 'district_not_original_df']).agg({'project_name':'nunique'})

#### Drop unwanted columns

In [96]:
# Del old district, rename district_not_original_df district name 
df_investment_plan2 = (df_investment_plan2.drop(columns = ['abbrev_', '_merge', 'district'])
                       .rename(columns = {'district_not_original_df':'district'})
                      ) 


In [97]:
# Create a fake district project rank for fake benefit score
df_investment_plan2["district_rank"] = df_investment_plan2.groupby("district")[
    "fake_benefit_score"
].rank(ascending=False)

### Some more checks 

In [98]:
df_investment_plan2.project_name.nunique()

742

In [99]:
len(df_investment_plan2)

743

In [104]:
df_investment_plan2.district.value_counts().sum()

743

#### Save

In [105]:
df_investment_plan2.to_excel(f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name='fake', index= False)