# Exploring 10 Year Non Shopp Data
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [31]:
import numpy as np
import pandas as pd
from calitp import *
from babel.numbers import format_currency

In [32]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [33]:
import _utils

In [34]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

## Investment Plan

In [35]:
# Open first sheet.
df_investment_plan = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan.xlsx")
)



### Look at null values for each column

In [36]:
df_investment_plan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717 entries, 0 to 716
Data columns (total 58 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   ppno                                     217 non-null    object 
 1   ct_project_id                            312 non-null    object 
 2   district                                 713 non-null    float64
 3   county                                   715 non-null    object 
 4   route                                    713 non-null    object 
 5   beg_pm                                   330 non-null    object 
 6   end_pm                                   329 non-null    object 
 7   urban_rural                              631 non-null    object 
 8   project_name                             717 non-null    object 
 9   ea                                       333 non-null    object 
 10  current_phase                            396 non-n

### Some Cleaning

In [37]:
# Clean up monetary columns
df_investment_plan["total_project_cost__$1,000_"] = df_investment_plan[
    "total_project_cost__$1,000_"
].replace(
    {
        "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
        "$85,738 (Cycle 2)": 85738,
        "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
        "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
        "$-": 0,
        "TBD": 0,
    }
)

In [38]:
# Fill in total project cost with fake number and cast to float64
df_investment_plan["total_project_cost__$1,000_"] = (
    df_investment_plan["total_project_cost__$1,000_"]
    .astype("float64")
    .fillna(888888.0)
)

In [39]:
# Coerce dates to right format
date_cols = [
    "pid_approval_date",
    "rtl_date",
    "con_start_date",
]
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(pd.to_datetime, errors="coerce")

In [40]:
# Fill na based on types
df_investment_plan = df_investment_plan.fillna(
    df_investment_plan.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [41]:
# Titlecase string columns
string_cols = [
    "county",
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "urban_rural",
    "current_phase",
    "project_name",
    "potential_funding_program_s_",
]

In [42]:
for c in string_cols:
     df_investment_plan[c] = df_investment_plan[c].str.title().str.strip()

In [43]:
# Correct some curent phase values that read in weirdly
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    {
        "ps&e\nr/w": "ps&e",
        "other\nplan/concept": "other\plan\concept",
        "pid\nplan/concept": "pid\plan\concept",
        "ps&e\nrow": "ps&e\row",
    }
)

# Upper case everything
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].str.strip().str.upper()

### Duplicated PPNO?

In [44]:
# Check that each row represents a different project
# PPNO and Project ID have a lot less unique value b/c 70% and 56% of each col are not filled in
for i in ["ppno", "ct_project_id", "project_name"]:
    print(f"{i} : {df_investment_plan[i].nunique()}")

ppno : 204
ct_project_id : 300
project_name : 716


In [45]:
# One project is included twice
df_investment_plan["project_name"].value_counts().head()

Delano To Pixely 6-Lane                                                                        2
I-5 Btw Rte 73 And Sd Co, Managed Lanes Project                                                1
San Diego - Coronado Bay Bridge Suicide Deterrent Project                                      1
Linking Neighborhoods Of La Jolla Reservation With Bike/Walking Trail On The State Route 76    1
Sr-78 Hov/Managed Lanes                                                                        1
Name: project_name, dtype: int64

In [46]:
# Check and make sure there aren't duplicate PPNO-Project Name combos
duplicate_ppno = (
    df_investment_plan[~df_investment_plan["ppno"].isin([0, "None"])]
    .groupby("ppno")
    .agg({"project_name": "count"})
    .sort_values("project_name")
    .reset_index()
)

In [47]:
# Filter out for ppno with more than 2 project names.
# Save those projects into a list
duplicate_ppno = duplicate_ppno[duplicate_ppno["project_name"] > 1]

In [48]:
duplicate_ppno_list = duplicate_ppno["ppno"].to_list()

In [49]:
# Check out these rows in our df
# are these duplicates?
len(df_investment_plan[df_investment_plan["ppno"].isin(duplicate_ppno_list)])

24

In [50]:
duplicate_projects_df = df_investment_plan[
    df_investment_plan["ppno"].isin(duplicate_ppno_list)
]

In [51]:
duplicate_projects_df[
    ["ppno", "project_name", "current_phase", "ea", "project_description"]
]

Unnamed: 0,ppno,project_name,current_phase,ea,project_description
159,2830,Us 101 Avila Beach Dr. Interchange,PS&E,1G480,"Construct operational improvements and a park-and-ride lot. A roundabout is proposed to be constructed at the intersection of the US 101 southbound ramps, Avila Beach Dr., and Shell Beach Rd. Operational and-or additional safety enhancements will be considered for the US 101 northbound ramps-Monte Rd. intersection."
162,2830,Us 101/ Avila Beach Rd I/C Improvement And Park-And-Ride Lot,PS&E\nR/W,1G480,Interchange operational improvements for US 101 SB ramps at Avila Beach Dr. at Shell Beach Rd. Construct Park-and-Ride Lot at southwest corner of Avila Beach Dr. and Shell Beach Rd near Pismo Beach
178,6288,South Fresno I/C Project,PA&ED,0H240,Modify interchanges
182,7036,Sr 180 West Extension,CANDIDATE,0Y230,"In Fresno County, near Mendota from I-5 to Route 33, construct 2-lane conventional highway on new alignment. This project would propose to construct new alignments and covert existing County roadways to a State Route."
184,7036,180 West Extension,PID,0Y230,Construct highway
189,6288,South Fresno State Route 99 Corridor Project,PA&ED,0H240,"In Fresno County, in and near Fresno, from 0.4 mile south of American Avenue to 0.4 mile north of North Avenue. Modify interchanges."
193,6694,Grapevine I/C,PA&ED,0R100,Relocate I/C
194,3386E,Sr 46 Gap Closure Segment 4C,PS&E/ROW,44256,"From Brown Material Rd to Farnsworth Ave, 2C to 4E"
203,6694,Grapevine Interchange,NONE,0R100,"In Kern County, near Grapevine from 0.7 mile south of the Grapevine UC to 0.5 mile south of the California Aqueduct, relocate interchange.\nThis project is mitigation to the Grapevine Specific and Community Plan EIR."
206,3386E,Sr-46 Expressway - Segment 4C (Kern)\nSr 46 Gap Closure Segment 4C,CON,44256,"44256 In and near Lost Hills, from 1 mile west of Browns Material Road to California Aquaduct. Convert from 2-lane conventional highway to 4-lane expressway.\n\nFrom Brown Material Road to Farnsworth Avenue, convert 2-lane conventional highway to a 4-lane expressway. Possibly drop if it is fully funded."


### Add Fake Values

In [52]:
df_investment_plan2 = df_investment_plan[
    [
        "ppno",
        "ct_project_id",
        "district",
        "route",
        "urban_rural",
        "project_name",
        "current_phase",
        "project_description",
        "priority_project",
        "previous_caltrans_nominations",
        "primary_mode",
        "lead_agency",
        "pid_approval_date",
        "target_pa_ed",
        "rtl_date",
        "con_start_date",
        "funding_need_phase_s_",
        "total_project_cost__$1,000_",
        "potential_funding_program_s_"
    ]
]

In [53]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(365)

In [54]:
# Fake columns
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_trasnportation",
]

In [55]:
# Add fake metric columns
for i in fake_columns:
    df_investment_plan2[i] = np.round(
        np.random.uniform(0.0, 50.0, size=(len(df_investment_plan), 1)), 2
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [56]:
# Create a column for fake funds requested - there only seems to be a total project cost col? 
df_investment_plan2['current_fake_fund_requested'] = np.round(
        np.random.uniform(100000, 500000.0, size=(len(df_investment_plan), 1)), 2
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [58]:
# Create a fake benefit score
df_investment_plan2['fake_benefit_score'] = ((df_investment_plan2[fake_columns].sum(axis=1))/(df_investment_plan2['current_fake_fund_requested']))*10000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [59]:
# Create a fake statewide project rank 
df_investment_plan2['statewide_rank'] = df_investment_plan2['fake_benefit_score'].rank(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [60]:
# Create a fake district project rank 
df_investment_plan2['district_rank'] = df_investment_plan2.groupby('district')['fake_benefit_score'].rank(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [63]:
df_investment_plan2.sample(2)

Unnamed: 0,ppno,ct_project_id,district,route,urban_rural,project_name,current_phase,project_description,priority_project,previous_caltrans_nominations,primary_mode,lead_agency,pid_approval_date,target_pa_ed,rtl_date,con_start_date,funding_need_phase_s_,"total_project_cost__$1,000_",potential_funding_program_s_,increase_peak_person_throughput,reduction_in_peak_period_delay,reduction_in_fatal_and_injury_crashes,reduction_in_injury_rates,increase_access_to_jobs,increase_access_jobs_to_DAC,commercial_dev_developed,tons_of_goods_impacted,improve_air_quality,impact_natural_resources,support_of_trasnportation,current_fake_fund_requested,fake_benefit_score,statewide_rank,district_rank
317,,,7.0,Off,Urban,La County Freight Rail Efficiency Project At Malabar Yard,NONE,The project would replace intermodal railcar storage capacity from BNSF's West Bank Yard to BNSF's Malabar Yard. Approximately 500 feet of new track would be constructed to connect BNSF Malabar Yard with the Los Angeles Railway Junction through east 46th Street. The 49th Street at-grade railroad crossing would be permanently closed.,0.0,DRMT Consolidated Statewide Project List\nINFRA 2021 Caltrans Application\nStimulus Drill 2021\nTCEP Cycle 2 Priority 6 of 26 (Not Awarded),Rail (Freight),Bnsf/La Metro,datetime64[ns],,2023-04-30 00:00:00,datetime64[ns],,85738.0,Crisi\nInfra\nIntercity Pass Rail\nTcep,27.75,41.47,14.13,37.33,45.61,42.44,40.5,28.94,21.99,31.28,7.03,276499.37,12.241258,191.0,22.0
550,,,75.0,Off,Urban,Signal Respacing: Maple To Solow,NONE,Respace existing intermediate signals,0.0,DRMT Consolidated Statewide Project List,Rail (Passenger),Metrolink,datetime64[ns],,datetime64[ns],datetime64[ns],,4900.0,Crisi\nIntercity Pass Rail,9.88,6.46,46.86,5.81,10.62,32.77,3.38,27.77,12.62,25.03,40.83,192315.55,11.545088,217.0,51.0


In [65]:
# Create a new monetary columns that is formatted
monetary_cols = ['total_project_cost__$1,000_', 'current_fake_fund_requested']

In [69]:
 df_investment_plan2.to_excel(f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name='fake', index= False)  