# 10 Year Non Shopp Data
* Generating fake data.
* Initial exploration.
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [1]:
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *
from shared_utils import portfolio_utils



In [2]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import _utils

In [4]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

In [5]:
# Open first sheet.
df_investment_plan = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan Project List 11-7-22.xlsx")
)



### Cleaning

In [6]:
# Clean up monetary columns, clean up some of the values. 
df_investment_plan["total_project_cost__$1,000_"] = df_investment_plan[
    "total_project_cost__$1,000_"
].replace(
    {
        "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
        "$85,738 (Cycle 2)": 85738,
        "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
        "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
        "$-": 0,
        "TBD": 0,
    }
)

In [7]:
# Fill in total project cost with fake number and cast to float64
#df_investment_plan["total_project_cost__$1,000_"] = (
    #df_investment_plan["total_project_cost__$1,000_"].astype("float64").fillna(888888.0)
#)

In [8]:
# Coerce dates to right format
date_cols = [
    "pid_approval_date",
    "rtl_date",
    "con_start_date",
]
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(pd.to_datetime, errors="coerce")

In [9]:
# Fill na based on dtypes
df_investment_plan = df_investment_plan.fillna(
    df_investment_plan.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [10]:
# Titlecase string columns
string_cols = [
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "urban_rural",
    "current_phase",
    "project_name",
    "potential_funding_program_s_",
]

In [11]:
for c in string_cols:
    df_investment_plan[c] = df_investment_plan[c].str.title().str.strip()

In [12]:
# Correct some curent phase values that read in weirdly
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    {
        "ps&e\nr/w": "ps&e",
        "other\nplan/concept": "other\plan\concept",
        "pid\nplan/concept": "pid\plan\concept",
        "ps&e\nrow": "ps&e\row",
    }
)

In [13]:
# Upper case phases & counties
uppercase_cols = ["current_phase", "county"]
for c in uppercase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.upper().str.strip()

In [14]:
# Clean up counties
def count_by_commas(df, col_to_count: str, new_col_name:str): 
    df[new_col_name] = (
    df[col_to_count]
    .apply(lambda x: len(x.split(",")))
    .astype("int64")) 
    return df 

In [15]:
# Count number of counties this project crosses multiple counties.
df_investment_plan = count_by_commas(df_investment_plan, 'county','number_of_counties')

In [16]:
# New column that summarizes whether a project is in one or more counties
def various_counties(row):
    if row["number_of_counties"] == 1:
        return "One County"
    else:
        return "Multiple Counties"

In [17]:
df_investment_plan['various_or_single_county'] = df_investment_plan.apply(lambda x: various_counties(x), axis=1)

### Correct Projects that are in the wrong district

In [18]:
df_investment_plan.loc[(df_investment_plan["project_name"] == "Polb Terminal Island Wye Track Realignment"), "district"] = "7"

In [19]:
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Sr 58 Truck Climbing Lanes Segment 2'), "district"] = "6"
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Mojave To Boron Freeway'), "district"] = "6"
df_investment_plan.loc[(df_investment_plan["project_name"] == 'Sr 58/California City Blvd. Extension'), "district"] = "6"

In [20]:
#df_investment_plan.groupby(['county', 'district']).agg({'ppno':'nunique'})

### Duplicated PPNO?

In [21]:
# Check that each row represents a different project
# PPNO and Project ID have a lot less unique value b/c 70% and 56% of each col are not filled in
for i in ["ppno", "ct_project_id", "project_name"]:
    print(f"{i} : {df_investment_plan[i].nunique()}")

ppno : 204
ct_project_id : 300
project_name : 742


In [22]:
# One project is included twice
df_investment_plan["project_name"].value_counts().head()

Delano To Pixely 6-Lane                                                       2
Crescent City Ada                                                             1
Rte 55 Btw Sr 73 And I-5, Managed Lanes Project                               1
Reconnecting Grand Avenue "Complete Streets To Transit" And Safety Project    1
I-5 Btw Rte 73 And Sd Co, Managed Lanes Project                               1
Name: project_name, dtype: int64

In [23]:
# Check and make sure there aren't duplicate PPNO-Project Name combos
duplicate_ppno = (
    df_investment_plan[~df_investment_plan["ppno"].isin([0, "None"])]
    .groupby("ppno")
    .agg({"project_name": "count"})
    .sort_values("project_name")
    .reset_index()
)

In [24]:
# Filter out for ppno with more than 2 project names.
# Save those projects into a list
duplicate_ppno = duplicate_ppno[duplicate_ppno["project_name"] > 1]

In [25]:
duplicate_ppno_list = duplicate_ppno["ppno"].to_list()

In [26]:
# Check out these rows in our df
# are these duplicates?
len(df_investment_plan[df_investment_plan["ppno"].isin(duplicate_ppno_list)])

24

In [27]:
duplicate_projects_df = df_investment_plan[
    df_investment_plan["ppno"].isin(duplicate_ppno_list)
]

In [28]:
duplicate_projects_df[
    ["ppno", "project_name", "current_phase", "ea", "project_description"]
].sort_values('project_name')

Unnamed: 0,ppno,project_name,current_phase,ea,project_description
184,7036,180 West Extension,PID,0Y230,Construct highway
233,6421,Caldwell I/C,PS&E\nR/W,48740,Reconstruct interchange
244,6421,Caldwell Interchange,PS&E,48740,"In Tulare County, from 0.6 mile south of the Avenue 280 (Caldwell Ave) Interchange to 0.7 mile north of the Ave 280 (Caldwell Ave) Interchange, reconstruct interchange."
220,6642,Chowchilla 99/233 I/C,PA&ED,0P910,Reconstruct interchange
245,6940,Commercial Ave Interchange,PS&E,0U880,In Tulare County near the City of Tulare at Commercial Avenue and State Route 99 between 0.9 mile north of Avenue 200 OC and Paige Avenue OC; Construct new interchange and construct north and south bound auxiliary lanes
193,6694,Grapevine I/C,PA&ED,0R100,Relocate I/C
203,6694,Grapevine Interchange,NONE,0R100,"In Kern County, near Grapevine from 0.7 mile south of the Grapevine UC to 0.5 mile south of the California Aqueduct, relocate interchange.\nThis project is mitigation to the Grapevine Specific and Community Plan EIR."
232,6940,International Agri-Center Way Interchange,PS&E\nR/W,0U880,Construct interchange
241,0104,Lindsay And Route 198/245 Operational Improvements,NONE,43080,"Near Lindsay at various locations from Avenue 224 (Lindmore Street) to west of Cedar Ave and on Route 198 at the SR 245 (Spruce Ave) junction, realignment and operational improvements. This project is not truly capacity increasing. The short amount of new pavement creates a shortcut bypassing local intersections. The project is not likely to induce additional demand. The additional expressway construction has been removed from the project."
229,0104,Lindsay And Rte 198/245 Operational Improvements,PA&ED,43080,Construct 4E along a 2-lane county road


### Add Fake Values

In [29]:
df_investment_plan2 = df_investment_plan.copy()

#### Create fake metrics

In [30]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(365)

In [31]:
# Fake columns
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_trasnportation",
]

In [32]:
# Add fake metric columns
for i in fake_columns:
    df_investment_plan2[i] = np.round(
        np.random.uniform(0.0, 50.0, size=(len(df_investment_plan), 1)), 2
    )

#### Create fake benefit score and ranks.

In [33]:
# Create a fake benefit score
df_investment_plan2["fake_benefit_score"] = (
    (df_investment_plan2[fake_columns].sum(axis=1))
    / (df_investment_plan2['total_unfunded_need__$1,000_']).sum()
) * 10000

In [34]:
# Create a fake statewide project rank
df_investment_plan2["statewide_rank"] = df_investment_plan2["fake_benefit_score"].rank(
    ascending=False
)

In [35]:
df_investment_plan2["district_rank"] = df_investment_plan2.groupby("district")["fake_benefit_score"].rank(method="dense", ascending=False)

##### Double test district rank is what I expect

In [36]:
# D4 only projects
len(df_investment_plan2.loc[df_investment_plan2['district'] == 4])

97

In [37]:
# df_investment_plan2.loc[df_investment_plan2['district'] == 4][['fake_benefit_score','district_rank']].sort_values('district_rank')

#### Add median across districts
* project cost
* unfunded needs
* benefit score

### Portfolio Prep

#### Add Full County Names & Remap Districts to avoid 75/74

In [38]:
# Map full county names & districts 
# To avoid certain counties being under 75/74/etc
df_county = to_snakecase(
    pd.read_excel(f"{GCS_FILE_PATH}full_counties.xlsx")
)

df_county = (df_county
             .rename(columns = {'county':'full_county_name',
                               'district':'district_not_original_df'})
            )

In [39]:
df_investment_plan2 = pd.merge(
    df_investment_plan2,
    df_county,
    left_on="county",
    right_on="abbrev_",
    how="left",
    indicator = True
)

#### Rename Districts to mimic portfolio

In [40]:
df_investment_plan2["district"] = df_investment_plan2["district"].astype("int64")

In [41]:
# Add official Caltrans District names
district_dictionary = {
    7: "07 - Los Angeles",
    4: "04 - Oakland",
    2: "02 - Redding",
    9: "09 - Bishop",
    10: "10 - Stockton",
    11: "11 - San Diego",
    3: "03 - Marysville",
    12: "12 - Irvine",
    8: "08 - San Bernardino",
    5: "05 - San Luis Obispo",
    6: "06 - Fresno",
    1: "01 - Eureka",
    75: "75 - HQ",
    74: "74 - HQ",
    0: "None",
}

In [42]:
df_investment_plan2["district"] = df_investment_plan2["district"].replace(
    district_dictionary
)

In [60]:
df_investment_plan2["full_county_name"] = df_investment_plan2["full_county_name"].fillna("Various")

In [63]:
# Check and make sure counties - district relationship makes sense. 
# df_investment_plan2[['county','full_county_name']].loc[df_investment_plan2['full_county_name'] == 'Various']

#### Drop unwanted columns

In [56]:
unwanted_cols = ['district_not_original_df',
       'abbrev_', '_merge', '_2023', '_2024', '_2025', '_2026', '_2027', '_2028', '_2029', '_2030',
       '_2031', '_2032', '_2033', 'number_of_counties',]

In [57]:
df_investment_plan2 = df_investment_plan2.drop(columns = unwanted_cols)

### Some more checks 

In [45]:
df_investment_plan2.project_name.nunique()

742

In [46]:
len(df_investment_plan2)

743

In [47]:
df_investment_plan2.district.value_counts().sum()

743

### Save

In [65]:
df_investment_plan2 = _utils.clean_up_columns(df_investment_plan2)

In [67]:
# df_investment_plan2.to_excel(f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name='fake', index= False)