# 10 Year Non Shopp Data
* Generating fake data.
* Initial exploration.
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [70]:
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *
from shared_utils import portfolio_utils

In [71]:
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [72]:
import _utils

In [73]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

In [74]:
# Open first sheet.
df_investment_plan = to_snakecase(
    pd.read_excel(
        f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan Project List 11-7-22.xlsx"
    )
)



### Cleaning

In [75]:
df_investment_plan.shape

(743, 69)

In [76]:
# Look at nulls by % for each column
(df_investment_plan.isnull().sum() / len(df_investment_plan)) * 100

ea                                             55.181696
ppno                                           70.794078
ct_project_id                                  58.008075
district                                        0.672948
county                                          0.403769
route                                           2.153432
beg_pm                                         53.701211
end_pm                                         53.835801
urban_rural                                    15.074024
project_name                                    0.134590
project_description                             6.191117
hq_priority                                    88.829071
district_priority                              99.865410
previous_caltrans_nominations                   9.152086
primary_mode                                    0.134590
secondary_mode_s_                              66.218035
shs_capacity_increase_detail                   62.045760
potential_funding_program_s_   

In [77]:
# Clean up some of the values.
df_investment_plan["total_project_cost__$1,000_"] = df_investment_plan[
    "total_project_cost__$1,000_"
].replace(
    {
        "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
        "$85,738 (Cycle 2)": 85738,
        "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
        "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
        "$-": 0,
        "TBD": 0,
    }
)

In [78]:
# df_investment_plan["total_project_cost__$1,000_"].unique().tolist()

In [79]:
# Coerce dates to right format
date_cols = [
    "pid_approval_date",
    "rtl_date",
    "con_start_date",
]
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(pd.to_datetime, errors="coerce")

In [80]:
# Fill na based on dtypes
df_investment_plan = df_investment_plan.fillna(
    df_investment_plan.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [81]:
# Titlecase some string columns
titlecase_cols = [
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "project_name",
    "potential_funding_program_s_",
]

In [82]:
for c in titlecase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.title().str.strip()

In [83]:
# Upper case phases & counties
uppercase_cols = ["current_phase", "county"]
for c in uppercase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.upper().str.strip()

In [84]:
# lower case phases & counties
lower_case = [
    "urban_rural",
]
for c in lower_case:
    df_investment_plan[c] = df_investment_plan[c].str.lower().str.strip()

In [85]:
# Correct some curent phase values that read in messily
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    {
        "ps&e\nr/w": "ps&e",
        "other\nplan/concept": "other\plan\concept",
        "pid\nplan/concept": "pid\plan\concept",
        "ps&e\nrow": "ps&e\row",
    }
)

#### Correct Projects that are in the wrong district

In [86]:
# df_investment_plan[['project_name','county','district']]

In [87]:
df_investment_plan.loc[
    (
        df_investment_plan["project_name"]
        == "Polb Terminal Island Wye Track Realignment"
    ),
    "district",
] = "7"

In [88]:
df_investment_plan.loc[
    (df_investment_plan["project_name"] == "Sr 58 Truck Climbing Lanes Segment 2"),
    "district",
] = "6"
df_investment_plan.loc[
    (df_investment_plan["project_name"] == "Mojave To Boron Freeway"), "district"
] = "6"
df_investment_plan.loc[
    (df_investment_plan["project_name"] == "Sr 58/California City Blvd. Extension"),
    "district",
] = "6"

### Investigate: Duplicated PPNO?

In [89]:
# Check that each row represents a different project
for i in ["ppno", "ct_project_id", "project_name"]:
    print(f"{i} : {df_investment_plan[i].nunique()}")

ppno : 204
ct_project_id : 300
project_name : 742


In [90]:
# One project is included twice
df_investment_plan["project_name"].value_counts().head()

Delano To Pixely 6-Lane                                                       2
Crescent City Ada                                                             1
Rte 55 Btw Sr 73 And I-5, Managed Lanes Project                               1
Reconnecting Grand Avenue "Complete Streets To Transit" And Safety Project    1
I-5 Btw Rte 73 And Sd Co, Managed Lanes Project                               1
Name: project_name, dtype: int64

In [91]:
# Check and make sure there aren't duplicate PPNO-Project Name combos
duplicate_ppno = (
    df_investment_plan[~df_investment_plan["ppno"].isin([0, "None"])]
    .groupby("ppno")
    .agg({"project_name": "count"})
    .sort_values("project_name")
    .reset_index()
)

In [92]:
# Filter out for ppno with more than 2 project names.
# Save those projects into a list
duplicate_ppno = duplicate_ppno[duplicate_ppno["project_name"] > 1]

In [93]:
duplicate_ppno_list = duplicate_ppno["ppno"].to_list()

In [94]:
# Check out these rows in our df
# are these duplicates?
len(df_investment_plan[df_investment_plan["ppno"].isin(duplicate_ppno_list)])

24

In [95]:
duplicate_projects_df = df_investment_plan[
    df_investment_plan["ppno"].isin(duplicate_ppno_list)
]

In [96]:
"""
duplicate_projects_df[
    ["ppno", "project_name", "current_phase", "ea", "project_description"]
].sort_values('project_name')"""

'\nduplicate_projects_df[\n    ["ppno", "project_name", "current_phase", "ea", "project_description"]\n].sort_values(\'project_name\')'

### Add Fake Values

In [97]:
df_investment_plan2 = df_investment_plan.copy()

#### Create fake metrics

In [98]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(365)

In [99]:
# Fake columns
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_transportation",
]

In [100]:
# Add fake metric columns
for i in fake_columns:
    df_investment_plan2[i] = np.round(
        np.random.uniform(0.0, 50.0, size=(len(df_investment_plan), 1)), 2
    )

#### Create fake benefit score and ranks.

In [101]:
# Create a fake benefit score
df_investment_plan2["fake_benefit_score"] = (
    (df_investment_plan2[fake_columns].sum(axis=1))
    / (df_investment_plan2["total_unfunded_need__$1,000_"]).sum()
) * 1000000

In [102]:
# Create a fake statewide project rank
df_investment_plan2["statewide_rank"] = df_investment_plan2["fake_benefit_score"].rank(
    ascending=False
)

In [103]:
df_investment_plan2["district_rank"] = df_investment_plan2.groupby("district")[
    "fake_benefit_score"
].rank(method="dense", ascending=False)

##### Double test district rank is what I expect

In [104]:
# D4 only projects
len(df_investment_plan2.loc[df_investment_plan2["district"] == 4])

97

In [105]:
# df_investment_plan2.loc[df_investment_plan2['district'] == 4][['fake_benefit_score','district_rank']].sort_values('district_rank')

### Add median across districts
* project cost
* unfunded needs
* benefit score

In [106]:
df_investment_plan2["district"] = df_investment_plan2["district"].astype("int64")

In [107]:
# Grab medians
summary_district_state = (
    df_investment_plan2.groupby(["district"])
    .agg(
        {
            "total_project_cost__$1,000_": "median",
            "fake_benefit_score": "median",
            "total_unfunded_need__$1,000_": "median",
            "csis_total_score__out_of_45_": "median",
            "atp_total_score__out_of_100_": "median"
        }
    )
    .reset_index()
)

In [108]:
# Add suffixes
summary_district_state = summary_district_state.add_suffix("_district_median_")

In [109]:
df_investment_plan2 = pd.merge(
    df_investment_plan2,
    summary_district_state,
    left_on="district",
    right_on="district_district_median_",
    how="left",
)

### District/County

#### Add Full County Names & Remap Districts to avoid 75/74

In [110]:
# Map full county names 
df_county = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}full_counties.xlsx"))

df_county = df_county.rename(
    columns={
        "county": "full_county_name",
    }
).drop(columns=["district"])

In [111]:
# Merge 
df_investment_plan2 = pd.merge(
    df_investment_plan2,
    df_county,
    left_on="county",
    right_on="abbrev_",
    how="left",
    indicator=True,
)

In [112]:
# Fill full_county_name column with none if it's none in the county column
df_investment_plan2.loc[
    (df_investment_plan2["county"] == "NONE"),
    "full_county_name",
] = "None"

In [113]:
# Any projects that don't have a full county name cross Various counties
# Fill N/A with various
df_investment_plan2["full_county_name"] = df_investment_plan2[
    "full_county_name"
].fillna("Various")

In [114]:
# Check and make sure counties - district relationship makes sense.
# df_investment_plan2[['county','full_county_name']].loc[df_investment_plan2['full_county_name'] == 'Various']

#### Rename Districts to mimic portfolio

In [115]:
# Add official Caltrans District names
district_dictionary = {
    7: "07 - Los Angeles",
    4: "04 - Oakland",
    2: "02 - Redding",
    9: "09 - Bishop",
    10: "10 - Stockton",
    11: "11 - San Diego",
    3: "03 - Marysville",
    12: "12 - Irvine",
    8: "08 - San Bernardino",
    5: "05 - San Luis Obispo",
    6: "06 - Fresno",
    1: "01 - Eureka",
    75: "75 - HQ",
    74: "74 - HQ",
    0: "None",
}

In [116]:
df_investment_plan2["district"] = df_investment_plan2["district"].replace(
    district_dictionary
)

#### Drop unwanted columns

In [117]:
unwanted_cols = [
    "abbrev_",
    "_merge",
    "_2023",
    "_2024",
    "_2025",
    "_2026",
    "_2027",
    "_2028",
    "_2029",
    "_2030",
    "_2031",
    "_2032",
    "_2033",
    "district_district_median_",
]

In [118]:
df_investment_plan2 = df_investment_plan2.drop(columns=unwanted_cols)

### Some more checks 

In [119]:
df_investment_plan2.project_name.nunique(), df_investment_plan.project_name.nunique()

(742, 742)

In [120]:
len(df_investment_plan2)

743

In [121]:
df_investment_plan2.district.value_counts().sum()

743

### Save

In [122]:
df_investment_plan2 = _utils.clean_up_columns(df_investment_plan2)

In [123]:
df_investment_plan2.head(1)

Unnamed: 0,Ea,Ppno,Ct Project Id,District,County,Route,Beg Pm,End Pm,Urban Rural,Project Name,Project Description,Hq Priority,District Priority,Previous Caltrans Nominations,Primary Mode,Secondary Mode S,Shs Capacity Increase Detail,Potential Funding Program S,Notes,Lead Agency,Current Phase,Pid Approval Date,Target Pa Ed,Rtl Date,Con Start Date,Funding Need Phase S,"Total Project Cost $1,000","Pa Ed Cost $1,000","Ps E Cost $1,000","Row Cost $1,000","Con Cost $1,000","Non Infrastructure Plan Cost $1,000","Total Unfunded Need $1,000",Previous Funding Request,Previous Funding Request Phase,Last Scored,Mode Shift Csis,Vmt Csis,Dac Local Community Needs Csis,Public Engagement Csis,Safety Csis,Zev Csis,Climate Resiliency Csis,Natural Resources And Ecosystems Csis,Infill Development And Land Use Csis,Csis Total Score Out Of 45,Csis Alignment,Benefits To Dac And Advancing Equity Atp,Community Need Atp,Safety Atp,Public Participation Atp,Community Feedback Atp,Continued Engagement Atp,Context Sensitive And Innovation Atp,Transformative Atp,Atp Total Score Out Of 100,Atp Alignment,Access Alignment,Increase Peak Person Throughput,Reduction In Peak Period Delay,Reduction In Fatal And Injury Crashes,Reduction In Injury Rates,Increase Access To Jobs,Increase Access Jobs To Dac,Commercial Dev Developed,Tons Of Goods Impacted,Improve Air Quality,Impact Natural Resources,Support Of Transportation,Fake Benefit Score,Statewide Rank,District Rank,"Total Project Cost $1,000 District Median",Fake Benefit Score District Median,"Total Unfunded Need $1,000 District Median",Csis Total Score Out Of 45 District Median,Atp Total Score Out Of 100 District Median,Full County Name
0,0C660,1095,113000023,01 - Eureka,DN,,24.4,R27.80,rural,Crescent City Ada,"Traffic calming, non-motorized and multi-modal improvements. Enhance safety for pedestrians and non-motorized vehicles",0.0,0.0,SHOPP,Complete Streets,Bike/Pedestrian,,At Infra Invest\nAtp\nRaise,PID Completed,Caltrans,CLOSEOUT,datetime64[ns],,datetime64[ns],datetime64[ns],,4100.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Not Well-Aligned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,Not Well-Aligned,,47.07,38.24,38.77,39.56,11.96,20.35,25.37,4.59,22.02,42.52,5.77,46.282328,258.0,11.0,5071.0,43.990242,0.0,0.0,0.0,Del Norte


In [124]:
"""
df_investment_plan2.to_excel(
    f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name="fake", index=False
)"""

'\ndf_investment_plan2.to_excel(\n    f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name="fake", index=False\n)'