# Exploring the 10-Year Non-SHOPP Data
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [None]:
import numpy as np
import pandas as pd
from babel.numbers import format_currency
from calitp import *
from shared_utils import portfolio_utils

In [None]:
# Notebook display settings: show up to 100 columns and never truncate
# rows or cell contents when a frame is rendered.
pd.set_option("display.max_columns", 100)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
import _utils

In [None]:
# Base GCS folder (with trailing slash) that the Excel inputs in this notebook
# are read from; file names are appended directly to it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

## Investment Plan

In [None]:
# Open first sheet (pandas reads sheet 0 when no sheet_name is given) and pass
# the frame through to_snakecase.
_investment_plan_path = f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan.xlsx"
df_investment_plan = to_snakecase(pd.read_excel(_investment_plan_path))

### Look at null values for each column

In [None]:
# Print each column's dtype and non-null count.
df_investment_plan.info()

### Some Cleaning

In [None]:
# Clean up monetary columns: free-text cost entries are swapped for a single
# number, and the placeholders "$-" and "TBD" become 0.
_cost_text_to_number = {
    "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
    "$85,738 (Cycle 2)": 85738,
    "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
    "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
    "$-": 0,
    "TBD": 0,
}
_raw_cost = df_investment_plan["total_project_cost__$1,000_"]
df_investment_plan["total_project_cost__$1,000_"] = _raw_cost.replace(
    _cost_text_to_number
)

In [None]:
# Cast total project cost to float64, then put the fake number 888888.0
# wherever the cost is still missing.
_cost_as_float = df_investment_plan["total_project_cost__$1,000_"].astype("float64")
df_investment_plan["total_project_cost__$1,000_"] = _cost_as_float.fillna(888888.0)

In [None]:
# Coerce dates to right format. Each cell is parsed on its own; anything that
# cannot be parsed becomes NaT because of errors="coerce".
date_cols = ["pid_approval_date", "rtl_date", "con_start_date"]
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(
        lambda cell: pd.to_datetime(cell, errors="coerce")
    )

In [None]:
# Fill na based on types: 0.0 for float64 columns, "None" for object columns.
# The mapping is built only for those two dtypes. Calling
# `dtypes.replace({...})` instead leaves every other column's entry as a raw
# dtype object, which `fillna` would then receive as that column's fill value.
_fill_values = {
    col: (0.0 if dtype == "float64" else "None")
    for col, dtype in df_investment_plan.dtypes.items()
    if dtype == "float64" or dtype == "object"
}
df_investment_plan = df_investment_plan.fillna(_fill_values)

In [None]:
# Titlecase string columns.
# Each column is listed once ("potential_funding_program_s_" used to appear
# twice, which only repeated the same work).
string_cols = [
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "urban_rural",
    "current_phase",
    "project_name",
]

In [None]:
# Title-case every listed text column, then trim surrounding whitespace.
for c in string_cols:
    _titled = df_investment_plan[c].str.title()
    df_investment_plan[c] = _titled.str.strip()

In [None]:
# Correct some current phase values that read in weirdly.
# The phase text has already been title-cased by this point, so it is
# lower-cased first; otherwise none of the lower-case keys below can match.
# Backslashes in the replacement values are written as "\\" so that the "\r"
# in "ps&e\row" is a backslash followed by "r", not a carriage return.
df_investment_plan["current_phase"] = (
    df_investment_plan["current_phase"]
    .str.lower()
    .replace(
        {
            "ps&e\nr/w": "ps&e",
            "other\nplan/concept": "other\\plan\\concept",
            "pid\nplan/concept": "pid\\plan\\concept",
            "ps&e\nrow": "ps&e\\row",
        }
    )
)

# Upper case phases & counties
uppercase_cols = ["current_phase", "county"]
for c in uppercase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.upper().str.strip()

In [None]:
# Clean up counties
def count_by_commas(df, col_to_count: str, new_col_name: str):
    """Add a column holding how many comma-separated pieces each value has.

    Args:
        df: DataFrame to modify; the new column is written onto this frame.
        col_to_count: name of the string column whose values are counted.
        new_col_name: name of the int64 column that receives the counts.

    Returns:
        The same ``df`` object, with ``new_col_name`` added.
    """
    # A string with k commas splits into k + 1 pieces.
    pieces_per_value = df[col_to_count].map(lambda text: text.count(",") + 1)
    df[new_col_name] = pieces_per_value.astype("int64")
    return df

In [None]:
# Count number of counties this project crosses (one per comma-separated
# entry in the county column).
df_investment_plan = count_by_commas(
    df_investment_plan,
    col_to_count="county",
    new_col_name="number_of_counties",
)

In [None]:
# New column that summarizes whether a project is in one or more counties
def various_counties(row):
    """Return "One County" when ``row["number_of_counties"]`` is 1, else "Multiple Counties"."""
    return "One County" if row["number_of_counties"] == 1 else "Multiple Counties"

In [None]:
# Label every row as a single-county or multi-county project.
df_investment_plan["various_or_single_county"] = df_investment_plan.apply(
    various_counties, axis=1
)

In [None]:
# Show the first row as a quick check of the cleaned columns.
df_investment_plan.head(1)

### Correct Projects that are in the wrong district

In [None]:
# Correct the LA project that is coded in district 8 mistakenly
# df_investment_plan[(df_investment_plan["full_county_name"] == 'Los Angeles') & (df_investment_plan['district'] == 8)]['project_name']

In [None]:
# Move this project to district 7. The district is written as an integer
# rather than the string "7": later cells group by district and look districts
# up by integer key, and a string here would sit apart from the numeric 7 rows
# until the column is cast.
df_investment_plan.loc[
    df_investment_plan["project_name"] == "Polb Terminal Island Wye Track Realignment",
    "district",
] = 7

In [None]:
# Correct Kern from bishop 9 to fresno 6
# df_investment_plan[(df_investment_plan["full_county_name"] == 'Kern') & (df_investment_plan['district'] == 9)]['project_name'].unique()

In [None]:
# Move these projects to district 6. The district is written as an integer
# rather than the string "6" so the rows group together with the other
# numeric district 6 rows in later cells.
_projects_for_district_6 = [
    "Sr 58 Truck Climbing Lanes Segment 2",
    "Mojave To Boron Freeway",
    "Sr 58/California City Blvd. Extension",
]
df_investment_plan.loc[
    df_investment_plan["project_name"].isin(_projects_for_district_6), "district"
] = 6

In [None]:
# Count distinct PPNOs for every county/district pair to eyeball the pairing.
df_investment_plan.groupby(['county', 'district']).agg({'ppno':'nunique'})

### Duplicated PPNO?

In [None]:
# Check that each row represents a different project
# PPNO and Project ID have a lot less unique value b/c 70% and 56% of each col are not filled in
for i in ["ppno", "ct_project_id", "project_name"]:
    _distinct_values = df_investment_plan[i].nunique()
    print(f"{i} : {_distinct_values}")

In [None]:
# One project is included twice
# (value_counts sorts by frequency, so repeated names show up first).
df_investment_plan["project_name"].value_counts().head()

In [None]:
# Check and make sure there aren't duplicate PPNO-Project Name combos.
# Rows whose PPNO is one of the fill values (0 or "None") are left out, then
# the project names are counted per PPNO.
_has_real_ppno = ~df_investment_plan["ppno"].isin([0, "None"])
_names_per_ppno = (
    df_investment_plan[_has_real_ppno]
    .groupby("ppno")
    .agg({"project_name": "count"})
)
duplicate_ppno = _names_per_ppno.sort_values("project_name").reset_index()

In [None]:
# Keep only the PPNOs that have more than one project name counted.
_is_repeated = duplicate_ppno["project_name"] > 1
duplicate_ppno = duplicate_ppno.loc[_is_repeated]

In [None]:
# Save the repeated PPNOs into a plain list.
duplicate_ppno_list = duplicate_ppno["ppno"].tolist()

In [None]:
# Check out these rows in our df
# are these duplicates?  (number of rows carrying one of the repeated PPNOs)
df_investment_plan[df_investment_plan["ppno"].isin(duplicate_ppno_list)].shape[0]

In [None]:
# Pull the rows that carry one of the repeated PPNOs.
_in_duplicate_list = df_investment_plan["ppno"].isin(duplicate_ppno_list)
duplicate_projects_df = df_investment_plan[_in_duplicate_list]

In [None]:
# Show the identifying columns of the possibly duplicated projects.
_review_cols = ["ppno", "project_name", "current_phase", "ea", "project_description"]
duplicate_projects_df[_review_cols]

### Add Fake Values

In [None]:
# Subset to the columns needed going forward.
# .copy() makes this an independent frame: new columns are assigned onto it in
# later cells, and assigning onto a plain column selection of another frame
# triggers pandas' SettingWithCopyWarning.
df_investment_plan2 = df_investment_plan[
    [
        "ppno",
        "county",
        "various_or_single_county",
        "ct_project_id",
        "district",
        "route",
        "urban_rural",
        "project_name",
        "current_phase",
        "project_description",
        "priority_project",
        "previous_caltrans_nominations",
        "primary_mode",
        "lead_agency",
        "pid_approval_date",
        "target_pa_ed",
        "rtl_date",
        "con_start_date",
        "funding_need_phase_s_",
        "total_project_cost__$1,000_",
        "potential_funding_program_s_",
    ]
].copy()

In [None]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
# Seed NumPy's global random generator so the random draws in the cells below
# come out the same on every run.
np.random.seed(365)

In [None]:
# Fake columns: names of the made-up metric columns that get random values.
# NOTE(review): "support_of_trasnportation" looks like a misspelling of
# "transportation"; it is left as is because the string becomes a column name.
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_trasnportation",
]

In [None]:
# Add fake metric columns: one uniform random value in [0, 50) per row,
# rounded to 2 decimals.
_metric_shape = (len(df_investment_plan), 1)
for i in fake_columns:
    _random_metric = np.random.uniform(0.0, 50.0, size=_metric_shape)
    df_investment_plan2[i] = np.round(_random_metric, 2)

In [None]:
# Create a column for fake funds requested - there only seems to be a total project cost col?
# Values are uniform random draws in [100000, 500000), rounded to 2 decimals.
_random_funds = np.random.uniform(
    100000, 500000.0, size=(len(df_investment_plan), 1)
)
df_investment_plan2["current_fake_fund_requested"] = np.round(_random_funds, 2)

In [None]:
# Create a fake benefit score: the row-wise sum of the fake metrics divided by
# the fake funds requested, scaled by 10000.
_metric_total = df_investment_plan2[fake_columns].sum(axis=1)
_funds_requested = df_investment_plan2["current_fake_fund_requested"]
df_investment_plan2["fake_benefit_score"] = (_metric_total / _funds_requested) * 10000

In [None]:
# Create a fake statewide project rank (the highest benefit score gets rank 1).
_all_scores = df_investment_plan2["fake_benefit_score"]
df_investment_plan2["statewide_rank"] = _all_scores.rank(ascending=False)

In [None]:
# Create a fake district project rank: same ranking as above, but computed
# separately within each district.
_scores_by_district = df_investment_plan2.groupby("district")["fake_benefit_score"]
df_investment_plan2["district_rank"] = _scores_by_district.rank(ascending=False)

In [None]:
# Add agency names
# NOTE(review): add_agency_name comes from shared_utils.portfolio_utils; the
# merge that follows expects its result to have a "calitp_agency_name" column
# - confirm against the helper.
agency_names = portfolio_utils.add_agency_name()

In [None]:
# Left-join the agency table onto the projects by lead agency name, so every
# project row is kept whether or not its agency is found.
df_investment_plan2 = df_investment_plan2.merge(
    agency_names,
    how="left",
    left_on="lead_agency",
    right_on="calitp_agency_name",
)

In [None]:
# Cast district to int64 so its values line up with the integer keys of the
# district name lookup used afterwards.
df_investment_plan2["district"] = df_investment_plan2["district"].astype("int64")

In [None]:
# Add official Caltrans District names, keyed by district number.
# 0 maps to "None".
district_dictionary = {
    0: "None",
    1: "01 - Eureka",
    2: "02 - Redding",
    3: "03 - Marysville",
    4: "04 - Oakland",
    5: "05 - San Luis Obispo",
    6: "06 - Fresno",
    7: "07 - Los Angeles",
    8: "08 - San Bernardino",
    9: "09 - Bishop",
    10: "10 - Stockton",
    11: "11 - San Diego",
    12: "12 - Irvine",
    74: "74 - HQ",
    75: "75 - HQ",
}

In [None]:
# Translate district numbers into their full names; numbers that are not in
# the dictionary are carried over unchanged by replace().
_district_numbers = df_investment_plan2["district"]
df_investment_plan2["district_full_name"] = _district_numbers.replace(district_dictionary)

In [None]:
# Map full county names: read the county lookup workbook.
df_county = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}full_counties.xlsx"))

# Del district from county, and call the county column "full_county_name".
df_county = df_county.drop(columns=["district"])
df_county = df_county.rename(columns={"county": "full_county_name"})

In [None]:
# Left-join the county lookup onto the projects: the project "county" value is
# matched against the lookup's "abbrev_" column, and all project rows are kept.
df_investment_plan2 = df_investment_plan2.merge(
    df_county,
    how="left",
    left_on="county",
    right_on="abbrev_",
)

In [None]:
# List the columns present after the merges.
df_investment_plan2.columns

In [None]:
# Rows with no full county name get the label "Various".
_county_names = df_investment_plan2["full_county_name"]
df_investment_plan2["full_county_name"] = _county_names.fillna("Various")

In [None]:
# Check and make sure the county - district relationship makes sense:
# count distinct PPNOs for every full county name / district name pair.
df_investment_plan2.groupby(['full_county_name', 'district_full_name']).agg({'ppno':'nunique'})

In [None]:
# df_investment_plan2.to_excel(f"{GCS_FILE_PATH}fake_data.xlsx", sheet_name='fake', index= False)