# Exploring 10-Year Non-SHOPP Data
* [Using Smartsheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [None]:
import numpy as np
import pandas as pd
from calitp import *

In [None]:
# Notebook display settings: show up to 100 columns and do not truncate
# rows or cell contents when rendering dataframes.
for option_name, option_value in {
    "display.max_columns": 100,
    "display.max_rows": None,
    "display.max_colwidth": None,
}.items():
    pd.set_option(option_name, option_value)

In [None]:
import _utils

In [None]:
# Base Google Cloud Storage folder for this analysis; file names are appended to it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"

## Investment Plan

In [None]:
# Load the workbook; read_excel reads the first sheet by default.
# to_snakecase (star-imported from calitp) presumably normalizes the column
# names to the snake_case form used throughout this notebook.
investment_plan_path = f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan.xlsx"
df_investment_plan = to_snakecase(pd.read_excel(investment_plan_path))

### Look at null values for each column

In [None]:
# Print each column's dtype and non-null count to see how completely it is filled in.
df_investment_plan.info()

### Some Cleaning

In [None]:
# Some cost cells hold free text instead of a number. Map each known text
# value to a single figure so the column can later be cast to float.
# NOTE(review): "$-" and "TBD" are recorded as 0 — confirm that treating
# unknown costs as zero is intended, since it affects any cost totals.
cost_col = "total_project_cost__$1,000_"
cost_text_to_value = {
    "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
    "$85,738 (Cycle 2)": 85738,
    "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
    "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
    "$-": 0,
    "TBD": 0,
}
df_investment_plan[cost_col] = df_investment_plan[cost_col].replace(cost_text_to_value)

In [None]:
# Cast the cleaned cost column to a float dtype so it can be aggregated.
cost_col = "total_project_cost__$1,000_"
df_investment_plan[cost_col] = df_investment_plan[cost_col].astype(float)

In [None]:
# Coerce the date columns to datetimes, one value at a time; anything that
# cannot be parsed becomes NaT instead of raising.
date_cols = ["pid_approval_date", "rtl_date", "con_start_date"]


def _to_timestamp_or_nat(value):
    """Parse a single cell as a datetime, returning NaT when it cannot be parsed."""
    return pd.to_datetime(value, errors="coerce")


for date_col in date_cols:
    df_investment_plan[date_col] = df_investment_plan[date_col].apply(_to_timestamp_or_nat)

In [None]:
# Fill missing values with a placeholder chosen by column dtype:
# 0.0 for float64 columns and the string "None" for object columns.
# Only those two dtypes get a fill value. Passing the raw dtypes Series to
# fillna also handed every other column (e.g. datetimes) its dtype object as
# the "fill value", which is not a meaningful placeholder.
fill_values = {
    **dict.fromkeys(df_investment_plan.select_dtypes(include="float64").columns, 0.0),
    **dict.fromkeys(df_investment_plan.select_dtypes(include="object").columns, "None"),
}
df_investment_plan = df_investment_plan.fillna(fill_values)

In [None]:
# Text columns to lowercase so that values compare consistently.
# Each column is listed exactly once.
string_cols = [
    "county",
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "project_description",
    "urban_rural",
    "current_phase",
    "project_name",
]

In [None]:
# Replace every column named in string_cols with its lowercased version.
df_investment_plan = df_investment_plan.assign(
    **{text_col: df_investment_plan[text_col].str.lower() for text_col in string_cols}
)

In [None]:
# Correct some current-phase values that were read in with embedded newlines.
# The replacement values use escaped backslashes ("\\") so the separator is a
# literal backslash. Unescaped, the "\r" in the "ps&e" + "\row" value is a
# carriage return, and "\p" / "\c" are invalid escape sequences.
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    {
        "ps&e\nr/w": "ps&e",
        "other\nplan/concept": "other\\plan\\concept",
        "pid\nplan/concept": "pid\\plan\\concept",
        "ps&e\nrow": "ps&e\\row",
    }
)

### Duplicated PPNO?

In [None]:
# Sanity check that each row is a distinct project by counting unique values
# in the identifying columns.
# PPNO and Project ID have far fewer unique values because roughly 70% and 56%
# of those columns, respectively, are not filled in.
id_cols = ["ppno", "ct_project_id", "project_name"]
for id_col, unique_count in df_investment_plan[id_cols].nunique().items():
    print(f"{id_col} : {unique_count}")

In [None]:
# Most frequent project names first; one project is included twice.
project_name_counts = df_investment_plan["project_name"].value_counts()
project_name_counts.head(5)

In [None]:
# Count project_name entries per PPNO to spot PPNOs that appear on more than
# one row. Rows whose PPNO is 0 or "None" are excluded from the check.
has_ppno = ~df_investment_plan["ppno"].isin([0, "None"])
rows_per_ppno = (
    df_investment_plan[has_ppno].groupby("ppno").agg({"project_name": "count"})
)
duplicate_ppno = rows_per_ppno.sort_values("project_name").reset_index()

In [None]:
# Keep only the PPNOs that appear on more than one row (count > 1).
appears_more_than_once = duplicate_ppno["project_name"] > 1
duplicate_ppno = duplicate_ppno.loc[appears_more_than_once]

In [None]:
# Plain Python list of the repeated PPNOs, used for filtering the dataframe.
duplicate_ppno_list = duplicate_ppno["ppno"].tolist()

In [None]:
# How many rows in the dataframe carry one of the repeated PPNOs?
# Are these true duplicates?
df_investment_plan.loc[df_investment_plan["ppno"].isin(duplicate_ppno_list)].shape[0]

In [None]:
# Pull the rows with a repeated PPNO and show the columns needed to judge
# whether they are really the same project.
review_cols = ["ppno", "project_name", "current_phase", "ea", "project_description"]
has_repeated_ppno = df_investment_plan["ppno"].isin(duplicate_ppno_list)
duplicate_projects_df = df_investment_plan.loc[has_repeated_ppno]
duplicate_projects_df.loc[:, review_cols]

### Some Analysis
#### Are projects in rural or urban areas? 
* What defines rural versus urban?

In [None]:
# Tabulate the urban_rural column with the shared helper, then give the
# resulting columns display-friendly names.
urban_rural_counts = _utils.value_counts_df(df_investment_plan, "urban_rural")
display_names = {"urban_rural": "Total Projects", "index": "Urban/Rural"}
rural_urban_projects = urban_rural_counts.rename(columns=display_names)

In [None]:
# Bar chart of project counts by urban/rural category.
# The same "Urban/Rural" string is passed as both the 2nd and 4th argument.
urban_rural_label = "Urban/Rural"
_utils.basic_bar_chart(
    rural_urban_projects, urban_rural_label, "Total Projects", urban_rural_label
)

#### Most Common Lead Agencies.

In [None]:
# Ten lead agencies with the most projects, skipping rows whose lead agency
# is "none", rendered with an in-cell bar and centered text.
known_agency = df_investment_plan["lead_agency"] != "none"
projects_per_agency = (
    df_investment_plan[known_agency]
    .groupby(["lead_agency"])
    .agg({"project_name": "count"})
    .sort_values("project_name", ascending=False)
    .head(10)
)
centered_headers = [{"selector": "th", "props": [("text-align", "center")]}]
(
    projects_per_agency.style.bar(subset=["project_name"], color="#8CBCCB")
    .set_properties(**{"background-color": "white"})
    .set_table_styles(centered_headers)
    .set_properties(**{"text-align": "center"})
)

#### Primary Mode

In [None]:
# Ten most common primary modes by project count, rendered with an in-cell
# bar and centered text.
primary_mode_counts = _utils.value_counts_df(df_investment_plan, "primary_mode")
top_primary_modes = (
    primary_mode_counts.rename(
        columns={"primary_mode": "Total Projects", "index": "Primary Mode"}
    )
    .sort_values("Total Projects", ascending=False)
    .head(10)
)
centered_headers = [{"selector": "th", "props": [("text-align", "center")]}]
(
    top_primary_modes.style.bar(subset=["Total Projects"], color="#8CBCCB")
    .set_properties(**{"background-color": "white"})
    .set_table_styles(centered_headers)
    .set_properties(**{"text-align": "center"})
)

#### Costs

In [None]:
# Sum the total project cost for each primary mode, largest total first.
cost_col = "total_project_cost__$1,000_"
cost_by_mode = df_investment_plan.groupby("primary_mode").agg({cost_col: "sum"})
cost_by_mode.sort_values(cost_col, ascending=False)