# 10 Year Non-SHOPP Data
* Generating fake data.
* Cleaning the current data.
* Initial exploration.
* [Using Smart Sheet.](https://app.smartsheet.com/workspaces/8MgpHcXR4GJVM5GvWWMmQg7M8gqhJj88Gfh54Pr1)

In [1]:
import numpy as np
import pandas as pd
from calitp import *

In [2]:
import _utils
# Base GCS folder for this analysis. File names are appended to it directly
# in the f-strings below, so the trailing slash must stay.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/project_prioritization/"



In [3]:
# Notebook display settings: show up to 150 columns, and put no limit on the
# number of rows or on the width of a cell's contents.
pd.set_option("display.max_columns", 150)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Opening the File
<b> Google Sheets/ Smart Sheet </b> 
* Connecting to Smartsheet directly requires a premium account/subscription to Smartsheet to generate the API key. 
* Can export a Smartsheet to Google Sheets, but have to do it manually each time and move it from my Cal-ITP account to the Data Services Team drive. 
    * This is just as time consuming as downloading the sheet in Excel and uploading it to GCS.
    * The CSV file exported from Google Sheets looks fine when downloaded, but it doesn't read into pandas properly.
* Easiest is just to read the downloaded Excel sheet from GCS.

In [4]:
# Open sheet with Google Sheets.
# These identifiers are only referenced by the commented-out CSV URL in the
# next cell; the data is actually loaded from the Excel file on GCS.
sheet_id = "1O0rLyt96El6RQkVu5SYl0CwWqQRysryQn6_xANUYFkw"
sheet_name = "main"

In [5]:
# url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

In [6]:
# csv = pd.read_csv(url, sep='\t')

In [7]:
# Load the first worksheet of the workbook stored on GCS, then hand the frame
# to to_snakecase (presumably snake-cases the headers, judging by column names
# such as `total_project_cost__$1,000_` used later on).
df_investment_plan = pd.read_excel(
    f"{GCS_FILE_PATH}10 Year Non-SHOPP Investment Plan Project List.xlsx"
).pipe(to_snakecase)



### Cleaning
#### Preview

In [8]:
df_investment_plan.shape

(765, 90)

In [9]:
# df_investment_plan[["_10_year_plan"]].sample(30)

In [10]:
# Remove the two columns that are already known to be unneeded.
df_investment_plan = df_investment_plan.drop(
    ["modified", "modified_by"], axis="columns"
)

In [11]:
# Share of missing values in each column, expressed as a percentage.
df_investment_plan.isna().sum().div(len(df_investment_plan)).mul(100)

rail_project_id                                      73.856209
_10_year_plan                                        99.738562
ct_project_id                                        56.209150
update_complete                                      38.169935
ea                                                   54.248366
ppno                                                 68.627451
project_name                                          0.000000
district                                              0.000000
county                                                0.000000
route                                                 1.568627
project_description                                   2.483660
current_phase                                         0.000000
con_existing_source_s__of_funds                      55.816993
con_anticipated_source_of_funds                      58.169935
target_opening_year                                  54.248366
beg_pm                                               45

In [12]:
# df_investment_plan.info()

#### Fix monetary & date values 

In [13]:
# df_investment_plan["total_project_cost__$1,000_"].unique().tolist()

In [14]:
# Some cost cells hold free text (phase breakdowns, placeholders) instead of a
# number. Each known case is mapped by hand to a single figure; multi-phase
# entries use the stated total or the sum of the phases, placeholders use 0.
_cost_text_to_value = {
    "Phase 1 (SR-60 to Mission Blvd) - $174,000\nPhase 2 (Mission Blvd to I-10) - $170,000": 344000,
    "$85,738 (Cycle 2)": 85738,
    "Oxnard to Camarillo 2nd Main Track: $15,000\nOxnard Station North Platform: $20,000": 35000,
    "$214,000\nPhase 1: $82,000\nPhase 2: $132,000": 214000,
    "$-": 0,
    "TBD": 0,
    "Info not currently available": 0,
}
df_investment_plan["total_project_cost__$1,000_"] = df_investment_plan[
    "total_project_cost__$1,000_"
].replace(_cost_text_to_value)

In [15]:
# Pick out, by name, every column that should end up numeric: anything whose
# name mentions a score, a cost or a need.
# https://stackoverflow.com/questions/21285380/find-column-whose-name-contains-a-specific-string
cost_score_cols = [
    col
    for col in df_investment_plan.columns
    if "score" in col or "cost" in col or "need" in col
]

In [16]:
# Convert each cost/score column to numbers with one vectorized call per
# column instead of invoking pd.to_numeric once per cell.
# errors="coerce" turns anything unparseable (free text, blanks) into NaN,
# which is then recorded as 0.
for c in cost_score_cols:
    df_investment_plan[c] = pd.to_numeric(
        df_investment_plan[c], errors="coerce"
    ).fillna(0)

In [17]:
# df_investment_plan.info()

In [18]:
# Collect every column whose name contains "date". The substring test also
# matches `update_complete` ("up-date"), which the next cell takes back out.
date_cols = [name for name in df_investment_plan.columns if "date" in name]

In [19]:
# Leave `update_complete` out of the date parsing: it is only in the list
# because "update" contains the substring "date".
# A filter is used rather than list.remove() so that re-running this cell, or
# running it when the name is absent, does not raise ValueError.
date_cols = [col for col in date_cols if col != "update_complete"]

In [20]:
# Change to the correct format.
# Series.apply hands the values to pd.to_datetime one at a time, so each cell
# is parsed on its own; errors="coerce" turns unparseable cells into NaT.
for c in date_cols:
    df_investment_plan[c] = df_investment_plan[c].apply(pd.to_datetime, errors="coerce")

In [21]:
# Fill NA based on dtypes.
# `dtypes` is a Series indexed by column name. Replacing its entries turns it
# into a per-column fill mapping: float64 columns are filled with 0.0 and
# object columns with the string "None".
# NOTE(review): entries for any other dtype are not replaced and stay as the
# dtype object itself — confirm those columns have nothing to fill.
df_investment_plan = df_investment_plan.fillna(
    df_investment_plan.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [22]:
# The dtype-based fill did not work for `route`, so its gaps are filled
# explicitly with the same "None" placeholder.
df_investment_plan = df_investment_plan.assign(
    route=df_investment_plan["route"].fillna("None")
)

#### Neaten up String Columns

In [23]:
# df_investment_plan["current_phase"].value_counts()

In [24]:
# A few current-phase labels were read in with embedded line breaks; map each
# of them to a single slash-separated label.
_phase_label_fixes = {
    "REMOVE/END\nSHELF": "remove/end/shelf",
    "INACTIVE\nON HOLD": "inactive/onhold",
    "INACTIVE\nPS&E/ROW": "inactive/ps&e/row",
}
df_investment_plan["current_phase"] = df_investment_plan["current_phase"].replace(
    _phase_label_fixes
)

In [25]:
# String columns to title-case. Each column is listed exactly once; repeating
# a name would only redo the same transformation.
# NOTE(review): the .str accessor yields NaN for cells that are not strings —
# confirm `route` holds only strings at this point.
titlecase_cols = [
    "route",
    "potential_funding_program_s_",
    "lead_agency",
    "primary_mode",
    "project_name",
    "urban_rural",
]

# Title-case first, then trim surrounding whitespace.
for c in titlecase_cols:
    df_investment_plan[c] = df_investment_plan[c].str.title().str.strip()

In [26]:
# Counties and phases are kept fully upper-cased, with surrounding whitespace
# trimmed afterwards.
uppercase_cols = ["county", "current_phase"]
df_investment_plan[uppercase_cols] = df_investment_plan[uppercase_cols].apply(
    lambda column: column.str.upper().str.strip()
)

#### Correct Projects that are in the wrong district
* Maybe need a more robust way to match the county to the district automatically.
* https://cwwp2.dot.ca.gov/documentation/district-map-county-chart.htm

In [27]:
# df_investment_plan[['project_name','county','district']]

In [28]:
# Move the POLB Terminal Island project to district 7.
# NOTE(review): the value is written as the string "7"; the district column is
# cast to int64 later in the notebook.
df_investment_plan.loc[
    df_investment_plan["project_name"].eq(
        "Polb Terminal Island Wye Track Realignment"
    ),
    "district",
] = "7"

In [29]:
# These three projects are all in Kern County, which is district 6,
# but they are tagged as district 9. Re-tag them in a single assignment.
df_investment_plan.loc[
    df_investment_plan["project_name"].isin(
        [
            "Sr 58 Truck Climbing Lanes Segment 2",
            "Mojave To Boron Freeway",
            "Sr 58/California City Blvd. Extension",
        ]
    ),
    "district",
] = "6"

#### Ensure Lead Agencies are not duplicated
* Does Metro correspond with LA Metro or Metrolink?
* Additionally, some agency names are spelled out while others are referred to as acronyms. 
* Maybe this should be standardized: either all acronyms or all spelled out. 


In [30]:
# Lead-agency acronyms (as they appear after title-casing) mapped to the
# spelled-out agency names.
agency_dict = {
    "Drmt": "Caltrans",
    "Mtc": "Mtc - Bay Area Toll Authority",
    "Pola": "Port of Los Angeles",
    "Polb": "Port of Long Beach",
    "Rctc": "Riverside County Transportation Commission",
    "Vctc": "Ventura County Transportation Commission",
    "Vta": "Santa Clara Valley Transportation Authority (Vta)",
}

In [31]:
# Swap each abbreviated lead agency for its spelled-out name; values that are
# not keys of agency_dict are left as they are.
df_investment_plan = df_investment_plan.assign(
    lead_agency=df_investment_plan["lead_agency"].replace(agency_dict)
)

In [32]:
# df_investment_plan.lead_agency.sort_values().unique().tolist()

In [33]:
# df_investment_plan.lead_agency.value_counts()

#### Add Full County Names 
* Currently, the counties are only referred to as ALA or KER.
* Change this to be Alameda or Kern.

In [34]:
# County lookup workbook: abbreviations alongside full county names.
# Its own `district` column is discarded and `county` is renamed so it does
# not collide with the `county` column of the project list.
# https://cwwp2.dot.ca.gov/documentation/district-map-county-chart.htm
df_county = (
    pd.read_excel(f"{GCS_FILE_PATH}full_counties.xlsx")
    .pipe(to_snakecase)
    .drop(columns=["district"])
    .rename(columns={"county": "full_county_name"})
)

In [35]:
# Attach the full county name to every project. A left join keeps all
# projects, and indicator=True adds a `_merge` column showing which rows found
# no matching abbreviation.
df_investment_plan = df_investment_plan.merge(
    df_county,
    how="left",
    left_on="county",
    right_on="abbrev_",
    indicator=True,
)

In [36]:
# Check left only merges to make sure these are multiple/various county projects
# df_investment_plan.loc[df_investment_plan._merge == "left_only"][['county']]

In [37]:
# Where the county itself is the "NONE" placeholder, carry that placeholder
# over to the full county name as "None".
df_investment_plan.loc[
    df_investment_plan["county"].eq("NONE"), "full_county_name"
] = "None"

In [38]:
# Projects still without a full county name are the ones that cross several
# counties; label them "Various".
df_investment_plan = df_investment_plan.assign(
    full_county_name=df_investment_plan["full_county_name"].fillna("Various")
)

In [39]:
# First check
# df_investment_plan[["county","abbrev_","full_county_name"]].drop_duplicates().sort_values('abbrev_')

In [40]:
# Second check
# df_investment_plan[['county','full_county_name']].loc[df_investment_plan['full_county_name'] == 'Various']

#### Create new project titles: District + Title
* Roy's request: he wants the dropdown menu to list projects by district first, in ascending order.

In [41]:
# Make every district value an integer (some were assigned as strings above).
df_investment_plan["district"] = df_investment_plan["district"].astype("int64")

In [42]:
# Order the projects by district, then renumber the index from 0.
df_investment_plan = df_investment_plan.sort_values("district")
df_investment_plan = df_investment_plan.reset_index(drop=True)

In [43]:
# Build the dropdown label "District <n>-<project name>".
df_investment_plan["detailed_project_title"] = (
    "District " + df_investment_plan["district"].astype("str")
).str.cat(df_investment_plan["project_name"], sep="-")

In [44]:
# df_investment_plan["detailed_project_title"].sample(4).tolist()

In [45]:
# df_investment_plan[["detailed_project_title"]]

#### Rename Districts to mimic portfolio

In [46]:
# Official Caltrans district labels keyed by district number, listed in
# ascending order. District 0 is mapped to the "None" placeholder.
district_dictionary = {
    0: "None",
    1: "01 - Eureka",
    2: "02 - Redding",
    3: "03 - Marysville",
    4: "04 - Oakland",
    5: "05 - San Luis Obispo",
    6: "06 - Fresno",
    7: "07 - Los Angeles",
    8: "08 - San Bernardino",
    9: "09 - Bishop",
    10: "10 - Stockton",
    11: "11 - San Diego",
    12: "12 - Irvine",
    74: "74 - HQ",
    75: "75 - HQ",
}

In [47]:
# New column holding the full district label; district numbers that are not
# keys of district_dictionary are left as the number.
df_investment_plan["district_full_name"] = df_investment_plan["district"].replace(
    to_replace=district_dictionary
)

In [48]:
# Save the cleaned frame back to GCS.
# NOTE(review): this writes `df_investment_plan` as it stands here. The fake
# metric columns are only added to `df_investment_plan2` further down, so the
# file and sheet names may not describe the contents — confirm.
df_investment_plan.to_excel(
    f"{GCS_FILE_PATH}cleaned_data_with_fake_metrics.xlsx",
    sheet_name="fake",
    index=False,
)

### Investigate: Duplication
* The number of rows corresponds exactly to the number of unique project names.

In [None]:
# Columns shown when only the pertinent project details need previewing.
preview_cols = [
    "district",
    "ppno",
    "project_name",
    "current_phase",
    "ct_project_id",
    "ea",
    "project_description",
    "total_project_cost__$1,000_",
]

In [None]:
# Compare the number of distinct values of each identifier with the row count
# to see which one, if any, uniquely identifies a project.
for id_col in ("ppno", "ct_project_id", "project_name"):
    print(f"{id_col} : {df_investment_plan[id_col].nunique()}")
f"The dataframe contains {len(df_investment_plan)} rows."

In [None]:
# Fraction of rows whose PPNO is the "None" placeholder.
len(df_investment_plan[df_investment_plan["ppno"] == "None"]) / len(df_investment_plan)

In [None]:
# Fraction of rows whose Caltrans project ID is the "None" placeholder.
len(df_investment_plan[df_investment_plan["ct_project_id"] == "None"]) / len(df_investment_plan)

#### PPNO Investigation
* Some projects are 1:1 matches such as South Fresno State Route 99 Corridor Project and South Fresno I/C Project
* Others are not as clear, such as "Sr 233 Chowchilla Interchange Improvement\nMad 99/233 Chowchilla Interchange Improvement" and "Chowchilla 99/233 I/C".

In [None]:
# Count the rows that share each PPNO, leaving out placeholder values
# (0, "None", "TBD"). The `project_name` column of the result holds that count.
duplicate_ppno = (
    df_investment_plan.loc[~df_investment_plan["ppno"].isin([0, "None", "TBD"])]
    .groupby("ppno")
    .agg(project_name=("project_name", "count"))
    .sort_values(by="project_name")
    .reset_index()
)

In [None]:
# Keep only the PPNOs that appear on more than one row
# (`project_name` is the per-PPNO row count here).
duplicate_ppno = duplicate_ppno.loc[duplicate_ppno["project_name"].gt(1)]

In [None]:
# Plain list of the repeated PPNOs, used to filter the main frame.
duplicate_ppno_list = duplicate_ppno["ppno"].tolist()

In [None]:
# All project rows whose PPNO is one of the repeated ones.
duplicate_ppno_df = df_investment_plan.loc[
    df_investment_plan["ppno"].isin(duplicate_ppno_list)
]

In [None]:
# Number of distinct project names among the rows with a repeated PPNO.
duplicate_ppno_df["project_name"].nunique()

In [None]:
# Preview the repeated-PPNO rows, ordered by district.
duplicate_ppno_df.loc[:, preview_cols].sort_values(by="district")

#### Project ID Investigation
* Overlaps with PPNO.

In [None]:
# Count the rows that share each Caltrans project ID, leaving out placeholder
# values (0, "None", "TBD"). The `project_name` column holds that count.
duplicate_project_ids = (
    df_investment_plan.loc[
        ~df_investment_plan["ct_project_id"].isin([0, "None", "TBD"])
    ]
    .groupby("ct_project_id")
    .agg(project_name=("project_name", "count"))
    .sort_values(by="project_name")
    .reset_index()
)

In [None]:
# Keep only the project IDs that appear on more than one row
# (`project_name` is the per-ID row count here). The filtered result goes into
# `duplicate_projects_ids`; `duplicate_project_ids` stays unfiltered.
duplicate_projects_ids = duplicate_project_ids.loc[
    duplicate_project_ids["project_name"].gt(1)
]

In [None]:
# Plain list of the repeated project IDs, used to filter the main frame.
duplicate_project_id_list = duplicate_projects_ids["ct_project_id"].tolist()

In [None]:
# All project rows whose Caltrans project ID is one of the repeated ones.
duplicate_project_id_df = df_investment_plan.loc[
    df_investment_plan["ct_project_id"].isin(duplicate_project_id_list)
]

In [None]:
# Preview the repeated-project-ID rows, ordered by district.
duplicate_project_id_df.loc[:, preview_cols].sort_values(by="district")

In [None]:
# Project names flagged through a repeated project ID and through a repeated
# PPNO, as sets so the two groups can be compared.
project_id_set = set(duplicate_project_id_df["project_name"].unique())
ppno_set = set(duplicate_ppno_df["project_name"].unique())

In [None]:
# Names flagged by project ID but not by PPNO.
project_id_set.difference(ppno_set)

In [None]:
# Names flagged by PPNO but not by project ID.
ppno_set.difference(project_id_set)

### DO NOT RUN PAST HERE - Add Fake Values

#### Create fake metrics

In [None]:
# Work on an independent copy so the cleaned frame stays free of the fake
# metric columns added below.
df_investment_plan2 = df_investment_plan.copy(deep=True)

In [None]:
# Fix the seed of NumPy's global random generator so the fake metrics come out
# the same on every run.
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(seed=365)

In [None]:
# Names of the made-up metric columns. Each gets random values below, and
# together they are summed into the fake benefit score.
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_transportation",
]

In [None]:
# Fill each fake metric with one uniform draw per project from [0, 50),
# rounded to two decimals. A 1-D draw of the same length consumes the random
# stream exactly like an (n, 1) draw would.
for metric in fake_columns:
    df_investment_plan2[metric] = np.random.uniform(
        0.0, 50.0, size=len(df_investment_plan2)
    ).round(2)

In [None]:
# df_investment_plan2.sample(1)

#### Create fake benefit score and ranks.

In [None]:
# Fake benefit score, based off of Virginia DOT: the row-wise sum of the fake
# metrics divided by the unfunded need, scaled by 1,000,000.
# NOTE(review): the trailing .sum() makes the denominator the total unfunded
# need across ALL projects (one scalar), not each project's own need — confirm
# this is intended.
df_investment_plan2["fake_benefit_score"] = (
    df_investment_plan2[fake_columns].sum(axis=1)
    / df_investment_plan2["total_unfunded_need__$1,000_"]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .sum()
    * 1000000
)

In [None]:
# Fake statewide rank: the highest benefit score gets rank 1. Tied scores
# share the average of their positions.
df_investment_plan2["statewide_rank"] = df_investment_plan2[
    "fake_benefit_score"
].rank(method="average", ascending=False)

In [None]:
# Fake rank within each district: highest score first, with dense ranking so
# tied scores share a rank and no rank numbers are skipped.
df_investment_plan2["district_rank"] = (
    df_investment_plan2.groupby("district")["fake_benefit_score"]
    .rank(ascending=False, method="dense")
)

In [None]:
# Assign each project's percentile among all projects in the state.
# NOTE(review): the rating logic lives in _utils.project_size_rating, which is
# not visible here; presumably it reads `statewide_rank` and writes the column
# named by the last argument — confirm.
df_investment_plan2 = _utils.project_size_rating(
    df_investment_plan2,
    "statewide_rank",
    "fake_benefit_score_statewide_percentile",
)

In [None]:
# Make sure this makes sense
# df_investment_plan2[['district_rank','statewide_rank','fake_benefit_score','fake_benefit_score_statewide_percentile']].sort_values('fake_benefit_score', ascending= False)

##### Double test district rank is what I expect

In [None]:
# Number of projects in district 4 — the district rank should never exceed it.
len(df_investment_plan2[df_investment_plan2["district"].eq(4)])

In [None]:
# df_investment_plan2.loc[df_investment_plan2['district'] == 4][['fake_benefit_score','district_rank']].sort_values('district_rank')

#### Add median across districts
Suggestion from Nick to add medians across the district. 
* Project cost
* unfunded needs
* benefit score

In [None]:
# Median of each cost / need / score column within every district.
summary_district_state = (
    df_investment_plan2.groupby(["district"])
    .agg(
        dict.fromkeys(
            [
                "total_project_cost__$1,000_",
                "fake_benefit_score",
                "total_unfunded_need__$1,000_",
                "csis_total_score__out_of_45_",
                "atp_total_score__out_of_100_",
            ],
            "median",
        )
    )
    .reset_index()
)

In [None]:
# Tag every summary column with the median suffix. This renames the key too:
# `district` becomes `district_district_median_`, which the merge relies on.
summary_district_state = summary_district_state.rename(
    columns=lambda name: f"{name}_district_median_"
)

In [None]:
# Attach the district medians to every project row. The suffixed key column
# `district_district_median_` is carried into the result alongside `district`.
df_investment_plan2 = df_investment_plan2.merge(
    summary_district_state,
    how="left",
    left_on="district",
    right_on="district_district_median_",
)

### Final checks before Saving

In [None]:
# Make sure the lengths are correct: the number of distinct project names is
# shown next to the row count. The two match when every row is a different
# project. (The distinct-name count is shown once rather than twice.)
df_investment_plan2.project_name.nunique(), len(df_investment_plan2)

In [None]:
# Columns to drop before saving: the two merge leftovers plus the yearly
# columns `_2023` through `_2033`.
unwanted_cols = ["abbrev_", "_merge"] + [f"_{year}" for year in range(2023, 2034)]

In [None]:
# Drop the unwanted columns; this raises if any of them is missing.
df_investment_plan2 = df_investment_plan2.drop(unwanted_cols, axis="columns")

In [None]:
# Final column tidy-up through the project helper.
# NOTE(review): _utils.clean_up_columns is not visible here — confirm what it renames.
df_investment_plan2 = df_investment_plan2.pipe(_utils.clean_up_columns)

In [None]:
# Count of missing values still present in each column.
df_investment_plan2.isna().sum()