## Add TIRCP

In [None]:
import _utils
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

In [None]:
# Notebook display settings: show up to 200 columns, and do not truncate
# the number of rows or the width of cell text.
pd.set_option("display.max_columns", 200)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# import fuzzywuzzy
# from fuzzywuzzy import process

### ATP - 10 Year Non SHOPP Prep

In [None]:
# Load the 10 Year non SHOPP workbook from the project's GCS folder and pass it
# through to_snakecase (presumably normalizes the column names; the rest of this
# notebook refers to snake_case columns).
atp_shopp = pd.read_excel(
    f"{_utils.GCS_FILE_PATH}cleaned_data_with_fake_metrics_plus_atp.xlsx"
).pipe(to_snakecase)

In [None]:
# list(atp_shopp.columns)

In [None]:
# Subset: columns of the non SHOPP data that have a TIRCP counterpart.
# Each name appears exactly once; a repeated name would be selected twice and
# produce a duplicated column in the resulting DataFrame.
atp_shopp_subset = [
    "ppno",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "notes",
    "shs_capacity_increase_detail",
]

In [None]:
# Working DataFrame for testing, restricted to the subset columns.
# .copy() makes it independent of atp_shopp, so the column assignments made on
# atp_shopp2 further down do not act on a slice of the original frame
# (pandas SettingWithCopyWarning).
atp_shopp2 = atp_shopp[atp_shopp_subset].copy()

In [None]:
# atp_shopp2.sample()

In [None]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    """Standardize the agency/organization names held in one column.

    Each value is trimmed, cut off at the first comma, stripped of every "/"
    character, cut off at the first "(", title-cased, corrected for the
    "Trasit" misspelling, and trimmed again.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column_wanted: Name of the string column to clean.

    Returns:
        The same DataFrame object, with ``column_wanted`` overwritten.
    """
    df[column_wanted] = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        # Literal removal of slashes; regex=False keeps this a plain-text
        # replacement regardless of the pandas default for `regex`.
        # (A later split on "/" was dropped: with every slash already removed
        # it could never split anything.)
        .str.replace("/", "", regex=False)
        .str.split("(")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit", regex=False)
        .str.strip()  # strip again after getting rid of certain things
    )
    return df

In [None]:
# Normalize the previous Caltrans nominations text to lower case
atp_shopp2["previous_caltrans_nominations"] = atp_shopp2[
    "previous_caltrans_nominations"
].str.lower()

In [None]:
# Clean ATP: standardize the lead agency names with organization_cleaning
# (the function also modifies the passed DataFrame in place).
atp_shopp2 = organization_cleaning(atp_shopp2, "lead_agency")

In [None]:
# atp_shopp2.head(100)

### TIRCP Prep
* Filter out projects in which total cost equals total award 

In [None]:
# Load the TIRCP Tableau workbook (last updated November 2022) and pass it
# through to_snakecase.
tircp = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx"
).pipe(to_snakecase)

In [None]:
# tircp.columns

In [None]:
# Subset TIRCP with only the basic information.
# "total__cost" and "tircp" are the two amounts compared further down;
# "award_cycle" is only used to build the previous-nominations label.
tircp_subset = [
    "award_year",
    "grant_recipient",
    "title",
    "ppno",
    "district",
    "county",
    "description",
    "total__cost",
    "tircp",
    "award_cycle",
    "on_shs?",
    "comments_additional_contacts",
]

In [None]:
# Independent copy of the subset columns: new columns are assigned to tircp2
# right after this, which on a bare slice of tircp triggers pandas'
# SettingWithCopyWarning.
tircp2 = tircp[tircp_subset].copy()

In [None]:
# Create a column with cycle + tircp for previous CT nominations.
# Values have the form "TIRCP Cycle <award_cycle>".
tircp2["previous_caltrans_nominations"] = (
    "TIRCP" + " Cycle " + tircp2["award_cycle"].astype("str")
)

In [None]:
# Create column for unmet needs: total cost minus the TIRCP amount.
# The value is still in the source units here; it is divided by 1000 further down.
tircp2["total_unfunded_need__$1,000"] = tircp2["total__cost"] - tircp2["tircp"]

In [None]:
# Perhaps to narrow down projects
# Figure out which TIRCP projects' total cost are completely covered by TIRCP requested
# Ratio of the TIRCP amount to the total cost; a value of 1 means the two are equal.
tircp2["total_cost_vs_tircp_req"] = tircp2["tircp"] / tircp2["total__cost"]

In [None]:
# tircp2.total_cost_vs_tircp_req.value_counts()

In [None]:
# Keep only projects whose TIRCP amount differs from the total cost
# (ratio not equal to 1), then renumber the rows.
tircp2 = tircp2[tircp2["total_cost_vs_tircp_req"].ne(1)].reset_index(drop=True)

In [None]:
# Drop some columns: helper columns that are no longer needed once the
# unfunded need and the previous-nominations label have been derived.
tircp_cols_drop = ["tircp", "total_cost_vs_tircp_req", "award_cycle"]

In [None]:
# Remove the helper columns listed in tircp_cols_drop
tircp2 = tircp2.drop(columns=tircp_cols_drop)

In [None]:
tircp2.shape

#### Get current phase & amounts.

In [None]:
# Load the "clean_allocation" sheet of the cleaned TIRCP workbook and pass it
# through to_snakecase.
allocation = pd.read_excel(
    "gs://calitp-analytics-data/data-analyses/tircp/clean_tircp.xlsx",
    sheet_name="clean_allocation",
).pipe(to_snakecase)

In [None]:
# allocation.isna().sum()

In [None]:
# Subset - date is allocation date and has the least # of missing values in the date field
# Only the merge keys (award_year, ppno), the phase and that date are kept.
allocation2 = allocation[["award_year", "ppno", "phase", "date"]]

In [None]:
# Latest allocation date for every (award_year, ppno, phase) combination.
allocation3 = allocation2.groupby(
    ["award_year", "ppno", "phase"], as_index=False
)["date"].max()

In [None]:
# Only keep the most recent allocation date & phase.
# Rows are sorted by ppno, newest date first, and drop_duplicates("ppno") keeps
# the first row of each ppno.
# NOTE(review): de-duplication is on ppno alone, while the merge below keys on
# ppno AND award_year. If one ppno can appear under several award years, only
# one of them survives here - confirm this is intended.
allocation3 = allocation3.sort_values(
    ["ppno", "date"], ascending=[True, False]
).drop_duplicates("ppno")

In [None]:
# Drop date: it was only needed to pick the most recent phase.
allocation3 = allocation3.drop(columns=["date"])

In [None]:
# Merge the most recent phase onto the TIRCP projects.
# Left join: every TIRCP project is kept, with a missing phase where no
# allocation row matches on ppno + award_year.
tircp2 = pd.merge(
    tircp2,
    allocation3,
    how="left",
    on=["ppno", "award_year"],
)

In [None]:
len(tircp2)

#### Harmonize the way columns are named/formatted.
* TIRCP: `Total_cost` and `unmet needs` should be floats and divided by 1000. `District` should be int64. 

In [None]:
atp_shopp2.previous_caltrans_nominations.nunique(), len(atp_shopp2)

In [None]:
# Find projects whose previous Caltrans nominations mention TIRCP or DRMT.
# The CT nominations column has been cast to lower case, so the pattern is
# lower case too.
# na=False counts rows without nomination text as non-matches; without it a
# missing value leaves NaN in the mask and the boolean indexing raises.
atp_shopp_drmt = atp_shopp2[
    atp_shopp2["previous_caltrans_nominations"].str.contains("tircp|drmt", na=False)
].reset_index(drop=True)

In [None]:
# Check out that the filtering was correct.
# atp_shopp_drmt.previous_caltrans_nominations.value_counts()

In [None]:
# Natalie's function
def align_funding_numbers(df, list_of_cols):
    """Divide each listed funding column by 1000.

    Args:
        df: DataFrame whose columns are rescaled in place.
        list_of_cols: Names of the numeric columns to divide by 1000.

    Returns:
        The same DataFrame object that was passed in.
    """
    for funding_col in list_of_cols:
        df[funding_col] = df[funding_col].div(1000)
    return df

In [None]:
# Divide the two TIRCP amounts by 1000 so they line up with the "$1,000"
# column names they are stored or renamed under.
tircp2 = align_funding_numbers(
    tircp2,
    [
        "total_unfunded_need__$1,000",
        "total__cost",
    ],
)

In [None]:
# (tircp2[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [None]:
# Replace missing funding amounts with 0 in both amount columns
tircp2[["total_unfunded_need__$1,000", "total__cost",]] = tircp2[
    [
        "total_unfunded_need__$1,000",
        "total__cost",
    ]
].fillna(0)

In [None]:
# Keep the original district text before "district" is reduced to a number
tircp2["district_full_name"] = tircp2["district"]

In [None]:
# Extract digit from district. Fill "Various" districts with "0" and change to int64.
# The raw string keeps "\d" from being read as an (invalid) string escape, and
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
tircp2["district"] = (
    tircp2["district"].str.extract(r"(\d+)", expand=False).fillna(0).astype("int64")
)

In [None]:
# Rename columns: map each TIRCP column name to the matching non SHOPP column
# name so the two frames can be concatenated.
tircp_new_cols = {
    "grant_recipient": "lead_agency",
    "title": "project_name",
    "county": "full_county_name",
    "description": "project_description",
    "total__cost": "total_project_cost__$1,000",
    "on_shs?": "shs_capacity_increase_detail",
    "comments_additional_contacts": "notes",
    "phase": "current_phase",
}

In [None]:
# atp_shopp_drmt.info(), tircp2.info()

In [None]:
# Apply the TIRCP -> non SHOPP column name mapping
tircp2 = tircp2.rename(columns=tircp_new_cols)

In [None]:
# Drop columns: award_year was only needed as a merge key for the allocation data.
tircp2 = tircp2.drop(columns=["award_year"])

In [None]:
# Add column for primary mode. Every TIRCP project gets the same constant label.
tircp2["primary_mode"] = "Transit/Zev/Rail (Passenger)"

In [None]:
# Maps the TIRCP district labels ("District N: ...") to "NN - City" labels.
# Keys must match the source text exactly (note "District 2:Redding" has no
# space after the colon).
district_replace = {
    "District 7: Los Angeles": "07 - Los Angeles",
    "District 4: Bay Area / Oakland": "04 - Oakland",
    "District 5: San Luis Obispo / Santa Barbara": "05 - San Luis Obispo",
    "District 12: Orange County": "12 - Irvine",
    "District 3: Marysville / Sacramento": "03 - Marysville",
    "District 11: San Diego": "11 - San Diego",
    "District 10: Stockton": "10 - Stockton",
    "District 6: Fresno / Bakersfield": "06 - Fresno",
    "District 8: San Bernardino / Riverside": "08 - San Bernardino",
    "District 2:Redding": "02 - Redding",
    "District 1: Eureka": "01 - Eureka",
}

In [None]:
# Replace district name to match the "NN - City" labels in district_replace.
# The mapping keys are the TIRCP "District N: ..." strings, which only survive
# in district_full_name: "district" itself was already reduced to an int64
# number, so running the replacement on it could never match a key.
tircp2["district_full_name"] = tircp2["district_full_name"].replace(district_replace)

### Find TIRCP projects that are already in the 10 year Non SHOPP. 

#### Test with Merges
* Zero merges across the board. 

In [None]:
# Make a copy of tircp
# tircp_test = tircp2.copy()

In [None]:
# Round numbers
# tircp_test["total_unfunded_need__$1,000"] = (tircp_test[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [None]:
# tircp_test["total_project_cost__$1,000"] = (tircp_test[["total_project_cost__$1,000"]]/1000).astype('int64')*1000

In [None]:
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')

In [None]:
# Merge on district and unfunded needs
# test1 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_unfunded_need__$1,000"], indicator = True)

In [None]:
# test1._merge.value_counts()

In [None]:
# Merge on county and unfunded needs
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "left", on=["full_county_name", "total_unfunded_need__$1,000"], indicator = True, suffixes = ["_shopp", "_tircp"])

In [None]:
# test2._merge.value_counts()

In [None]:
# test2.loc[test2._merge == "both"][["project_name_shopp","project_name_tircp", "full_county_name", "project_description_shopp","project_description_tircp"]]

In [None]:
# Merge on district and total costs
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_project_cost__$1,000"], indicator = True)

#### Observation: 
* Total Project Cost, project titles, lead agency, and districts are potentially listed differently across datasets.
* Inglewood Transit Connector Project is listed as $1,016,000.000 in TIRCP but 1,666,466.0 in non SHOPP.
    * City Of Inglewood is the grant recipient in TIRCP but Caltrans is the lead agency in Non SHOPP.
* Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension (the TIRCP title) is listed as D6 in TIRCP but as D10 (Stanislaus) in non SHOPP.
    * Fresno Subdivision (Ceres To Turlock) Double Tracking is the non SHOPP title.

In [None]:
# TIRCP titles of the projects found by hand to already exist in the
# 10 Year non SHOPP data (see the observations above).
tircp_already_entered = [
    "Inglewood Transit Connector Project",
    "Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension",
]

In [None]:
# Show district and description of those projects as recorded in the raw TIRCP data
tircp.loc[tircp.title.isin(tircp_already_entered)][['district','description']]

In [None]:
# Non SHOPP projects whose previous nominations mention TIRCP.
# na=False treats missing nomination text as "no match" so the mask stays boolean.
atp_shopp_drmt[
    atp_shopp_drmt["previous_caltrans_nominations"].str.contains("tircp", na=False)
][
    [
        "project_name",
        "district",
        "project_description",
        "primary_mode",
        "previous_caltrans_nominations",
    ]
]

In [None]:
# Remove the TIRCP projects that already exist in non SHOPP.
# One project in non SHOPP mentions TIRCP but that project
# was ultimately not nominated.
tircp2 = tircp2.loc[
    ~tircp2["project_name"].isin(tircp_already_entered)
].reset_index(drop=True)

In [None]:
len(tircp2)

In [None]:
# atp_shopp_drmt[atp_shopp_drmt["previous_caltrans_nominations"].str.contains(("tircp"))].drop(columns = "notes")

#### Test with Project Names

In [None]:
# tircp2["project_test"] = tircp2["project_name"]

In [None]:
# atp_shopp_drmt["project_test"] = atp_shopp_drmt["project_name"]

In [None]:
# tircp2 = simplify_project_names(tircp2, "project_test")

In [None]:
# atp_shopp_drmt = simplify_project_names(atp_shopp_drmt, "project_test")

In [None]:
# atp_shopp_drmt[["project_name","project_test"]].sort_values(by = ["project_name"])

In [None]:
# Merge on project names
# test3 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["project_test"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [None]:
# test3._merge.value_counts()

In [None]:
# atp_shopp_drmt[["project_test"]].sort_values("project_test")

In [None]:
# tircp2[["project_test"]].sort_values("project_test")

#### Test 1 with Fuzzy Matching on Project Names
* Unsuccessful with project names. Only 2 matches and after looking at descriptions/districts manually, they aren't similar enough.

In [None]:
# Replace all rows in agency column with a min ratio with  "string_to_match value"
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    """Write ``string_to_match`` into ``new_col_name`` for fuzzily matching rows.

    The unique values of ``df[column]`` are scored against ``string_to_match``
    with fuzzywuzzy's token sort ratio. For every row whose ``column`` value is
    among the (at most 10) best matches scoring strictly above ``min_ratio``,
    ``new_col_name`` is set to ``string_to_match``.

    Args:
        df: DataFrame to search and update in place.
        column: Name of the column holding the strings to compare.
        new_col_name: Column that receives ``string_to_match`` on matching rows.
        string_to_match: The reference string.
        min_ratio: Score a candidate has to exceed to count as a match.

    Returns:
        None. ``df`` is modified in place.
    """
    # Imported here because the notebook-level fuzzywuzzy import is commented
    # out; without it the fuzzywuzzy name below is undefined when this runs.
    from fuzzywuzzy import fuzz, process

    # Get a list of unique strings
    candidates = df[column].unique()

    # Get the top 10 closest matches to our input string
    scored_matches = process.extract(
        string_to_match, candidates, limit=10, scorer=fuzz.token_sort_ratio
    )

    # Only get matches with a min ratio (each entry is indexed as value, score)
    close_matches = [match[0] for match in scored_matches if match[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [None]:
# Create a list of all the TIRCP names
# tircp_projects = tircp2.project_name.unique().tolist()

In [None]:
# atp_shopp_projects = atp_shopp_drmt.project_name.unique().tolist()

In [None]:
# for i in atp_shopp_projects:
#    replace_matches_in_column(
#        tircp2, "project_name", "project_name_fuzzy_match", i, 80
#    )

In [None]:
# tircp2[["project_description", "project_description","project_description_fuzzy_match"]]

In [None]:
# Merge on project names
# test4 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", left_on=["project_test"], right_on = ["project_name_fuzzy_match"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [None]:
# test4._merge.value_counts()

In [None]:
# test4.loc[test4._merge == "both"][["district_shopp","district_tircp","project_name_tircp","project_name_shopp", "project_description_shopp", "project_description_tircp"]]

### Concat & Clean
* After only finding 2 projects that are already entered in, concat TIRCP information. 
    * Already filtered out those 2 projects in TIRCP.
    * Also filtered out any projects where TIRCP amount equals Total Project Cost, since presumably they don't need any more $.
* Concat with original ATP SHOPP.

In [None]:
# Columns of the non SHOPP frame to remove before concatenating.
# 'detailed_project_title' is rebuilt for all rows after the concat.
atp_drop = ["unnamed:_0", "update_complete",  'merge',
 'detailed_project_title',]

In [None]:
# Remove the columns listed in atp_drop from the full non SHOPP frame
atp_shopp = atp_shopp.drop(columns=atp_drop)

In [None]:
# Stack the full non SHOPP data and the prepared TIRCP projects.
# Each frame keeps its own row labels (no ignore_index), and columns present in
# only one frame are filled with missing values for the other frame's rows.
concat1 = pd.concat([atp_shopp, tircp2])

In [None]:
# Fill NA based on dtypes
# dtypes.replace(...) turns the per-column dtype Series into per-column fill
# values (0.0 for float64 columns, the string "None" for object columns), and
# fillna applies that Series column by column.
# NOTE(review): columns of any other dtype keep their dtype object as the
# "fill value" - presumably harmless if those columns have no missing values; confirm.
concat1 = concat1.fillna(concat1.dtypes.replace({"float64": 0.0, "object": "None"}))

In [None]:
# concat1.info(verbose=True)

In [None]:
# concat1.tail(1)

#### Add Detailed Project Title (District + Project Name)

In [None]:
# Build a title of the form "District <district>-<project_name>" for every row
concat1['detailed_project_title'] =  ('District '+ concat1.district.astype('str') + '-' + concat1.project_name)

#### Clean Up
* Best to look at full district & county names
* Replace county "None" with "full county name" values. 

In [None]:
# Separate copy for the Smartsheet export so concat1 stays unchanged for the Tableau steps
smartsheet = concat1.copy()

In [None]:
# Project helper from _utils; presumably converts the snake_case column names to
# display names (the next step refers to 'County' and 'Full County Name') - confirm.
smartsheet = _utils.clean_up_columns(smartsheet)

In [None]:
# Where County holds the filler string "None", use Full County Name instead
smartsheet['County'] = np.where(smartsheet['County'] == "None", smartsheet['Full County Name'], smartsheet['County'])

In [None]:
smartsheet.sample()

In [None]:
# Write the combined ATP + TIRCP data to the project's GCS folder
smartsheet.to_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")

#### FOR TABLEAU  - Add Fake Values

In [None]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
# Fixed seed so the fake metric values are the same on every run
np.random.seed(365)

In [None]:
# Names of the placeholder metric columns that get random values below
# and are summed into the fake benefit score.
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_transportation",
]

In [None]:
# Add fake metric columns: one random value between 0 and 50, rounded to
# 2 decimals, for every row.
for metric in fake_columns:
    concat1[metric] = np.random.uniform(0.0, 50.0, size=len(concat1)).round(2)

In [None]:
# list(concat1.columns)

#### Create fake benefit score and ranks.

In [None]:
# Create the fake benefit score based off of Virginia DOT.
# Numerator: per-row sum of the fake metric columns.
# NOTE(review): the method chain (.apply ... .fillna(0).sum()) binds to the
# denominator only, so every row is divided by ONE scalar - the total of
# total_unfunded_need over all rows - and the score is just the metric sum
# rescaled. If a per-project ratio (metrics / that project's unfunded need) was
# intended, the trailing .sum() should be removed - confirm before changing.
concat1["fake_benefit_score"] = (
    (concat1[fake_columns].sum(axis=1))
    / (concat1["total_unfunded_need__$1,000"])
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .sum()
) * 1000000

In [None]:
# Create a fake statewide project rank: highest benefit score gets rank 1.
# Uses rank()'s default tie handling, so tied scores share an averaged rank.
concat1["statewide_rank"] = concat1["fake_benefit_score"].rank(ascending=False)

In [None]:
# Create fake project rank by district: ranks restart within each district,
# highest score first. method="dense" gives tied scores the same rank with no
# gaps afterwards.
concat1["district_rank"] = concat1.groupby("district")["fake_benefit_score"].rank(
    method="dense", ascending=False
)

In [None]:
# Assign percentile of project among all projects in the state.
# Project helper from _utils: reads "statewide_rank" and, judging by the
# arguments, writes its result to the column named by the third argument -
# confirm against _utils.project_size_rating.
concat1 = _utils.project_size_rating(
    concat1, "statewide_rank", "fake_benefit_score_statewide_percentile"
)

In [None]:
# Make sure this makes sense
# concat1[['district_rank','statewide_rank','fake_benefit_score','fake_benefit_score_statewide_percentile']].sort_values('fake_benefit_score', ascending= False)

In [None]:
# D4 only projects
len(concat1.loc[concat1["district"] == 4])

In [None]:
# list(concat1.columns)

#### Add median across districts
Suggestion from Nick to add medians across the district. 
* Project cost
* unfunded needs
* benefit score

In [None]:
# Grab medians: one row per district with the median of each funding/score column.
# reset_index() turns the "district" group key back into a regular column.
summary_district_state = (
    concat1.groupby(["district"])
    .agg(
        {
            "total_unfunded_need__$1,000": "median",
            "fake_benefit_score": "median",
            "total_project_cost__$1,000": "median",
            "csis_total_score__out_of_45": "median",
            "atp_total_score__out_of_100": "median",
        }
    )
    .reset_index()
)

In [None]:
# Add suffixes so the median columns do not clash with the project-level columns.
# The suffix is applied to every column, so the key column "district" becomes
# "district_district_median_" as well.
summary_district_state = summary_district_state.add_suffix("_district_median_")

In [None]:
# Attach the district medians to every project row.
# The right-hand key carries the suffix added above, hence left_on/right_on.
concat1 = pd.merge(
    concat1,
    summary_district_state,
    left_on="district",
    right_on="district_district_median_",
    how="left",
)

In [None]:
# Same _utils helper as used for the Smartsheet export; presumably tidies the
# column names for display - confirm against _utils.clean_up_columns.
concat1 = _utils.clean_up_columns(concat1)

In [None]:
# Write the Tableau input (with fake metrics, ranks and district medians) to GCS
concat1.to_excel(f"{_utils.GCS_FILE_PATH}tableau.xlsx")