## Add TIRCP

In [1]:
import _utils
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
pd.options.display.max_columns = 200
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# import fuzzywuzzy
# from fuzzywuzzy import process

### ATP - 10 Year Non SHOPP Prep

In [4]:
# Read in 10 Year non SHOPP
atp_shopp = to_snakecase(
    pd.read_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_with_fake_metrics_plus_atp.xlsx")
)

In [5]:
# list(atp_shopp.columns)

In [6]:
# Subset
atp_shopp_subset = [
    "ppno",
    "project_name",
    "lead_agency",
    "previous_caltrans_nominations",
    "full_county_name",
    "district",
    "project_description",
    "current_phase",
    "primary_mode",
    "urban_rural",
    "total_project_cost__$1,000",
    "total_unfunded_need__$1,000",
    "notes",
    "shs_capacity_increase_detail",
    "current_phase",
]

In [7]:
# Column for testing stuff
atp_shopp2 = atp_shopp[atp_shopp_subset]

In [8]:
# atp_shopp2.sample()

In [9]:
# Function to clean agency/organization names
def organization_cleaning(df, column_wanted: str):
    df[column_wanted] = (
        df[column_wanted]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.strip()  # strip again after getting rid of certain things
    )
    return df

In [10]:
# Lowercase previous caltrans nominations
atp_shopp2.previous_caltrans_nominations = (
    atp_shopp2.previous_caltrans_nominations.str.lower()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
# Clean ATP
atp_shopp2 = organization_cleaning(atp_shopp2, "lead_agency")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [12]:
# atp_shopp2.head(100)

### TIRCP Prep
* Filter out projects in which total cost equals total award 

In [13]:
# Read in TIRCP. Last updated November 2022.
tircp = to_snakecase(
    pd.read_excel(
        "gs://calitp-analytics-data/data-analyses/tircp/Tableau_Workbook.xlsx"
    )
)

In [14]:
# tircp.columns

In [15]:
# Subset TIRCP with only the basic information
tircp_subset = [
    "award_year",
    "grant_recipient",
    "title",
    "ppno",
    "district",
    "county",
    "description",
    "total__cost",
    "tircp",
    "award_cycle",
    "on_shs?",
    "comments_additional_contacts",
]

In [16]:
tircp2 = tircp[tircp_subset]

In [17]:
# Create a column with cycle + tircp for previous CT nominations
tircp2["previous_caltrans_nominations"] = (
    "TIRCP" + " Cycle " + tircp2["award_cycle"].astype("str")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [18]:
# Create column for unmet needs
tircp2["total_unfunded_need__$1,000"] = tircp2["total__cost"] - tircp2["tircp"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
# Perhaps to narrow down projects
# Figure out which TIRCP projects' total cost are completely covered by TIRCP requested
tircp2["total_cost_vs_tircp_req"] = tircp2["tircp"] / tircp2["total__cost"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
# tircp2.total_cost_vs_tircp_req.value_counts()

In [21]:
# Filter out projects in which total cost equals total award
tircp2 = (tircp2.loc[tircp2.total_cost_vs_tircp_req != 1]).reset_index(drop=True)

In [22]:
# Drop some columns
tircp_cols_drop = ["tircp", "total_cost_vs_tircp_req", "award_cycle"]

In [23]:
tircp2 = tircp2.drop(columns=tircp_cols_drop)

In [24]:
tircp2.shape

(93, 12)

#### Get current phase & amounts.

In [25]:
allocation = to_snakecase(
    pd.read_excel(
        f"gs://calitp-analytics-data/data-analyses/tircp/clean_tircp.xlsx",
        sheet_name="clean_allocation",
    )
)



In [26]:
# allocation.isna().sum()

In [27]:
# Subset - date is allocation date and has the least # of missing values in the date field
allocation2 = allocation[["award_year", "ppno", "phase", "date"]]

In [28]:
# Find the most recent allocation date & phase.
allocation3 = (
    allocation2.groupby(["award_year", "ppno", "phase"]).agg({"date": "max"})
).reset_index()

In [29]:
# Only keep the most recent allocation date & phase.
allocation3 = allocation3.sort_values(
    ["ppno", "date"], ascending=[True, False]
).drop_duplicates("ppno")

In [30]:
# Drop date
allocation3 = allocation3.drop(columns=["date"])

In [31]:
# Merge
tircp2 = pd.merge(
    tircp2,
    allocation3,
    how="left",
    on=["ppno", "award_year"],
)

In [32]:
len(tircp2)

93

#### Harmonize the way columns are named/formatted.
* TIRCP: `Total_cost` and `unmet needs` should be floats and divided by 1000. `District` should be int64. 

In [33]:
atp_shopp2.previous_caltrans_nominations.nunique(), len(atp_shopp2)

(242, 814)

In [34]:
# Find projects that have DRMT in previous caltrans nominations
# The CT nominations column has been casted to lower case. 
atp_shopp_drmt = (
    atp_shopp2[atp_shopp2["previous_caltrans_nominations"].str.contains(("tircp|drmt"))]
).reset_index(drop=True)

In [35]:
# Check out that the filtering was correct.
# atp_shopp_drmt.previous_caltrans_nominations.value_counts()

In [36]:
# Natalie's function
def align_funding_numbers(df, list_of_cols):
    for col in list_of_cols:
        df[col] = df[col] / 1000

    return df

In [37]:
tircp2 = align_funding_numbers(
    tircp2,
    [
        "total_unfunded_need__$1,000",
        "total__cost",
    ],
)

In [38]:
# (tircp2[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [39]:
tircp2[["total_unfunded_need__$1,000", "total__cost",]] = tircp2[
    [
        "total_unfunded_need__$1,000",
        "total__cost",
    ]
].fillna(0)

In [40]:
tircp2["district_full_name"] = tircp2["district"]

In [41]:
# Extract digit from district. Fill "Various" districts with "0" and change to int64
tircp2["district"] = tircp2["district"].str.extract("(\d+)").fillna(0).astype("int64")

In [42]:
# Rename columns
tircp_new_cols = {
    "grant_recipient": "lead_agency",
    "title": "project_name",
    "county": "full_county_name",
    "description": "project_description",
    "total__cost": "total_project_cost__$1,000",
    "on_shs?": "shs_capacity_increase_detail",
    "comments_additional_contacts": "notes",
    "phase": "current_phase",
}

In [43]:
# atp_shopp_drmt.info(), tircp2.info()

In [44]:
tircp2 = tircp2.rename(columns=tircp_new_cols)

In [45]:
# Drop columns
tircp2 = tircp2.drop(columns=["award_year"])

In [46]:
# Add column for primary mode.
tircp2["primary_mode"] = "Transit/Zev/Rail (Passenger)"

In [48]:
district_replace = {
    "District 7: Los Angeles": "07 - Los Angeles",
    "District 4: Bay Area / Oakland": "04 - Oakland",
    "District 5: San Luis Obispo / Santa Barbara": "05 - San Luis Obispo",
    "District 12: Orange County": "12 - Irvine",
    "District 3: Marysville / Sacramento": "03 - Marysville",
    "District 11: San Diego": "11 - San Diego",
    "District 10: Stockton": "10 - Stockton",
    "District 6: Fresno / Bakersfield": "06 - Fresno",
    "District 8: San Bernardino / Riverside": "08 - San Bernardino",
    "District 2:Redding": "02 - Redding",
    "District 1: Eureka": "01 - Eureka",
}

In [49]:
# Replace district name to match
tircp2.district = tircp2.district.replace(district_replace)

### Find TIRCP projects that are already in the 10 year Non SHOPP. 

#### Test with Merges
* Zero merges across the board. 

In [50]:
# Make a copy of tircp
# tircp_test = tircp2.copy()

In [51]:
# Round numbers
# tircp_test["total_unfunded_need__$1,000"] = (tircp_test[["total_unfunded_need__$1,000"]]/1000).astype('int64')*1000

In [52]:
# tircp_test["total_project_cost__$1,000"] = (tircp_test[["total_project_cost__$1,000"]]/1000).astype('int64')*1000

In [53]:
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')
# atp_shopp_drmt["total_project_cost__$1,000"] = ((atp_shopp_drmt[["total_project_cost__$1,000"]]/1000).astype('int64')*1000).astype('int64')

In [54]:
# Merge on district and unfunded needs
# test1 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_unfunded_need__$1,000"], indicator = True)

In [55]:
# test1._merge.value_counts()

In [56]:
# Merge on county and unfunded needs
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "left", on=["full_county_name", "total_unfunded_need__$1,000"], indicator = True, suffixes = ["_shopp", "_tircp"])

In [57]:
# test2._merge.value_counts()

In [58]:
# test2.loc[test2._merge == "both"][["project_name_shopp","project_name_tircp", "full_county_name", "project_description_shopp","project_description_tircp"]]

In [59]:
# Merge on district and total costs
# test2 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["district", "total_project_cost__$1,000"], indicator = True)

#### Observation: 
* Total Project Cost, project titles, lead agency, and districts are potentially listed differently across datasets.
* Inglewood Transit Connector Project	  is listed as $1,016,000.000 in TIRCP but 1,666,466.0 in non SHOPP.
    * City Of Inglewood is the grant recipient in TIRCP but Caltrans is the lead agency in Non SHOPP.
* Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension (the TIRCP title) is listed as D6 in TIRCP but D10 in Stanislaus.
    * Fresno Subdivision (Ceres To Turlock) Double Tracking is the non SHOPP title.

In [60]:
tircp_already_entered = [
    "Inglewood Transit Connector Project",
    "Valley Rail Expansion: Altamont Corridor Express (ACE) Ceres to Turlock Extension",
]

In [61]:
tircp.loc[tircp.title.isin(tircp_already_entered)][['district','description']]

Unnamed: 0,district,description
59,District 7: Los Angeles,"Construction of a 1.6-mile electrically powered automated people mover (APM) system and three new stations in the City of Inglewood. The project will create a new connection for passengers directly from the LA Metro Crenshaw/LAX Line’s Downtown Inglewood Station to new housing and employment centers, and regionally serving sports and entertainment including the Los Angeles Sports and Entertainment District (LASED) at Hollywood Park/SoFi Stadium and the proposed Inglewood Basketball and Entertainment Center (IBEC) Project. The project will connect the City of Inglewood’s high growth areas with LA Metro’s regional rail system."
91,District 6: Fresno / Bakersfield,"Extends ACE from Ceres to Turlock, which is an interim phase of the Ceres to Merced extension. Includes a new Turlock Station and layover track, and provides a direct connection with Turlock Transit"


In [62]:
# Projects with only TIRCP in the previous nomination.
atp_shopp_drmt[atp_shopp_drmt["previous_caltrans_nominations"].str.contains(("tircp"))][['project_name','district','project_description','primary_mode','previous_caltrans_nominations']]

Unnamed: 0,project_name,district,project_description,primary_mode,previous_caltrans_nominations
4,"Los Angeles Metro Light Rail Capital, Operational And Rehabilitation Enhancements (Core) Capacity & System Integration Project",7,"The project will eliminate a two-car capacity constraint on the integrated Crenshaw/LAX and Green (C) light rail lines, which operate within the congested I-405 and I-105 corridors and the area around Los Angeles International Airport (LAX), by:\n1) Extending the aerial platforms at four Green Line stations (Redondo Beach, Douglas, Mariposa, and Aviation/LAX)\n2) State of good repair work and station improvements at all of those four stations plus El Segundo Station\n3) Adding two new traction power substations (TPSS) on the Crenshaw/LAX Line in the cities of Inglewood and Los Angeles",Rail (Passenger),sccp cycle 3 priority 13 of 14\nnot awarded tircp
6,Inglewood Transit Connector (Itc) Project,7,"The Project consists of an approximately 1.6-mile fully elevated transit system in the City of Inglewood with three transit station connecting passengers from the Metro Crenshaw/LAX (K) Line to the City's new housing and employment centers and sports and entertainment venues. The Project is primarily located along Market Street, Manchester Boulevard and Prairie Avenue and includes a fully elevated guideway and supporting infrastructure, automated trains, a maintenance and storage facility, power distribution substations, and new public parking lots.",Transit,tircp award was downscaled\nsccp cycle 3 priority 05 of 14\nawarded raise 2022 (not caltrans)
7,Fresno Subdivision (Ceres To Turlock) Double Tracking,10,"Construct double tracking of 8.1 miles of existing single-track UPRR mainline rail corridor to alleviate congestion, increase capacity, improve safety, and allow for more efficient freight rail movement. A secondary benefit is the project is also necessary for the extension of Altamont Corridor Express (ACE) passenger rail services from Ceres to Turlock.",Rail (Freight),tircp 2021 application - pending award\ninfra 2022 application - pending award\ntcep cycle 3 priority 04 of 24\nmpdg los 2022 signed


In [63]:
# Delete TIRCP projects that are already in non SHOPP
# One project below in non SHOPP mentions TIRCP but that project
# was ultimately not nominated.
tircp2 = (tircp2[~tircp2["project_name"].isin(tircp_already_entered)]).reset_index(
    drop=True
)

In [64]:
len(tircp2)

91

In [65]:
# atp_shopp_drmt[atp_shopp_drmt["previous_caltrans_nominations"].str.contains(("tircp"))].drop(columns = "notes")

#### Test with Project Names

In [66]:
# tircp2["project_test"] = tircp2["project_name"]

In [67]:
# atp_shopp_drmt["project_test"] = atp_shopp_drmt["project_name"]

In [69]:
# tircp2 = simplify_project_names(tircp2, "project_test")

In [70]:
# atp_shopp_drmt = simplify_project_names(atp_shopp_drmt, "project_test")

In [71]:
# atp_shopp_drmt[["project_name","project_test"]].sort_values(by = ["project_name"])

In [72]:
# Merge on project names
# test3 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", on=["project_test"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [73]:
# test3._merge.value_counts()

In [74]:
# atp_shopp_drmt[["project_test"]].sort_values("project_test")

In [75]:
# tircp2[["project_test"]].sort_values("project_test")

#### Test 1 with Fuzzy Matching on Project Names
* Unsuccessful with project names. Only 2 matches and after looking at descriptions/districts manually, they aren't similar enough.

In [76]:
# Replace all rows in agency column with a min ratio with  "string_to_match value"
def replace_matches_in_column(df, column, new_col_name, string_to_match, min_ratio):
    # Get a list of unique strings
    strings = df[column].unique()

    # Get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(
        string_to_match, strings, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio
    )

    # Only get matches with a  min ratio
    close_matches = [matches[0] for matches in matches if matches[1] > min_ratio]

    # Get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches
    df.loc[rows_with_matches, new_col_name] = string_to_match

In [77]:
# Create a list of all the TIRCP names
# tircp_projects = tircp2.project_name.unique().tolist()

In [78]:
# atp_shopp_projects = atp_shopp_drmt.project_name.unique().tolist()

In [79]:
# for i in atp_shopp_projects:
#    replace_matches_in_column(
#        tircp2, "project_name", "project_name_fuzzy_match", i, 80
#    )

In [80]:
# tircp2[["project_description", "project_description","project_description_fuzzy_match"]]

In [81]:
# Merge on project names
# test4 = pd.merge(atp_shopp_drmt, tircp2, how= "outer", left_on=["project_test"], right_on = ["project_name_fuzzy_match"], indicator = True, suffixes = ["_shopp", "_tircp"] )

In [82]:
# test4._merge.value_counts()

In [83]:
# test4.loc[test4._merge == "both"][["district_shopp","district_tircp","project_name_tircp","project_name_shopp", "project_description_shopp", "project_description_tircp"]]

### Concat & Clean
* After only finding 2 projects that are already entered in, concat TIRCP information. 
    * Already filtered out those 2 projects in TIRCP.
    * Also filtered out any projects where TIRCP amount equals Total Project Cost, since presumbly they don't need any more $.
* Concat with original ATP SHOPP.

In [84]:
atp_drop = ["unnamed:_0", "update_complete",  'merge',
 'detailed_project_title',]

In [85]:
atp_shopp = atp_shopp.drop(columns=atp_drop)

In [86]:
concat1 = pd.concat([atp_shopp, tircp2])

In [87]:
# Fill NA based on dtypes
concat1 = concat1.fillna(concat1.dtypes.replace({"float64": 0.0, "object": "None"}))

In [88]:
# concat1.info(verbose=True)

In [89]:
# concat1.tail(1)

#### Add Detailed District Name

In [90]:
concat1['detailed_project_title'] =  ('District '+ concat1.district.astype('str') + '-' + concat1.project_name)

#### Clean Up
* Best to look at full district & county names
* Replace county "None" with "full county name" values. 

In [91]:
smartsheet = concat1.copy()

In [92]:
smartsheet = _utils.clean_up_columns(smartsheet)

In [93]:
smartsheet['County'] = np.where(smartsheet['County'] == "None", smartsheet['Full County Name'], smartsheet['County'])

In [94]:
smartsheet.sample()

Unnamed: 0,Rail Project Id,10 Year Plan,Ct Project Id,Ea,Ppno,Project Name,District,County,Route,Project Description,Current Phase,Con Existing Source S Of Funds,Con Anticipated Source Of Funds,Target Opening Year,Beg Pm,End Pm,Primary Mode,Previous Caltrans Nominations,Urban Rural,Notes,Lead Agency,Pid Approval Date M010,Target Pa Ed M200,Rtl Date M460,Con Start Date M500,Funding Need Phase S,"Pa Ed Cost $1,000","Ps E Cost $1,000","Row Cost $1,000","Con Support Cost $1,000","Non Infrastructure Plan Cost $1,000","Total Unfunded Need $1,000",Previous Funding Request Phase,Last Scored,Csis Alignment,Csis Total Score Out Of 45,Mode Shift Csis Score,Mode Shift Csis Comment,Vmt Csis Score,Vmt Csis Comment,Public Engagement Csis Score,Public Engagement Csis Comment,Dac Local Community Needs Csis Score,Dac Local Community Needs Csis Comment,Safety Csis Score,Safety Csis Comment,Zev Csis Score,Zev Csis Comment,Climate Resiliency Csis Score,Climate Resiliency Csis Comment,Natural Resources And Ecosystems Csis Score,Natural Resources And Ecosystems Csis Comment,Infill Development And Land Use Csis Score,Infill Development And Land Use Csis Comment,Benefits To Dac And Advancing Equity Atp Score,Community Need Atp Score,Safety Atp Score,Public Participation Atp Score,Community Feedback Atp Score,Continued Engagement Atp Score,Context Sensitive And Innovation Atp Score,Transformative Atp Score,Atp Total Score Out Of 100,Atp Alignment,Access Alignment,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,Previous Funding Request,Purpose Need,Parcel Counts,"Total Project Cost $1,000","Con Capital Cost $1,000",Hq Priority,District Priority,Potential Funding Program S,Located In Dac,Shs Capacity Increase Detail,Secondary Mode S,Full County Name,Abbrev,District Full Name,Ppno1,"Total Project Cost $1,000 1","Pa Ed Cost $1,000 1","Ps E Cost $1,000 1","Non Infrastructure Plan Cost $1,000 1",Detailed Project Title
24,,0.0,120000130,0K660,3204,Lucerne Complete Streets,1,LAK,20,"Develop complete streets improvements to connect Lucerne's downtown to their waterfront. Including: sidewalk, crosswalks, buffered bike lanes, flashing beacons bulbouts, two pedestrian bridges.",PID,,,,16.74,18.02,Complete Streets,FY 21-22 Non-SHOPP PID Nomination (Carryover),Rural,PID Completed,Caltrans,datetime64[ns],,datetime64[ns],datetime64[ns],0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,Not Well-Aligned,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Not Well-Aligned,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,31268.0,0.0,0.0,0.0,Atp\nActive Transportation Infrastructure Investment Program\nPid Non-Shopp\nRaise\nRural,,,Bike/Pedestrian\nBridge,Lake,LAK,01 - Eureka,,0.0,0.0,0.0,0.0,District 1-Lucerne Complete Streets


In [95]:
smartsheet.to_excel(f"{_utils.GCS_FILE_PATH}cleaned_data_atp_tircp.xlsx")

#### FOR TABLEAU  - Add Fake Values

In [96]:
# https://stackoverflow.com/questions/64093880/how-to-create-random-floats-and-add-them-as-a-dataframe-column
np.random.seed(365)

In [97]:
fake_columns = [
    "increase_peak_person_throughput",
    "reduction_in_peak_period_delay",
    "reduction_in_fatal_and_injury_crashes",
    "reduction_in_injury_rates",
    "increase_access_to_jobs",
    "increase_access_jobs_to_DAC",
    "commercial_dev_developed",
    "tons_of_goods_impacted",
    "improve_air_quality",
    "impact_natural_resources",
    "support_of_transportation",
]

In [98]:
# Add fake metric columns: I want a random value between 0 to 50.
for i in fake_columns:
    concat1[i] = np.round(np.random.uniform(0.0, 50.0, size=(len(concat1), 1)), 2)

In [99]:
# list(concat1.columns)

#### Create fake benefit score and ranks.

In [100]:
# Create the fake benefit score based off of Virginia DOT.
concat1["fake_benefit_score"] = (
    (concat1[fake_columns].sum(axis=1))
    / (concat1["total_unfunded_need__$1,000"])
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .sum()
) * 1000000

In [101]:
# Create a fake statewide project rank
concat1["statewide_rank"] = concat1["fake_benefit_score"].rank(ascending=False)

In [102]:
# Create fake project rank by district
concat1["district_rank"] = concat1.groupby("district")["fake_benefit_score"].rank(
    method="dense", ascending=False
)

In [103]:
# Assign percentile of project among all projects in the state.
concat1 = _utils.project_size_rating(
    concat1, "statewide_rank", "fake_benefit_score_statewide_percentile"
)

In [104]:
# Make sure this makes sense
# concat1[['district_rank','statewide_rank','fake_benefit_score','fake_benefit_score_statewide_percentile']].sort_values('fake_benefit_score', ascending= False)

In [105]:
# D4 only projects
len(concat1.loc[concat1["district"] == 4])

120

In [106]:
# list(concat1.columns)

#### Add median across districts
Suggestion from Nick to add medians across the district. 
* Project cost
* unfunded needs
* benefit score

In [107]:
# Grab medians
summary_district_state = (
    concat1.groupby(["district"])
    .agg(
        {
            "total_unfunded_need__$1,000": "median",
            "fake_benefit_score": "median",
            "total_project_cost__$1,000": "median",
            "csis_total_score__out_of_45": "median",
            "atp_total_score__out_of_100": "median",
        }
    )
    .reset_index()
)

In [108]:
# Add suffixes
summary_district_state = summary_district_state.add_suffix("_district_median_")

In [109]:
concat1 = pd.merge(
    concat1,
    summary_district_state,
    left_on="district",
    right_on="district_district_median_",
    how="left",
)

In [110]:
concat1 = _utils.clean_up_columns(concat1)

In [111]:
concat1.to_excel(f"{_utils.GCS_FILE_PATH}tableau.xlsx")