## General function to clean up data from various grants

In [1]:
import _utils
import _state_rail_plan_utils as srp_utils
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
state_rail_plan = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)

In [4]:
state_rail_plan.shape

(276, 11)

In [5]:
def harmonizing(df, 
                agency_name: str,
                project_name:str,
                project_description:str,
                project_cost:str,
                location:str,
                program:str,
                cost_in_millions:bool = True):
    
    # Rename columns
    rename_columns = {agency_name: 'lead_agency',
                      project_name: 'project_title',
                      project_description: 'project_description',
                      project_cost: 'total_project_cost',
                      location: 'project_location'}
    
    df = df.rename(columns = rename_columns)
    
    # Coerce cost/fund columns to right type
    # Find any columns that contains cost or fund in it.
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
    
    for i in cost_columns:
        df[i]= df[i].apply(pd.to_numeric, errors = 'coerce').fillna(0)
    
    # Clean up string columns
    string_cols = df.select_dtypes(include=['object']).columns.to_list()
    for i in string_cols:
        df[i] = df[i].str.strip().str.title()
        
    # Fill in any nulls
    df = df.fillna(df.dtypes.replace({'float64': 0.0, 'object': 'None'}))

    # Add data source
    df['data_source'] = program 
    
    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1000000)
    else:
        df
        
    return df

In [6]:
state_rail_plan_test = harmonizing(state_rail_plan, 'lead_agency', 'project_name','project_description','total_project_cost', 'corridor', 'State Rail Plan') 

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [7]:
state_rail_plan.sample()

Unnamed: 0,project_time_horizon,project_name,project_description,lead_agency,total_project_cost,srp_region,corridor,sub_corridor_node_1,sub_corridor_node_2,itsp_corridor,project_category
10,Mid Term,Roadway Grade Separations: Sankey Road/UPRR Sacramento Sub,Roadway grade separation at Sankey Road. This project supports a future extension of Valley Rail Service north of Sacramento towards Chico.,**Multiple,35000000,Northern California Megaregion,Sacramento Valley Corridor,Chico,Sacramento,Sacramento Valley - Oregon Border,Grade Separation


In [8]:
state_rail_plan_test.sample()

Unnamed: 0,project_time_horizon,project_title,project_description,lead_agency,total_project_cost,srp_region,project_location,sub_corridor_node_1,sub_corridor_node_2,itsp_corridor,project_category,data_source
108,Near Term,San Diego Convention Center Station,New Station At San Diego Convention Center. Tbd Regular Revenue Service Or Special Event Only.,Nctd,52.92,South Coast,Los Angeles To San Diego Corridor,Oceanside,San Diego,South Coast - Central Coast,Capital,State Rail Plan


In [9]:
lost = to_snakecase(pd.read_excel(f"{srp_utils.GCS_FILE_PATH}LOST/LOST_all_projects.xlsx", sheet_name = "Main"))

In [11]:
lost_test = harmonizing(lost, 'agency', 'project_title','project_description','cost__in_millions_', 'location', 'LOST') 

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()


In [15]:
lost.sample()

Unnamed: 0,measure,local_returns,project_category,location,project_title,cost__in_millions_,estimated_lost_funds,estimated_federal_funds,estimated_state_funds,estimated_local_funds,estimated_other_funds,project_description,agency,county,city,notes
1816,tulare_r_2006,No,,,"Visalia Signals (7) various locations PS&E,ROW,CON",1.4,0,0.0,0.0,0,0.0,"ps&e, row, con",,tulare,,


In [14]:
lost_test.sample()

Unnamed: 0,measure,local_returns,project_category,project_location,project_title,total_project_cost,estimated_lost_funds,estimated_federal_funds,estimated_state_funds,estimated_local_funds,estimated_other_funds,project_description,lead_agency,county,city,notes,data_source
1104,Sacramento_A2_2004,No,"Local Road Maintenance, Safety And Congestion Relief\nProgram",,Installing Fog/Rain/Night Roadway Reflectors,0.0,0.0,0.0,0.0,0.0,0.0,,,Sacramento,,No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.,LOST
