## General function to clean up data from various grants

In [1]:
import _utils
import _state_rail_plan_utils as srp_utils
import pandas as pd
from calitp_data_analysis.sql import to_snakecase

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
def load_state_rail_plan():
    df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)
    return df

In [4]:
def load_lost():
    df = to_snakecase(pd.read_excel(f"{srp_utils.GCS_FILE_PATH}LOST/LOST_all_projects.xlsx", sheet_name = "Main"))
    return df

In [5]:
srp = load_state_rail_plan()

In [6]:
lost = load_lost()

In [7]:
def funding_vs_expenses(df):
    """
    Determine if a project is fully funded or not
    """
    if df["total_project_cost"] == 0:
        return "No project cost info"
    elif df["total_available_funds"] == 0:
        return "No available funding info"
    elif df["total_available_funds"] == df["total_project_cost"]:
        return "Fully funded"
    else:
        return "Not fully funded"


In [20]:
def organization_cleaning(df, agency_col: str):
    df[agency_col] = (
        df[agency_col]
        .str.strip()
        .str.split(",")
        .str[0]
        .str.replace("/", "")
        .str.split("(")
        .str[0]
        .str.split("/")
        .str[0]
        .str.split(";")
        .str[0]
        .str.title()
        .str.replace("Trasit", "Transit")
        .str.replace("*","")
        .str.strip() #strip whitespaces again after getting rid of certain things
    )
    return df

In [21]:
def harmonizing(df, 
                agency_name: str,
                project_name:str,
                project_description:str,
                project_cost:str,
                location:str,
                county:str,
                city:str,
                program:str,
                fund_cols:list,
                cost_in_millions:bool = True):
    
    # Rename columns
    rename_columns = {agency_name: 'lead_agency',
                      project_name: 'project_title',
                      project_description: 'project_description',
                      project_cost: 'total_project_cost',
                      location: 'project_location',
                      county: 'project_county',
                      city: 'project_city'}
    
    df = df.rename(columns = rename_columns)
    
    # Coerce cost/fund columns to right type
    cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
    for i in cost_columns:
        df[i]= df[i].apply(pd.to_numeric, errors = 'coerce').fillna(0)
    
    # Clean up string columns
    string_cols = df.select_dtypes(include=['object']).columns.to_list()
    for i in string_cols:
        df[i] = df[i].str.strip().str.title()
        
    # Clean agency names
    df = organization_cleaning(df, 'lead_agency')
    
    # Add data source
    df['data_source'] = program 
    
    # Divide cost columns by millions
    # If bool is set to True
    if cost_in_millions:
        for i in cost_columns:
            df[i] = df[i].divide(1000000)
    else:
        df
    
   # Create columns even if they don't exist, just to harmonize 
   # before concatting.
    if 'project_county' not in df:
        df['project_county'] = "None"
    if 'project_city' not in df:
        df['project_city'] = "None"
    if 'notes' not in df:
        df['notes'] = "None" 
    
    # Determine the project completely funded or not?
    df['total_available_funds'] = df[fund_cols].sum(axis=1)
    df['fully_funded'] = df.apply(funding_vs_expenses, axis=1)
    
    # Only keep certain columns
    columns_to_keep = ['project_title','lead_agency','project_description',
                       'total_project_cost','fully_funded','total_available_funds',
                       'project_location','project_county','project_city','notes','data_source']
    df = df[columns_to_keep]
    
    # Fill in any nulls
    df = df.fillna(df.dtypes.replace({'float64': 0.0, 'object': 'None'}))

    return df

In [22]:
state_rail_plan_test = harmonizing(srp, 'lead_agency', 'project_name','project_description','total_project_cost', 'corridor', '', '', 'State Rail Plan', []) 

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  df[agency_col]


In [23]:
state_rail_plan_test.sample()

Unnamed: 0,project_title,lead_agency,project_description,total_project_cost,fully_funded,total_available_funds,project_location,project_county,project_city,notes,data_source
86,El Monte Station Ped Improvements And Siding Extension,Metrolink,"Eliminates Hold Out Rule And Adds Pedestrian Safety Treatments At Ends Of Station To Allow Meets At Mid-Platform. Extends Platform Westward, Extends Siding Track West Toward Flyover. Upgrade To Higher Speed Switch And Add New Signaling.",22.16,No available funding info,0.0,Ca High Speed Rail Phase 1 Corridor,,,,State Rail Plan


In [24]:
lost_test = harmonizing(lost, 'agency', 'project_title','project_description','cost__in_millions_', 'location', 'county','city', 'LOST', ['estimated_federal_funds', 'estimated_state_funds',
     'estimated_local_funds', 'estimated_other_funds']) 

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  df[agency_col]


In [25]:
lost_test.sample()

Unnamed: 0,project_title,lead_agency,project_description,total_project_cost,fully_funded,total_available_funds,project_location,project_county,project_city,notes,data_source
1210,Calistoga Drive From Vallejo Street To Union Road,,,0.0,No project cost info,0.0,,San Benito,,,LOST


In [26]:
def add_all_projects():
    
    # Load original dataframes
    state_rail_plan = load_state_rail_plan()
    lost = load_lost()
    
    # Clean dataframes
    state_rail_plan = harmonizing(state_rail_plan, 'lead_agency', 'project_name','project_description','total_project_cost', 'corridor', '', '', 'State Rail Plan', []) 
    lost = harmonizing(lost, 'agency', 'project_title','project_description','cost__in_millions_', 'location', 'county','city', 'LOST', ['estimated_federal_funds', 'estimated_state_funds',
       'estimated_local_funds', 'estimated_other_funds']) 
    
    # Concat
    all_projects = pd.concat([lost, state_rail_plan])
    
    return all_projects

In [27]:
all_projects_df = add_all_projects()

  cost_columns = df.columns[df.columns.str.contains("(cost|funds)")].tolist()
  df[agency_col]


In [28]:
all_projects_df.shape

(2125, 11)

In [29]:
all_projects_df.fully_funded.value_counts()

No project cost info         1089
No available funding info     839
Not fully funded              169
Fully funded                   28
Name: fully_funded, dtype: int64

In [30]:
all_projects_df.project_county.value_counts()

Imperial          726
None              276
San Mateo         169
Los Angeles       126
Alameda           102
Santa Clara       100
San Benito         85
San Diego          59
San Joaquin        56
San Bernardino     52
Sacramento         51
Tulare             49
Sonoma             49
Santa Barbara      37
Madera             36
San Francisco      28
Riverside          27
Stanislaus         20
Contra             19
Orange             19
Monterey           17
Santa Cruz          9
Marin               7
Fresno              5
Los Angelest        1
Name: project_county, dtype: int64

In [31]:
all_projects_df.lead_agency.value_counts()

None                                                                                                                              1756
Multiple                                                                                                                            46
Metrolink                                                                                                                           42
Caltrans                                                                                                                            34
Sjrrc                                                                                                                               25
Sandag                                                                                                                              22
Lossan                                                                                                                              18
Sjjpa                                                  