# Testing the scripts for Consolidated App

In [None]:
import os
import re as re
from collections import Counter
from itertools import chain, combinations

import geopandas as gpd
import numpy as np
import pandas as pd
import shared_utils
from calitp import *
from shared_utils import utils
from siuba import *

GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/consolidated_applications/"
Caltrans_shape = "https://gis.data.ca.gov/datasets/0144574f750f4ccc88749004aca6eb0c_0.geojson?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D"

from calitp.storage import get_fs

fs = get_fs()

In [None]:
# For NB only
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format
pd.options.display.max_rows = 400
import data_cleaning
import utilities

In [None]:
original_df = data_cleaning.load_con_app()

In [None]:
cleaned_df = data_cleaning.initial_cleaning(original_df)

In [None]:
cleaned_df.head(2)

In [None]:
gdf_test = gdf_conapp(cleaned_df)

In [None]:
type(gdf_test)

## Testing imported script

In [None]:
df1, df2, df3, df4 = data_cleaning.conapp_complete_report()

In [None]:
df1.shape

In [None]:
df3

In [None]:
df1.head(2)

In [None]:
df2.head(2)

In [None]:
df3.head(2)

In [None]:
FILE_NAME = "Con_App_Cleaned.xlsx"

# Read the workbook once and pull all three sheets out of that single read,
# rather than fetching and parsing the same GCS file three separate times.
# With a list for sheet_name, read_excel returns a dict keyed by sheet name.
saved_sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=[
        "cleaned_unpivoted_data",
        "pivoted_data",
        "combos_of_funding_programs",
    ],
)

df5 = saved_sheets["cleaned_unpivoted_data"]
df6 = saved_sheets["pivoted_data"]
df7 = saved_sheets["combos_of_funding_programs"]

In [None]:
df5.head(2)

In [None]:
df6.head(2)

In [None]:
df7.head(2)

In [None]:
assert set(df1.columns) == set(df5.columns)
assert df1.shape == df5.shape

In [None]:
df1.columns

In [None]:
cols = ['total_expenses', '_5311_funds',
       '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds', 'federal_total',
       'other_fed_funds_total', 'lctop__state__funds',
       'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'state_total', 'local_total', 'total_state_federal_local_funding', 'total_state_fed_only']

In [None]:
for i in [df1, df5]:
    for j in cols:
        # print("\n" + i)
        print(i[j].sum())

In [None]:
assert set(df2.columns) == set(df6.columns)
assert df2.shape == df6.shape

In [None]:
for i in [df2, df6]:
    for j in ['total_state_federal_local_funding', 'total_expenses', 'funding_received']:
        # print("\n" + i)
        print(i[j].sum())

In [None]:
assert set(df3.columns) == set(df7.columns)
assert df3.shape == df7.shape

In [None]:
for i in [df3, df7]:
    print(i['count_of_funding_programs_applied'].sum())

### More checks against the original dfs

In [None]:
assert set(df9.columns) == set(cleaned_unpivoted_og.columns)
assert df9.shape == cleaned_unpivoted_og.shape

In [None]:
assert set(df10.columns) == set(melted_df_og.columns)
assert df10.shape == melted_df_og.shape

In [None]:
assert set(df11.columns) == set(grouped_og.columns)
assert df11.shape == grouped_og.shape

## Load in original sheets

In [None]:
FILE_NAME = "Con_App_Cleaned.xlsx"

# Read the workbook a single time; passing a list of sheet names makes
# read_excel return a dict of DataFrames keyed by sheet name, which avoids
# fetching the same GCS file once per sheet.
og_sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=[
        "pivoted_data",
        "cleaned_unpivoted_data",
        "combos_of_funding_programs",
    ],
)
melted_df_og = og_sheets["pivoted_data"]
cleaned_unpivoted_og = og_sheets["cleaned_unpivoted_data"]
grouped_og = og_sheets["combos_of_funding_programs"]

# Previously exported geodataframe, kept for comparison.
gdf_og = gpd.read_parquet(f"{GCS_FILE_PATH}con_app_gdf.parquet")

## Functions & Lists

In [None]:
#Function for fully funded or not
def funding_vs_expenses(df):
    """Label one project row by comparing its total funding to its expenses.

    Args:
        df: A single row (anything indexable by column name) holding
            "total_state_federal_local_funding" and "total_expenses".

    Returns:
        "Fully funded" when the two amounts are equal, "Funding exceeds
        total expenses" when funding is larger, otherwise "Not fully funded".
    """
    funding = df["total_state_federal_local_funding"]
    expenses = df["total_expenses"]

    if funding == expenses:
        return "Fully funded"
    if funding > expenses:
        return "Funding exceeds total expenses"
    # Anything else, including values that cannot be ordered such as NaN.
    return "Not fully funded"

In [None]:
# Function to clean agency/organization names 
def organization_cleaning(df, column_wanted: str):
    """Normalize the organization names stored in ``df[column_wanted]``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned.

    Args:
        df: DataFrame containing the column to clean.
        column_wanted: Name of the string column holding organization names.

    Returns:
        ``df`` with the cleaned column.
    """
    names = df[column_wanted].str.strip()

    # Keep only the text in front of the first comma.
    names = names.str.split(",").str[0]

    # Drop every slash from the name.
    names = names.str.replace("/", "")

    # Keep only the text in front of the first opening parenthesis.
    names = names.str.split("(").str[0]

    # Slashes were already removed above, so this split leaves names unchanged.
    names = names.str.split("/").str[0]

    # Title-case, then repair the known "Trasit" misspelling.
    names = names.str.title().str.replace("Trasit", "Transit")

    # Strip again: the splits above can leave trailing whitespace behind.
    df[column_wanted] = names.str.strip()
    return df

In [None]:
# Regex alternation (one capture group) of the keywords searched for in the
# lower-cased project descriptions; the matched keyword is later translated
# into a broader category through project_dictionary.
# NOTE(review): there are no word boundaries, so a keyword also matches inside
# a longer word (e.g. "van" inside "advance") -- confirm this is acceptable.
project_keywords = "(operating|bus|construction|buses|planning|van|vessel|fare|ridership|vehicle|station|service|equipment|maintenance|surveillance|renovate|free|equip|operational)"

In [None]:
# Maps each keyword pulled out of a project description to the broader
# project category stored in "short_description".
# NOTE(review): "vehicles" is not one of the alternatives in project_keywords,
# so that key looks unreachable -- confirm before removing it.
project_dictionary =  {
        "operating": "operating assistance",
        "operational": "operating assistance",
        "free": "free fare program",
        "ridership": "ridership expansion",
        "fare": "purchasing other tech",
        "service": "service expansion",
        "buses": "purchasing vehicles",
        "bus": "purchasing vehicles",
        "van": "purchasing vehicles",
        "vessel": "purchasing vehicles",
        "vehicles": "purchasing vehicles",
        "vehicle": "purchasing vehicles",
        "planning": "transit planning",
        "station": "construction",
        "construction": "construction",
        "maintenance": "maintenance/renovation",
        "renovate": "maintenance/renovation",
        "equipment": "purchasing other tech",
        "equip": "purchasing other tech",
        "surveillance": "purchasing other tech"}

In [None]:
# Maps the integer district number found in the "district" column to the
# spelled-out label written to "full_district_name".
district_dictionary = {
        7: "District 7: Los Angeles",
        4: "District 4: Bay Area / Oakland",
        2: "District 2: Redding",
        9: "District 9: Bishop",
        10: "District 10: Stockton",
        11: "District 11: San Diego",
        3: "District 3: Marysville / Sacramento",
        12: "District 12: Orange County",
        8: "District 8: San Bernardino / Riverside",
        5: "District 5: San Luis Obispo / Santa Barbara",
        6: "District 6: Fresno / Bakersfield",
        1: "District 1: Eureka",
    }

In [None]:
        
    # Monetary columns that come straight from the raw report; the derived
    # totals created during cleaning are deliberately left out.
    # "local_total" is also absent because it is parsed separately from text.
    monetary_cols = [
    "total_expenses",
    "_5311_funds",
    "_5311_f__funds",
    "_5311_cmaq_funds",
    "_5339_funds",
    "federal_total",
    "other_fed_funds_total",
    "lctop__state__funds",
    "sb1__state_of_good_repair__state__funds",
    "transit_development_act__state__funds",
    "other_state_funds",
    "state_total"]

## Function for loading the raw data from Excel

In [None]:
def load_con_app():
    """Read the raw Consolidated Application review report from GCS.

    Every column of the workbook is kept. The frame is passed through
    ``to_snakecase`` before being returned (presumably this converts the
    column names to snake_case -- confirm against the calitp helper).

    Returns:
        The raw report as a DataFrame.
    """
    con_app_file = "Copy of Application_Review_Report_5_2_2022.xls"
    raw_report = pd.read_excel(f"{GCS_FILE_PATH}{con_app_file}")
    return to_snakecase(raw_report)

In [None]:
df1 = load_con_app()

In [None]:
df1.shape

## Function for initial cleaning 

In [None]:
def initial_cleaning(df):
    """Clean the raw Consolidated Application report for analysis.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be re-run on the same raw input.

    Steps:
      * normalize ``organization_name`` with ``organization_cleaning``
      * spell out the ``project_category`` codes
      * derive ``short_description`` from keywords in ``project_description``
      * convert ``local_total`` and every column in ``monetary_cols`` to float
      * add ``total_state_federal_local_funding``, ``total_state_fed_only``
        and ``fully_funded``
      * apply the manual district corrections, then add ``full_district_name``

    Args:
        df: Raw report as returned by ``load_con_app``.

    Returns:
        A new, cleaned DataFrame.
    """
    # Work on a copy. The steps below overwrite columns, and the text parsing
    # of "local_total" fails on a frame that an earlier call already converted
    # to floats.
    df = df.copy()

    # Clean organization names
    df = organization_cleaning(df, "organization_name")

    # PROJECT CATEGORIES
    # Spell out project categories
    df["project_category"] = df["project_category"].replace(
        {"OP": "Operating", "CA": "Capital", "PL": "Planning", "CM": "Capital Maintenance"}
    )

    # Project categories are pretty vague, but project description has 200+
    # different inputs. Search the lower-cased descriptions for the keywords
    # and store the matched keyword in the new column "short_description".
    df["project_description"] = df["project_description"].str.lower()
    df["short_description"] = df["project_description"].str.extract(
        project_keywords, expand=False
    )

    # Replace the keywords with the main categories; entries without any
    # keyword match are captured as "other category".
    df["short_description"] = (
        df["short_description"]
        .replace(project_dictionary)
        .fillna("other category")
        .str.title()
    )

    # MONETARY COLS
    # Local totals: split on ": " and keep only the last item, which is the
    # total of local funding a project has.
    df["local_total"] = df["local_total"].str.split(": ").str[-1]

    # Strip thousands separators and dollar signs, then convert to float.
    # regex=False is required: as a regular expression "$" is the
    # end-of-string anchor, so newer pandas versions would remove nothing and
    # the float conversion would fail on values such as "$1,000".
    df["local_total"] = (
        df["local_total"]
        .str.replace(",", "", regex=False)
        .str.replace("$", "", regex=False)
        .fillna(0)
        .astype("float")
    )

    # Clean all monetary columns up. Missing values become 0; values that
    # cannot be parsed as numbers are coerced to NaN.
    df[monetary_cols] = (
        df[monetary_cols]
        .fillna(value=0)
        .apply(pd.to_numeric, errors="coerce")
        .astype("float")
    )

    # Create two new cols: the total of state, local and federal funding
    # (including other federal funds), and the total of state and federal
    # funding only.
    df = df.assign(
        total_state_federal_local_funding=(
            df["state_total"]
            + df["local_total"]
            + df["federal_total"]
            + df["other_fed_funds_total"]
        ),
        total_state_fed_only=(df["state_total"] + df["federal_total"]),
    )

    # Apply function to determine if a project is fully funded or not
    df["fully_funded"] = df.apply(funding_vs_expenses, axis=1)

    # MANUAL
    # Will change each year
    # Replace Ventura County since it read in strangely, and drop any
    # trailing parenthetical from the names.
    df["organization_name"] = (
        df["organization_name"]
        .replace(
            {
                "Ventura County Transportation Commission\xa0": "Ventura County Transportation Commission"
            }
        )
        .str.replace(r"\s+\(.*$", "", regex=True)
    )

    # Replace the districts by organization names. Matching ignores case
    # because organization_cleaning title-cases the names ("City Of Banning"),
    # which an exact comparison against "City of Banning" would never match.
    district_overrides = {
        "City of Banning": 8,
        "City of Clovis": 6,
        "City of Los Angeles DOT": 7,
        "Peninsula Corridor Joint Powers Board": 4,
        "San Joaquin Regional Rail Commission": 10,
        "Western Contra Costa Transit Authority": 4,
    }
    org_key = df["organization_name"].str.casefold()
    for org_name, district in district_overrides.items():
        df.loc[org_key == org_name.casefold(), "district"] = district

    # DISTRICTS
    # Create new column with fully spelled out names. This runs after the
    # manual overrides so that the label always agrees with "district".
    df["full_district_name"] = df["district"].replace(district_dictionary)

    return df

In [None]:
df2 = initial_cleaning(df1)

### Different than original df because some cols are int64, some are int32

In [None]:
assert set(df2.columns) == set(cleaned_unpivoted_og.columns)
assert df2.shape == cleaned_unpivoted_og.shape

In [None]:
cleaned_unpivoted_og.head(3)

In [None]:
df2.head(3)

In [None]:
cleaned_unpivoted_og['district'].dtype

In [None]:
df2['district'].dtype

In [None]:
df2.equals(cleaned_unpivoted_og)

In [None]:
df2_numeric = df2[['_5311_funds',
       '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds', 'federal_total',
       'other_fed_funds_total', 'lctop__state__funds',
       'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'state_total', 'local_total', 
       'total_state_federal_local_funding', 'total_state_fed_only',
       ]].astype('int64')

In [None]:
cleaned_unpivoted_og_numeric = cleaned_unpivoted_og[['_5311_funds',
       '_5311_f__funds', '_5311_cmaq_funds', '_5339_funds', 'federal_total',
       'other_fed_funds_total', 'lctop__state__funds',
       'sb1__state_of_good_repair__state__funds',
       'transit_development_act__state__funds', 'other_state_funds',
       'state_total', 'local_total', 
       'total_state_federal_local_funding', 'total_state_fed_only',]].astype('int64')

In [None]:
df2_numeric.equals(cleaned_unpivoted_og_numeric)

## Melted DF Function

In [None]:
# Columns kept before melting: the project identifier plus every funding
# column that is unpivoted into rows.
subset_for_melted_df = [
        "project_upin",
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
        "other_state_funds",
        "other_fed_funds_total",
        "local_total",
        "federal_total",
        "state_total",
    ]

In [None]:
# Funding columns passed to melt as value_vars; each one becomes a value of
# "program_name" in the long-format frame.
list_for_value_vars = [
        "_5311_funds",
        "_5311_f__funds",
        "_5311_cmaq_funds",
        "_5339_funds",
        "lctop__state__funds",
        "sb1__state_of_good_repair__state__funds",
        "transit_development_act__state__funds",
        "other_state_funds",
        "other_fed_funds_total",
        "local_total",
        "federal_total",
        "state_total",
    ]

In [None]:
# Descriptive project columns merged back onto the melted frame by
# "project_upin".
subset_og_df = [
        "total_expenses",
        "organization_name",
        "district",
        "full_district_name",
        "year",
        "application_status",
        "project_upin",
        "project_category",
        "project_line_item__ali_",
        "project_description",
        "is_stimulus",
        "total_state_federal_local_funding",
        "fully_funded",
        "short_description",
    ]

In [None]:
def melt_df(df):
    """Reshape the cleaned data to one row per project and funding source.

    The funding columns are unpivoted so that the fund's name sits in
    "program_name" and its amount in "funding_received". The descriptive
    project columns are merged back on by "project_upin", and rows whose
    amount is not greater than 0 are dropped.

    Args:
        df: Cleaned DataFrame holding the columns listed in
            ``subset_for_melted_df`` and ``subset_og_df``.

    Returns:
        The long-format DataFrame.
    """
    # Only the project identifier and the funding columns take part in the melt.
    funds_wide = df[subset_for_melted_df]

    funds_long = funds_wide.melt(
        id_vars=["project_upin"],
        value_vars=list_for_value_vars,
        var_name="program_name",
        value_name="funding_received",
    )

    # Descriptive columns (project description, fully funded or not, etc.)
    # that should accompany every melted row.
    project_info = df[subset_og_df]

    # Left merge keeps every melted row; there are many more of them than
    # there are rows in the original frame.
    merged = funds_long.merge(project_info, on="project_upin", how="left")

    # Human-readable names for the funds.
    program_labels = {
        "_5311_funds": "5311 (Fed)",
        "lctop__state__funds": "LCTOP (State)",
        "transit_development_act__state__funds": "Transit Development Act (State)",
        "other_state_funds": "Other State Funds",
        "_5339_funds": "5339 (Fed)",
        "_5311_f__funds": "5311(f) (Fed)",
        "sb1__state_of_good_repair__state__funds": "SB1. State of Good Repair (State)",
        "other_fed_funds_total": "Other Federal Funds",
        "_5311_cmaq_funds": "5311 CMAQ (Fed)",
        "local_total": "Local Funds",
        "federal_total": "Federal Total",
        "state_total": "State Total",
    }
    merged["program_name"] = merged["program_name"].replace(program_labels)

    # Shorten the frame by keeping only rows that actually received funding.
    has_funding = merged["funding_received"] > 0
    return merged[has_funding]

In [None]:
df3 = melt_df(df2)

In [None]:
df3.shape

### Check with original
* Still differs from the original, yet the grouped DF built from this DF below matches my original work?

In [None]:
df3['total_expenses'] = df3['total_expenses'].astype('int64')

In [None]:
df3.head(3)

In [None]:
melted_df_og.head(3)

In [None]:
melted_df_og.equals(df3)

In [None]:
cols = ['project_upin','funding_received', 'total_expenses',
       'district',  'year',
     
       'total_state_federal_local_funding', 'fully_funded',
       ]

In [None]:
def pick_column_and_aggregate(df1, df2, col):
    """Print whether column ``col`` sums to the same value in both frames.

    Prints "PASS" when the two sums are equal, otherwise "<col>: FAIL".
    Nothing is returned.
    """
    totals_match = df1[col].sum() == df2[col].sum()
    print("PASS" if totals_match else f"{col}: FAIL")

In [None]:
for c in cols:
    pick_column_and_aggregate(df3, melted_df_og, c)

In [None]:
assert set(df3.columns) == set(melted_df_og.columns)
assert df3.shape == melted_df_og.shape

In [None]:
set(df3.columns).difference(set(melted_df_og.columns))

## Grouped DF 

In [None]:
def group_df(melted_df, initial_clean_df):
    """List, per project, every funding program it applied to.

    Args:
        melted_df: Long-format frame with "project_upin" and "program_name"
            columns (one row per project and funding source).
        initial_clean_df: Cleaned frame supplying "organization_name",
            "project_description" and "year" for each "project_upin".

    Returns:
        DataFrame with the columns "project_upin", "organization_name",
        "project_description", "all_programs" (comma-separated program names),
        "year" and "count_of_funding_programs_applied".
    """
    # Exclude totals: not a fund
    total_labels = ["Local Funds", "Federal Total", "State Total"]

    # Take an explicit copy so the column added below lands on an independent
    # frame instead of on a filtered slice (avoids SettingWithCopyWarning).
    grouped1 = melted_df.loc[~melted_df["program_name"].isin(total_labels)].copy()

    # Grab all the different program names by project upin and put them in a
    # new column
    grouped1["all_programs"] = grouped1.groupby("project_upin")[
        "program_name"
    ].transform(lambda x: ",".join(x))

    # Keep only cols of interest & drop duplicates
    grouped1 = grouped1[["project_upin", "all_programs"]].drop_duplicates()

    # Merge with the cleaned dataframe, because at this point only
    # project_upin and the list of funds are left
    grouped2 = pd.merge(grouped1, initial_clean_df, on="project_upin", how="left")

    # Keep only relevant cols (copied for the same reason as above)
    grouped2 = grouped2[
        ["project_upin", "organization_name", "project_description", "all_programs", "year"]
    ].copy()

    # Count the number of funds under "all_programs" to get a metric of how
    # many funds an org wants for a particular project.
    # https://stackoverflow.com/questions/51502263/pandas-dataframe-object-has-no-attribute-str
    # NOTE(review): the per-row counts are summed within each project_upin, so
    # a project_upin that appears more than once in initial_clean_df would
    # inflate the count -- confirm project_upin is unique there.
    grouped2["count_of_funding_programs_applied"] = (
        grouped2["all_programs"]
        .str.split(",+")
        .str.len()
        .groupby(grouped2.project_upin)
        .transform("sum")
    )

    return grouped2

In [None]:
df4 = group_df(df3, df2)

In [None]:
df4.head()

grouped_og.head()

In [None]:
assert set(df4.columns) == set(grouped_og.columns)
assert df4.shape == grouped_og.shape

In [None]:
df4.equals(grouped_og)

In [None]:
grouped_og.shape

## Geodataframe
* This one is not saving to the GCS.

In [None]:
def gdf_conapp(df):
    """Summarize funding per Caltrans district and attach district shapes.

    Counts projects and sums ``total_state_fed_only`` per district, adds a
    display label in millions and a percentile bucket, joins the result to
    the Caltrans district polygons, exports the joined frame to GCS as a
    geoparquet, and returns it.

    Args:
        df: Cleaned DataFrame with "district", "project_upin" and
            "total_state_fed_only" columns.

    Returns:
        The GeoDataFrame that was exported: district geometry plus the
        per-district summary columns.
    """
    # Load geojson with the shapes of the Caltrans districts
    geojson = gpd.read_file(Caltrans_shape).to_crs(epsg=4326)

    # Keep only the columns we want
    geojson = geojson[["DISTRICT", "Shape_Length", "Shape_Area", "geometry"]]

    # Take the cleaned, unaggregated dataframe and get summary statistics
    summarized = (
        df.groupby("district")
        .agg({"project_upin": "count", "total_state_fed_only": "sum"})
        .reset_index()
    )

    # New column that rounds total_state_fed_only to millions
    summarized["funding_millions"] = (
        "$"
        + (summarized["total_state_fed_only"].astype(float) / 1000000)
        .round()
        .astype(str)
        + "M"
    )

    # For the map, it looks nicer when the legend is pinned to percentiles
    # instead of actual dollar amounts.
    p25 = float(summarized["total_state_fed_only"].quantile(0.25))
    p75 = float(summarized["total_state_fed_only"].quantile(0.75))

    # Function for mapping percentiles. The upper bounds are inclusive so a
    # district sitting exactly on p25 or p75 still gets a bucket instead of
    # falling through to "No Info".
    def funding_range(row):
        amount = row.total_state_fed_only
        if amount > 0 and amount <= p25:
            return "25"
        elif p25 < amount <= p75:
            return "50"
        elif amount > p75:
            return "75"
        else:
            return "No Info"

    # Apply the aforementioned function into a new column
    summarized["funding_percentile"] = summarized.apply(funding_range, axis=1)

    # Merge geojson with the summarized df
    # NOTE(review): this is an inner join, so districts are dropped silently
    # if "DISTRICT" and "district" differ in dtype or values -- confirm they
    # line up.
    gdf = geojson.merge(
        summarized, how="inner", left_on="DISTRICT", right_on="district"
    )

    # Export
    shared_utils.utils.geoparquet_gcs_export(gdf, GCS_FILE_PATH, "script_con_app_gdf")

    # Return the merged GeoDataFrame (the object that was exported) rather
    # than the geometry-less summary table.
    return gdf

In [None]:
gdf1 = gdf_conapp(df2)

In [None]:
type(gdf1)

In [None]:
gdf1

## One function to capture them all 

In [None]:
def con_app_complete_clean():
    """Run the whole Consolidated Application pipeline from load to outputs.

    Returns:
        A 4-tuple, in this order: the cleaned frame from
        ``initial_cleaning``, the melted frame from ``melt_df``, the grouped
        frame from ``group_df``, and the result of ``gdf_conapp``.
    """
    # Load the original sheet and do the initial cleaning in one go.
    cleaned_con_app = initial_cleaning(load_con_app())

    # First aggregation: one row per project and funding source.
    melted_df = melt_df(cleaned_con_app)

    # Second aggregation: all funding programs of a project on a single line.
    grouped_df = group_df(melted_df, cleaned_con_app)

    # Third aggregation: per-district summary; gdf_conapp also saves a
    # geoparquet to GCS.
    gdf = gdf_conapp(cleaned_con_app)

    # The string below is a disabled Excel export kept for reference; it is
    # never executed.
    """
    Write the first 3 dfs into an Excel workbook in case and save to GCS
    with pd.ExcelWriter(f"{GCS_FILE_PATH}Script_Testing.xlsx") as writer:
        melted_df.to_excel(writer, sheet_name="pivoted_data", index=False)
        cleaned_con_app.to_excel(writer, sheet_name="cleaned_unpivoted_data", index=False)
        grouped_df.to_excel(writer, sheet_name="combos_of_funding_programs", index=False)
    """
    return cleaned_con_app, melted_df, grouped_df, gdf


In [None]:
df5, df6, df7, df8 = con_app_complete_clean()

In [None]:
df5.head(2)

In [None]:
df6.head(2)

In [None]:
df7.head(2)

In [None]:
df8.head(2)