# Cleaning & improving old work
* Recieved a new workbook July 8, 2022.
* Create separate functions to import the actual sheets

In [1]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *

#Formatting the nb 
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

#Import script
import 1_data_prep
import crosswalks



In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
FILE_NAME = "TIRCP_July_8_2022.xlsx"

In [3]:
#Open up the 3 sheets 
project = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="Project Tracking"))
allocation =  to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="Agreement Allocations"))
invoice = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}{FILE_NAME}", sheet_name="Invoice Tracking Sheet"))



## Functions

In [4]:
#Some PPNO numbers are 5+. Slice them down to <= 5.
def ppno_slice(df):
    df = df.assign(ppno = df['ppno'].str.slice(start=0, stop=5))
    return df 

In [5]:
allocation = ppno_slice(allocation)
project = ppno_slice(project)

## Check PPNO 

In [6]:
PPNO_project = set(project.ppno.unique().tolist())
PPNO_allocation = set(allocation.ppno.unique().tolist())

In [7]:
#Only 3 differing PPNO numbers
PPNO_project - PPNO_allocation 

{'1155A', 'CP000', 'CP052'}

In [8]:
differences = list(PPNO_allocation - PPNO_project)
f'{len(differences)} different PPNOS.'

'13 different PPNOS.'

In [9]:
#Create a subset of allocation df with only the PPNOS that differ between the Allocation and Projects sheet
different_dfs = allocation[allocation.ppno.isin(differences)]

In [10]:
#Subset df for only rows of interest to find out which grant recipients & award years have different PPNOS
allocation_sub =different_dfs[['award_year', 'ppno','grant_recipient']].drop_duplicates()

In [11]:
allocation_sub

Unnamed: 0,award_year,ppno,grant_recipient
15,2015.0,CP002,Southern California Regional Rail Authority (Metrolink)
17,2016.0,CP018,Antelope Valley Transit Authority
33,2016.0,CP024,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
35,2016.0,CP021,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
53,2016.0,CPO02,San Joaquin Regional Rail Commission
94,2018.0,CP301,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
110,2018.0,CP042,Los Angeles-San Diego-San Luis Obispo Rail Corridor Agency (LOSSAN)
146,2018.0,CP053,Sacramento Regional Transit District
158,2018.0,CP032,San Diego Metropolitan Transit System (MTS)
299,2018.0,1155N,Transportation Agency for Monterey County


In [12]:
#get a subset of project_sub 
#project[['award_year','ppno','grant_recipient']].drop_duplicates()

### Use projects dataframe's PPNO as a source of truth
* Need to change Antelope Valley Transit Authority (AVTA) 2020's PPNO.  

In [13]:
#Crosswalk for allocation df 
ppno_crosswalk_allocation = {'CP018': 'CP019','CP024':'CP043',
                             'CP021':'CP043', 'CPO02':'CP025',
                             'CP301': 'CP031', 'CP042':'CP031',
                             'CP053': 'CP052','CP032': 'CP034',
                             '1155N':'1155A',}
allocation['ppno'].replace(ppno_crosswalk_allocation, inplace= True)                           

In [14]:
#No need for a crosswalk for projects dataframe. For some reason some read in as NaN. 
project.loc[(project["grant_recipient"] == "San Bernardino County Transportation Authority (SBCTA)"), "ppno"] = '1230'
project.loc[(project["grant_recipient"] == "Bay Area Rapid Transit District (BART)"), "ppno"] = 'CP060'
project.loc[(project["grant_recipient"] == "Santa Monica Big Blue Bus"), "ppno"] = 'CP071'

### Double check the sets

In [15]:
PPNO_project = set(project.ppno.unique().tolist())
PPNO_allocation = set(allocation.ppno.unique().tolist())

In [16]:
PPNO_allocation - PPNO_project

{'CP002', 'CP059'}

## Project

In [17]:
project.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 41 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   award_year                                                 73 non-null     int64  
 1   project_#                                                  73 non-null     int64  
 2   grant_recipient                                            73 non-null     object 
 3   project_title                                              73 non-null     object 
 4   ppno                                                       72 non-null     object 
 5   district                                                   70 non-null     object 
 6   county                                                     72 non-null     object 
 7   project_description                                        73 non-null     object 
 8   master_agree

In [18]:
project[['award_year','ppno','grant_recipient','district', 'county',
         'tircp_award_amount__$_', 'allocated_amount', 'unallocated_amount',
       'percentage_allocated', 'expended_amount', 'other_funds_involved',
       'award_cycle', 'estimated_tircp_ghg_reductions',
       'cost_per_ghg_ton_reduced', 'increased_ridership',
       'service_integration', 'improve_safety', 'project_readiness',
       'funding_leverage', 'multi_agency_coordination_integration',
       'priority_population_benefits___ab_1550_community_benefits',
       'housing_co_benefits']].head(2)


Unnamed: 0,award_year,ppno,grant_recipient,district,county,tircp_award_amount__$_,allocated_amount,unallocated_amount,percentage_allocated,expended_amount,other_funds_involved,award_cycle,estimated_tircp_ghg_reductions,cost_per_ghg_ton_reduced,increased_ridership,service_integration,improve_safety,project_readiness,funding_leverage,multi_agency_coordination_integration,priority_population_benefits___ab_1550_community_benefits,housing_co_benefits
0,2015,CP005,Antelope Valley Transit Authority (AVTA),7,LA,24403000,24403000,0,1.0,21714177.53,,1,"195,380 tons",,,,,,,,,
1,2015,CP012,Capitol Corridor Joint Powers Authority,4,VAR,4620000,4620000,0,1.0,4619999.9,,1,"356,667 tons",,,,,,,,,


In [19]:
#Fill in nulls based on data type
project.fillna(project.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64':0}), inplace=True)

#Fill in FY 
project["award_cycle"].replace({'FY 21/22': 4}, inplace=True)

In [20]:
project['award_year'].value_counts()

2018    28
2020    17
2015    14
2016    14
Name: award_year, dtype: int64

In [21]:
project['award_cycle'].value_counts()

3    28
4    17
1    14
2    14
Name: award_cycle, dtype: int64

In [22]:
#project['percentage_allocated'].value_counts()

In [23]:
project = project.add_prefix("project_")

In [24]:
test_projects = data_prep.clean_project()



In [25]:
test_projects.head(1)

Unnamed: 0,project_award_year,project_project_#,project_grant_recipient,project_project_title,project_ppno,project_district,project_county,project_project_description,project_master_agreement_number,project_master_agreement_expiration_date,project_project_manager,project_regional_coordinator,project_technical_assistance_calitp__y_n_,project_technical_assistance_fleet__y_n_,project_technical_assistance_network_integration__y_n_,project_technical_assistance_priority_population__y_n_,project_total_project_cost,project_tircp_award_amount__$_,project_allocated_amount,project_unallocated_amount,project_percentage_allocated,project_expended_amount,project_other_funds_involved,project_award_cycle,project_estimated_tircp_ghg_reductions,project_cost_per_ghg_ton_reduced,project_increased_ridership,project_service_integration,project_improve_safety,project_project_readiness,project_funding_leverage,project_multi_agency_coordination_integration,project_priority_population_benefits___ab_1550_community_benefits,project_housing_co_benefits,project_local_agency_address,project_local_agency_city,project_local_agency_zip,project_local_agency_contact,project_local_agency_email,project_local_agency_phone_number,project_comments_additional_contacts
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environmental Sustability,CP005,7,LA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,64AVTA2015MA,2024-04-01 00:00:00,Yesenia Ochoa,Ryan Greenway,,,,,39478000,24403000,24403000,0,1.0,21714177.53,,1,"195,380 tons",,,,,,,,,,"42210 6th Street West\nLancaster, CA 93534",Lancaster,93534,Judy Fry,Jfry@avta.com,(611) 729-2234,


In [26]:
assert test_projects.shape == project.shape

In [27]:
assert set(test_projects.columns) == set(project.columns)

## Allocation Agreement 
* Function within script isn't working

In [28]:
#Some rows are not completed: drop them
allocation1 = allocation.dropna(subset=['award_year', 'grant_recipient', 'ppno'])
len(allocation1)

338

In [29]:
#Replacing values  
allocation_3rd_party_date = {'07/29/2020': '2020-07-29 00:00:00'}
allocation_led = {'2/1/2021\n\n10/31/2022': '2021-02-01 00:00:00',
                 'June 30, 2019\nSeptember 30, 2019': '2019 06-30 00:00:00',
                 'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00'}
allocation_completion_date = {'6/30/2021\n12/31/2021\n10/20/2022':'2021-06-30 00:00:00',
                              'Complete\n8/30/2020':'2020-08-30 00:00:00',
                              'Complete\n1/31/2020':'2021-01-31 00:00:00'}



In [30]:
allocation1['_3rd_party_award_date'] = allocation1['_3rd_party_award_date'].replace(crosswalks.allocation_3rd_party_date)
allocation1['led'] = allocation1['led'].replace(crosswalks.allocation_led)     
allocation1['completion_date'] = allocation1['completion_date'].replace(crosswalks.allocation_completion_date) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [31]:
#allocation['_3rd_party_award_date'].value_counts()

In [32]:
#allocation['led'].value_counts()

In [33]:
#allocation['allocation_date'].value_counts()

In [34]:
#allocation['completion_date'].value_counts()

In [35]:
#allocation['date_branch_chief_receives_psa'].value_counts()

In [36]:
#allocation['expended_amount'].value_counts()

In [37]:
allocation1["expended_amount"] = (allocation1["expended_amount"]
                             .replace({'Deallocation': 0})
                             .astype('int64')
                            )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [38]:
#Coerce dates to datetime
date_columns = ['allocation_date', 'completion_date','_3rd_party_award_date', 'led', 'date_branch_chief_receives_psa',
       'date_regional_coordinator_receives_psa', 'date_oc_receives_psa',
       'date_opm_receives_psa', 'date_legal_receives_psa',
       'date_returned_to_pm',
       'date_psa_approved_by_local_agency', 'date_signed_by_drmt',
       'psa_expiry_date']

In [39]:
#Fill in NA based on data type
allocation1.fillna(allocation1.dtypes.replace({'float64': 0.0, 'object': 'None'}), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [40]:
#Coerce date columns
#https://sparkbyexamples.com/pandas/pandas-convert-multiple-columns-to-datetime-type/
for c in date_columns:
        allocation1[c] = allocation1[c].apply(pd.to_datetime, errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
allocation1 = allocation1.add_prefix("allocation_")

In [42]:
allocation1.head(2)

Unnamed: 0,allocation_award_year,allocation_project_#,allocation_grant_recipient,allocation_implementing_agency,allocation_ppno,allocation_project_id,allocation_ea,allocation_components,allocation_phase,allocation_allocation_amount,allocation_expended_amount,allocation_sb1_funding,allocation_sb1_budget_year,allocation_ggrf_funding,allocation_ggrf_budget_year,allocation_ctc_financial_resolution,allocation_ctc_allocation_amendment,allocation_ctc_waiver,allocation_calsta_waiver,allocation_allocation_date,allocation_completion_date,allocation_psa_#,allocation_ct_document_#,allocation__3rd_party_award_date,allocation_led,allocation_date_branch_chief_receives_psa,allocation_date_regional_coordinator_receives_psa,allocation_date_oc_receives_psa,allocation_date_opm_receives_psa,allocation_date_legal_receives_psa,allocation_date_returned_to_pm,allocation_date_psa_sent_to_local_agency,allocation_date_psa_approved_by_local_agency,allocation_date_signed_by_drmt,allocation_psa_expiry_date,allocation_lonp,allocation_prior_fiscal_years_to_2020,allocation_fiscal_year_2020_2021,allocation_fiscal_year_2021_2022,allocation_fiscal_year_2022_2023,allocation_fiscal_year_2023_2024,allocation_fiscal_year_2024_2025,allocation_fiscal_year_2025_2026,allocation_fiscal_year_2026_2027,allocation_fiscal_year_2027_2028,allocation_fiscal_year_2028_2029,allocation_fiscal_year_2029_2030,allocation_allocation_comments,allocation_non_network_integration_allocations_unique_percentage_split,allocation_psa_comments
0,2015.0,1.0,Antelope Valley Transit Authority,Antelope Valley Transit Authority,CP005,16000048,T343GA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,CONST,24403000.0,21714177,0.0,,24403000.0,2015-16,TIRCP-1516-02,,,Waiver-1920-17,2015-10-22,2022-03-30,07AVTA2015PS-01 A1 \n\n07AVTA2015PS-05,07AVTA2015PS\n*Listed under Unit 3040,2016-03-14,2022-03-31,NaT,NaT,NaT,NaT,NaT,NaT,0.0,NaT,2021-02-02,NaT,,24403000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Program Supplement be Amended to show a correction in the invoicing section and to include language that confirms funding for "" supporting infrastructure"" includes WAVE.",,
1,2015.0,2.0,Capitol Corridor Joint Powers Authority,Capitol Corridor Joint Powers Authority,CP012,16000276,R350GA,Track and curve improvements between San Jose and Martinez for faster journeys benefiting Capitol Corridor passengers,CONST,4620000.0,4619999,4620000.0,2015-16,0.0,2012-13,TIRCP-1516-07\nTech. Correction June 2017,TIRCP-1920-17A\n6/25/2020,,,2016-05-19,NaT,VARCCJPAPS-01\n,VARCCJPAPS-01,2016-06-01,2019-06-01,NaT,NaT,NaT,NaT,NaT,NaT,0.0,NaT,2016-12-13,2019-06-01,,4620000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [43]:
test_allocation = data_prep.clean_allocation()

In [44]:
assert set(test_allocation.columns) == set(allocation1.columns)

In [45]:
assert set(test_allocation.shape) == set(allocation1.shape)

## SAR

In [46]:
test_projects.columns

Index(['project_award_year', 'project_project_#', 'project_grant_recipient',
       'project_project_title', 'project_ppno', 'project_district',
       'project_county', 'project_project_description',
       'project_master_agreement_number',
       'project_master_agreement_expiration_date', 'project_project_manager',
       'project_regional_coordinator',
       'project_technical_assistance_calitp__y_n_',
       'project_technical_assistance_fleet__y_n_',
       'project_technical_assistance_network_integration__y_n_',
       'project_technical_assistance_priority_population__y_n_',
       'project_total_project_cost', 'project_tircp_award_amount__$_',
       'project_allocated_amount', 'project_unallocated_amount',
       'project_percentage_allocated', 'project_expended_amount',
       'project_other_funds_involved', 'project_award_cycle',
       'project_estimated_tircp_ghg_reductions',
       'project_cost_per_ghg_ton_reduced', 'project_increased_ridership',
       'project_serv

In [47]:
#Functions
#Columns to keep
allocation_cols = ['allocation_award_year','allocation_expended_amount','allocation_allocation_amount',
                                'allocation_components','allocation_grant_recipient', 
                                   'allocation_implementing_agency','allocation_ppno',
                                   'allocation_phase','allocation_led','allocation_allocation_date',
                                   'allocation_completion_date','allocation__3rd_party_award_date',
                                  'allocation_ea', 'allocation_sb1_funding',  'allocation_ggrf_funding',
                                   'allocation_project_id']
project_cols = ['project_project_manager','project_award_year', 'project_project_#','project_project_title',
                             'project_ppno',  'project_tircp_award_amount__$_',
                             'project_expended_amount','project_allocated_amount','project_grant_recipient']

numeric_cols = ['allocation_expended_amount','allocation_allocation_amount',
            'project_tircp_award_amount__$_','project_expended_amount',
            'Percent_of_Allocation_Expended', 'Percent_of_Award_Fully_Allocated']

dates = ["allocation_allocation_date", "allocation__3rd_party_award_date",
             "allocation_completion_date", "allocation_led"]

group_by_cols = ['project_award_year','project_project_#','project_project_manager',
                 'allocation_grant_recipient', 'allocation_implementing_agency',
                 'project_project_title', 'Percent_of_Award_Fully_Allocated','TIRCP_Award_Amount',
                 'allocation_components','project_ppno','allocation_phase',
                 "allocation_allocation_date",  "CON_Contract_Award_Date",
                 "Phase_Completion_Date", 'allocation_ea', 'allocation_project_id',]

sum_cols = ['allocation_allocation_amount','allocation_expended_amount','allocation_sb1_funding',
    'allocation_ggrf_funding']

max_cols = ['Percent_of_Allocation_Expended','Allocated_Before_July_31_2020']

# https://stackoverflow.com/questions/50102808/highlighting-the-difference-between-two-dataframes
# https://stackoverflow.com/questions/56647813/perform-operations-after-styling-in-a-dataframe
def highlight_diff(current, previous, color="pink"):
    # Define html attribute
    attr = "background-color: {}".format(color)
    # Where data != other set attribute
    return pd.DataFrame(
    np.where(current.ne(previous), attr, ""),
    index=current.index,
    columns=current.columns,
        )


In [48]:
#For table 2 in semi annual report
#Input project df 
def summary_SAR_table_two(df):
    #pivot
    df = (df
          .drop_duplicates()
          .groupby(['project_award_year'])
          .agg({'project_project_#':'count',
                'project_tircp_award_amount__$_':'sum', 
                'project_allocated_amount':'sum',
                'project_expended_amount':'sum'})
          .reset_index()
         )
    #renaming columns to match report
    df = (df.rename(columns = {'project_project_#':'Number_of_Awarded_Projects',
                               'project_tircp_award_amount__$_': 'Award_Amount',
                               'project_allocated_amount':'Amount_Allocated',
                               'project_expended_amount': 'Expended_Amount',
                               'project_award_year': 'Award_Year'})
         )
    #create percentages
    df['Expended_Percent_of_Awarded'] = (df['Expended_Amount']/df['Award_Amount'])
    df['Expended_Percent_of_Allocated'] = (df['Expended_Amount']/df['Amount_Allocated'])
    df['Percent_Allocated'] = (df['Amount_Allocated']/df['Award_Amount'])
    #transpose 
    df = df.set_index('Award_Year').T
    #grand totals for monetary columns
    list_to_add = ['Award_Amount','Amount_Allocated','Expended_Amount', 'Number_of_Awarded_Projects']
    df['Grand_Total']=df.loc[list_to_add, :].sum(axis=1)
    #grand total project_expended_amount of each monetary column to fill in percentages below.
    Exp = df.at['Expended_Amount','Grand_Total']
    Alloc = df.at['Amount_Allocated','Grand_Total']
    TIRCP = df.at['Award_Amount','Grand_Total']
    #filling in totals of percentages
    df.at['Expended_Percent_of_Awarded','Grand_Total'] = (Exp/TIRCP)
    df.at['Expended_Percent_of_Allocated','Grand_Total'] = (Exp/Alloc)
    df.at['Percent_Allocated','Grand_Total'] = (Alloc/TIRCP)
    #switching rows to correct order
    df = (df.reindex(['Number_of_Awarded_Projects',
                      'Award_Amount', 'Amount_Allocated',
                     'Percent_Allocated','Expended_Amount', 
                      'Expended_Percent_of_Awarded', 'Expended_Percent_of_Allocated'])
    )
    return df 


In [49]:

def sar():
    #Load in raw sheets
    df_project = data_prep.clean_project()
    df_allocation = data_prep.clean_allocation()
    #previous_sar = data_prep.load_previous_sar()
    
    #Function for summary table portion of the report
    summary = summary_SAR_table_two(df_project)
    
    #Only keeping certain columns
    df_project = (df_project[project_cols])
    df_allocation =(df_allocation[allocation_cols])
    
    #Join the 2 dataframes
    m1 = df_allocation.merge(df_project, how = "left", 
                                 left_on = ["allocation_ppno", "allocation_award_year"],
                                 right_on = ["project_ppno", "project_award_year"])
    #drop duplicates
    m1 = m1.drop_duplicates() 
    
    #Fill in missing dates with a fake one so it'll show up in the group by 
    missing_date = pd.to_datetime('2100-01-01')
    for i in dates:
        m1[i] = (m1[i]
                     .fillna(missing_date)
                     .apply(pd.to_datetime)
                    )
    
    #Add new columns with percentages and a new column to flag whether an allocation date is 
    #AFTER  7-31-2020 then blank, if BEFORE 7-31-2020 then X
    m1 = m1.assign(
    Percent_of_Allocation_Expended = (m1['allocation_expended_amount']/
                                      m1['allocation_allocation_amount']),
    Percent_of_Award_Fully_Allocated = (m1['allocation_allocation_amount']/
                                        m1['project_tircp_award_amount__$_']),
    Allocated_Before_July_31_2020 =   m1.apply(lambda x: ' ' if x.allocation_allocation_date 
                                        > pd.Timestamp(2020, 7, 31, 0) else 'X', axis=1))
    
        
    #Filter out projects that are excluded 
    m1 = (m1[(m1.allocation_allocation_amount > 0 ) & (m1.Percent_of_Allocation_Expended < 0.99)]) 
    
    #Fill in null values based on datatype of each column
    m1 = m1.fillna(m1.dtypes.replace({'float64': 0.0, 'int64': 0}))
    
    #Rename cols 
    m1 = m1.rename(columns = {'allocation_led': 'Phase_Completion_Date',
                                      'project_tircp_award_amount__$_': 'TIRCP_Award_Amount',
                                      'allocation__3rd_party_award_date':'CON_Contract_Award_Date'})

    #Pivot
    df_pivot =m1.groupby(group_by_cols).agg({**{e:'max' for e in max_cols}, **{e:'sum' for e in sum_cols}})

    #Reset index
    df_reset = df_pivot.reset_index() 
    
    #Highlight the differences between the previous report
    #And the current report 
    '''
    current_highlighted_diffs = df_reset.style.apply(
        highlight_diff, axis=None, previous=fake_SAR, color="pink"
    )
    '''
    #Save to GCS
    '''
    with pd.ExcelWriter(f"{GCS_FILE_PATH}TESTING_Semi_Annual_Report.xlsx") as writer:
        summary.to_excel(writer, sheet_name="Summary", index=True)
        df_pivot.to_excel(writer, sheet_name="FY", index=True)
        df_reset.to_excel(
            writer, sheet_name="Unpivoted_Current_Version", index=False
        )
        current_highlighted_diffs.to_excel(
            writer, sheet_name="Highlighted_Differences", index=False
        )
    '''
    return m1, df_pivot, summary

In [50]:
df1, df2, df3 = sar()



In [51]:
len(df1)

220

In [53]:
df2.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Percent_of_Allocation_Expended,Allocated_Before_July_31_2020,allocation_allocation_amount,allocation_expended_amount,allocation_sb1_funding,allocation_ggrf_funding
project_award_year,project_project_#,project_project_manager,allocation_grant_recipient,allocation_implementing_agency,project_project_title,Percent_of_Award_Fully_Allocated,TIRCP_Award_Amount,allocation_components,project_ppno,allocation_phase,allocation_allocation_date,CON_Contract_Award_Date,Phase_Completion_Date,allocation_ea,allocation_project_id,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,object,Antelope Valley Transit Authority,Antelope Valley Transit Authority,object,0.0,0.0,Network Integration,object,CONST,2020-08-13,2020-08-14,2023-08-12,T442GB,20000277,0.0,,250000.0,0,250000.0,0.0
0.0,0.0,object,Antelope Valley Transit Authority,Antelope Valley Transit Authority,object,0.0,0.0,Purchase of 11 Zero-Emission Vehicles and Supporting Infrastructure,object,CONST,2020-08-13,2021-07-14,2024-07-14,T442GA,20000276,0.0,,6253000.0,0,3002000.0,3251000.0


## Tableau

In [55]:
test_projects.head(2)

Unnamed: 0,project_award_year,project_project_#,project_grant_recipient,project_project_title,project_ppno,project_district,project_county,project_project_description,project_master_agreement_number,project_master_agreement_expiration_date,project_project_manager,project_regional_coordinator,project_technical_assistance_calitp__y_n_,project_technical_assistance_fleet__y_n_,project_technical_assistance_network_integration__y_n_,project_technical_assistance_priority_population__y_n_,project_total_project_cost,project_tircp_award_amount__$_,project_allocated_amount,project_unallocated_amount,project_percentage_allocated,project_expended_amount,project_other_funds_involved,project_award_cycle,project_estimated_tircp_ghg_reductions,project_cost_per_ghg_ton_reduced,project_increased_ridership,project_service_integration,project_improve_safety,project_project_readiness,project_funding_leverage,project_multi_agency_coordination_integration,project_priority_population_benefits___ab_1550_community_benefits,project_housing_co_benefits,project_local_agency_address,project_local_agency_city,project_local_agency_zip,project_local_agency_contact,project_local_agency_email,project_local_agency_phone_number,project_comments_additional_contacts
0,2015,1,Antelope Valley Transit Authority (AVTA),Regional Transit Interconnectivity & Environmental Sustability,CP005,7,LA,Purchase 13 60-foot articulated BRT buses and 16 45-foot electric commuter buses,64AVTA2015MA,2024-04-01 00:00:00,Yesenia Ochoa,Ryan Greenway,,,,,39478000,24403000,24403000,0,1.0,21714177.53,,1,"195,380 tons",,,,,,,,,,"42210 6th Street West\nLancaster, CA 93534",Lancaster,93534,Judy Fry,Jfry@avta.com,(611) 729-2234,
1,2015,2,Capitol Corridor Joint Powers Authority,Travel Time Reduction Project,CP012,4,VAR,"Track and curve improvements between San Jose and Martinez for faster journeys benefiting Capitol Corridor, ACE, and San Joaquins passengers",64CCJPA2022MA,2032-05-01 00:00:00,Doug Adams,Shannon Simonds,No,No,No,No,5420700,4620000,4620000,0,1.0,4619999.9,,1,"356,667 tons",,,,,,,,,,"BART 300 Lakeside, LKS-22",PO Box 12688\nOakland,94604-2688,Jim Allison,JimA@capitolcorridor.org,510.464.6994,


In [None]:
def tableau(df):  # DELETE DF() LATER
    df.columns = df.columns.str.replace("Project_", "")
    # Keeping only certain columns.
    df = df[
        [
            "PPNO",
            "Award_Year",
            "#",
            "Grant_Recipient",
            "Title",
            "District",
            "County",
            "Description",
            "Master_Agreement_Number",
            "Master_Agreement_Expiration_Date",
            "Manager",
            "Regional_Coordinator",
            "Technical_Assistance-Fleet_(Y/N)",
            "Technical_Assistance-Network_Integration_(Y/N)",
            "Technical_Assistance-Priority_Population_(Y/N)",
            "Total_Cost",
            "Technical_Assistance-CALITP_(Y/N)",
            "TIRCP_Award_Amount_($)",
            "Allocated_Amount",
            "Expended_Amount",
            "Other_Funds_Involved",
        ]
    ]

    # Rename
    df = df.rename(
        columns={"TIRCP_Award_Amount_($)": "TIRCP_Amount", "Title": "Project_Title"}
    )

    # Getting percentages & filling in with 0
    df["Expended_Percent"] = df["Expended_Amount"] / df["Allocated_Amount"]
    df["Allocated_Percent"] = df["Allocated_Amount"] / df["TIRCP_Amount"]

    # Subtract TIRCP with Allocated Amount with Unallocated
    df["Unallocated_Amount"] = df["TIRCP_Amount"] - df["Allocated_Amount"]
    # filling in for 0's
    df[["Expended_Percent", "Allocated_Percent"]] = df[
        ["Expended_Percent", "Allocated_Percent"]
    ].fillna(value=0)
    df[["Expended_Percent", "Allocated_Percent"]] = df[
        ["Expended_Percent", "Allocated_Percent"]
    ].replace(np.inf, 0)

    # Categorizing expended percentage into bins
    def expended_percent(row):

        if (row.Expended_Percent > 0) and (row.Expended_Percent < 0.26):
            return "1-25"
        elif (row.Expended_Percent > 0.25) and (row.Expended_Percent < 0.51):
            return "26-50"
        elif (row.Expended_Percent > 0.50) and (row.Expended_Percent < 0.76):
            return "51-75"
        elif (row.Expended_Percent > 0.75) and (row.Expended_Percent < 1.0):
            return "76-99"
        elif row.Expended_Percent == 0.0:
            return "0"
        else:
            return "100"

    df["Expended_Percent_Group"] = df.apply(lambda x: expended_percent(x), axis=1)

    # Categorize years and expended_percent_group into bins
    def progress(df):
        ### 2015 ###
        if (df["Award_Year"] == 2015) and (df["Expended_Percent_Group"] == "1-25") | (
            df["Expended_Percent_Group"] == "26-50"
        ):
            return "Behind"
        elif (df["Award_Year"] == 2015) and (
            df["Expended_Percent_Group"] == "76-99"
        ) | (df["Expended_Percent_Group"] == "51-75"):
            return "On Track"

        ### 2016 ###
        elif (df["Award_Year"] == 2016) and (df["Expended_Percent_Group"] == "1-25") | (
            df["Expended_Percent_Group"] == "26-50"
        ):
            return "Behind"
        elif (df["Award_Year"] == 2016) and (
            df["Expended_Percent_Group"] == "51-75"
        ) | (df["Expended_Percent_Group"] == "76-99"):
            return "On Track"

        ### 2018 ###
        elif (df["Award_Year"] == 2018) and (df["Expended_Percent_Group"] == "1-25"):
            return "Behind"
        elif (df["Award_Year"] == 2018) and (
            df["Expended_Percent_Group"] == "26-50"
        ) | (df["Expended_Percent_Group"] == "51-75"):
            return "On Track"
        elif (df["Award_Year"] == 2018) and (df["Expended_Percent_Group"] == "76-99"):
            return "Ahead"

        ### 2020 ###
        elif (df["Award_Year"] == 2020) and (df["Expended_Percent_Group"] == "1-25"):
            return "Behind"
        elif (df["Award_Year"] == 2020) and (df["Expended_Percent_Group"] == "26-50"):
            return "On Track"
        elif (df["Award_Year"] == 2020) and (
            df["Expended_Percent_Group"] == "51-75"
        ) | (df["Expended_Percent_Group"] == "76-99"):
            return "Ahead"

        ### 0 Expenditures ###
        elif df["Expended_Percent_Group"] == "0":
            return "No expenditures recorded"

        ### Else ###
        else:
            return "100% of allocated funds spent"

    df["Progress"] = df.apply(progress, axis=1)

    # Which projects are large,small, medium
    p75 = df.TIRCP_Amount.quantile(0.75).astype(float)
    p25 = df.TIRCP_Amount.quantile(0.25).astype(float)
    p50 = df.TIRCP_Amount.quantile(0.50).astype(float)

    def project_size(row):
        if (row.TIRCP_Amount > 0) and (row.TIRCP_Amount < p25):
            return "Small"
        elif (row.TIRCP_Amount > p25) and (row.TIRCP_Amount < p75):
            return "Medium"
        elif (row.TIRCP_Amount > p50) and (row.TIRCP_Amount > p75):
            return "Large"
        else:
            return "$0 recorded for TIRCP"

    df["Project_Category"] = df.apply(lambda x: project_size(x), axis=1)

    ### GCS ###
    # with pd.ExcelWriter(f"{GCS_FILE_PATH}Tableau_Sheet.xlsx") as writer:
    # df.to_excel(writer, sheet_name="Data", index=False)
    # return df

    return df