# Cleaning & improving old work
* Received a new workbook July 8, 2022.
* Create separate functions to import the actual sheets

In [1]:
import numpy as np
import pandas as pd
from siuba import *
from calitp import *

#Formatting the nb 
pd.options.display.max_columns = 100
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.options.display.float_format = "{:.2f}".format

#Import script
import A1_data_prep
import A2_tableau
import A3_semiannual_report

import crosswalks



In [2]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/tircp/"
FILE_NAME = "TIRCP_July_8_2022.xlsx"

In [3]:
#Open up the 3 sheets.
#Passing a list to sheet_name returns a dict of dataframes keyed by sheet name,
#so the workbook is fetched from GCS and parsed once instead of three times.
sheets = pd.read_excel(
    f"{GCS_FILE_PATH}{FILE_NAME}",
    sheet_name=["Project Tracking", "Agreement Allocations", "Invoice Tracking Sheet"],
)
project = to_snakecase(sheets["Project Tracking"])
allocation = to_snakecase(sheets["Agreement Allocations"])
invoice = to_snakecase(sheets["Invoice Tracking Sheet"])



## Functions

In [4]:
def ppno_slice(df):
    """Return a copy of ``df`` whose ``ppno`` values are cut to at most 5 characters.

    Every non-null PPNO is converted to text before slicing. Using the ``.str``
    accessor directly returns NaN for non-string entries of an object column and
    raises AttributeError on a purely numeric column, so numeric PPNOs (e.g. 1230)
    would be lost. Missing values are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ppno`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the truncated ``ppno`` column.
    """
    def _truncate(value):
        if pd.isna(value):
            return value
        # A numeric column that holds NaN stores whole numbers as floats; render 1230.0 as "1230".
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value)[:5]

    return df.assign(ppno=df["ppno"].map(_truncate))

In [5]:
allocation = ppno_slice(allocation)
project = ppno_slice(project)

In [6]:
script_SAR = A3_semiannual_report.full_sar_report()



## Check PPNO 

In [None]:
PPNO_project = set(project.ppno.unique().tolist())
PPNO_allocation = set(allocation.ppno.unique().tolist())

In [None]:
#Only 3 differing PPNO numbers
PPNO_project - PPNO_allocation 

In [None]:
differences = list(PPNO_allocation - PPNO_project)
f'{len(differences)} different PPNOS.'

In [None]:
differences

In [None]:
#Create a subset of allocation df with only the PPNOS that differ between the Allocation and Projects sheet
different_dfs = allocation[allocation.ppno.isin(differences)]

In [None]:
#Subset df for only rows of interest to find out which grant recipients & award years have different PPNOS
allocation_sub =different_dfs[['award_year', 'ppno','grant_recipient']].drop_duplicates()

In [None]:
#allocation_sub

### Use projects dataframe's PPNO as a source of truth
* Need to change Antelope Valley Transit Authority (AVTA) 2020's PPNO.  

In [None]:
#Crosswalk for allocation df: maps PPNOs found on the allocation sheet to the
#PPNO used for the same project on the projects sheet.
ppno_crosswalk_allocation = {'CP018': 'CP019','CP024':'CP043',
                             'CP021':'CP043', 'CPO02':'CP025',
                             'CP301': 'CP031', 'CP042':'CP031',
                             'CP053': 'CP052','CP032': 'CP034',
                             '1155N':'1155A','CP002':'CP000'}
#SBCTA's 2016 award gets its PPNO set by hand.
allocation.loc[(allocation["grant_recipient"] == "San Bernardino County Transportation Authority (SBCTA)") & (allocation['award_year'] == 2016), "ppno"] = 1230
allocation = allocation.sort_values(['award_year','grant_recipient'])
#Assign the result back instead of calling replace(inplace=True) on the selected column:
#the chained in-place form warns on recent pandas and does not update the dataframe
#when Copy-on-Write is enabled.
allocation['ppno'] = allocation['ppno'].replace(ppno_crosswalk_allocation)


In [None]:
#Forward-fill missing PPNOs from the previous row.
#Series.ffill() replaces the deprecated fillna(method="ffill") spelling.
allocation['ppno'] = allocation['ppno'].ffill()

In [None]:
#The projects dataframe needs no crosswalk; a few recipients just get their PPNO set directly
#(for some reason some of them read in as NaN).
recipient_col = project["grant_recipient"]

is_sbcta = recipient_col == "San Bernardino County Transportation Authority (SBCTA)"
project.loc[is_sbcta, "ppno"] = 1230

is_bart = recipient_col == "Bay Area Rapid Transit District (BART)"
project.loc[is_bart, "ppno"] = 'CP060'

is_big_blue_bus = recipient_col == "Santa Monica Big Blue Bus"
project.loc[is_big_blue_bus, "ppno"] = 'CP071'

#Only AVTA's 2020 award is overridden.
is_avta_2020 = (recipient_col == "Antelope Valley Transit Authority (AVTA)") & (project['award_year'] == 2020)
project.loc[is_avta_2020, "ppno"] = "CP059"

### Double check the sets

In [None]:
PPNO_project = set(project.ppno.unique().tolist())
PPNO_allocation = set(allocation.ppno.unique().tolist())

In [None]:
PPNO_allocation - PPNO_project

In [None]:
PPNO_project - PPNO_allocation

In [None]:
allocation[['award_year','ppno','grant_recipient']].drop_duplicates()

In [None]:
#preview subset of project_sub 
project[['award_year','ppno','grant_recipient']].drop_duplicates()

## Project

In [None]:
project[['award_year','ppno','grant_recipient','district', 'county',
         'tircp_award_amount__$_', 'allocated_amount', 'unallocated_amount',
       'percentage_allocated', 'expended_amount', 'other_funds_involved',
       'award_cycle', 'estimated_tircp_ghg_reductions',
       'cost_per_ghg_ton_reduced', 'increased_ridership',
       'service_integration', 'improve_safety', 'project_readiness',
       'funding_leverage', 'multi_agency_coordination_integration',
       'priority_population_benefits___ab_1550_community_benefits',
       'housing_co_benefits']].head(2)


In [None]:
#Fill in nulls based on data type: each column's dtype is mapped to its fill value
project.fillna(project.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64':0}), inplace=True)

#Fill in FY 
#Assign the result back rather than using replace(inplace=True) on the selected column,
#which warns on recent pandas and is a no-op when Copy-on-Write is enabled.
project["award_cycle"] = project["award_cycle"].replace({'FY 21/22': 4})

In [None]:
project['award_year'].value_counts()

In [None]:
project['award_cycle'].value_counts()

In [None]:
#project['percentage_allocated'].value_counts()

In [None]:
project = project.add_prefix("project_")

In [None]:
script_projects = A1_data_prep.clean_project()

In [None]:
assert script_projects.shape == project.shape

In [None]:
assert set(script_projects.columns) == set(project.columns)

## Allocation Agreement 
* Function within script isn't working

In [None]:
#Some rows are not completed: drop them.
#.copy() gives allocation1 its own data, so the column assignments made in the
#following cells do not raise SettingWithCopyWarning.
allocation1 = allocation.dropna(subset=['award_year', 'grant_recipient', 'ppno']).copy()
len(allocation1)

In [None]:
#Replacing values: free-text date cells mapped to a single 'YYYY-MM-DD HH:MM:SS' timestamp.
#Where a cell lists several dates, the first one is kept.
allocation_3rd_party_date = {'07/29/2020': '2020-07-29 00:00:00'}
#'June 30, 2019...' previously mapped to '2019 06-30 00:00:00' (missing dash).
allocation_led = {'2/1/2021\n\n10/31/2022': '2021-02-01 00:00:00',
                 'June 30, 2019\nSeptember 30, 2019': '2019-06-30 00:00:00',
                 'October 15, 2018\nSeptember 30, 2021': '2018-10-15 00:00:00'}
#NOTE(review): 'Complete\n1/31/2020' maps to a 2021 date while the source text says 2020;
#confirm which year is intended.
allocation_completion_date = {'6/30/2021\n12/31/2021\n10/20/2022':'2021-06-30 00:00:00',
                              'Complete\n8/30/2020':'2020-08-30 00:00:00',
                              'Complete\n1/31/2020':'2021-01-31 00:00:00'}



In [None]:
allocation1['_3rd_party_award_date'] = allocation1['_3rd_party_award_date'].replace(crosswalks.allocation_3rd_party_date)
allocation1['led'] = allocation1['led'].replace(crosswalks.allocation_led)     
allocation1['completion_date'] = allocation1['completion_date'].replace(crosswalks.allocation_completion_date) 

In [None]:
#allocation['_3rd_party_award_date'].value_counts()

In [None]:
#allocation['led'].value_counts()

In [None]:
#allocation['allocation_date'].value_counts()

In [None]:
#allocation['completion_date'].value_counts()

In [None]:
#allocation['date_branch_chief_receives_psa'].value_counts()

In [None]:
#allocation['expended_amount'].value_counts()

In [None]:
allocation1["expended_amount"] = (allocation1["expended_amount"]
                             .replace({'Deallocation': 0})
                             .astype('int64')
                            )

In [None]:
#Coerce dates to datetime
date_columns = ['allocation_date', 'completion_date','_3rd_party_award_date', 'led', 'date_branch_chief_receives_psa',
       'date_regional_coordinator_receives_psa', 'date_oc_receives_psa',
       'date_opm_receives_psa', 'date_legal_receives_psa',
       'date_returned_to_pm',
       'date_psa_approved_by_local_agency', 'date_signed_by_drmt',
       'psa_expiry_date']

In [None]:
#Fill in NA based on data type
allocation1.fillna(allocation1.dtypes.replace({'float64': 0.0, 'object': 'None'}), inplace=True)

In [None]:
#Coerce date columns
#https://sparkbyexamples.com/pandas/pandas-convert-multiple-columns-to-datetime-type/
for c in date_columns:
        allocation1[c] = allocation1[c].apply(pd.to_datetime, errors='coerce')

In [None]:
allocation1 = allocation1.add_prefix("allocation_")

In [None]:
script_allocation = A1_data_prep.clean_allocation()

In [None]:
assert set(script_allocation.columns) == set(allocation1.columns)

In [None]:
script_allocation.shape 

In [None]:
allocation1.shape

## PAP

## SAR

In [None]:
#Functions
#Columns to keep
allocation_cols = ['allocation_award_year','allocation_expended_amount','allocation_allocation_amount',
                                'allocation_components','allocation_grant_recipient', 
                                   'allocation_implementing_agency','allocation_ppno',
                                   'allocation_phase','allocation_led','allocation_allocation_date',
                                   'allocation_completion_date','allocation__3rd_party_award_date',
                                  'allocation_ea', 'allocation_sb1_funding',  'allocation_ggrf_funding',
                                   'allocation_project_id']
project_cols = ['project_project_manager','project_award_year', 'project_project_#','project_project_title',
                             'project_ppno',  'project_tircp_award_amount__$_',
                             'project_expended_amount','project_allocated_amount','project_grant_recipient']

numeric_cols = ['allocation_expended_amount','allocation_allocation_amount',
            'project_tircp_award_amount__$_','project_expended_amount',
            'Percent_of_Allocation_Expended', 'Percent_of_Award_Fully_Allocated']

dates = ["allocation_allocation_date", "allocation__3rd_party_award_date",
             "allocation_completion_date", "allocation_led"]

group_by_cols = ['project_award_year','project_project_#','project_project_manager',
                 'allocation_grant_recipient', 'allocation_implementing_agency',
                 'project_project_title', 'Percent_of_Award_Fully_Allocated','TIRCP_Award_Amount',
                 'allocation_components','project_ppno','allocation_phase',
                 "allocation_allocation_date",  "CON_Contract_Award_Date",
                 "Phase_Completion_Date", 'allocation_ea', 'allocation_project_id',]

sum_cols = ['allocation_allocation_amount','allocation_expended_amount','allocation_sb1_funding',
    'allocation_ggrf_funding']

max_cols = ['Percent_of_Allocation_Expended','Allocated_Before_July_31_2020']




In [None]:
#For table 2 in semi annual report
#Input project df 
def summary_SAR_table_two(df):
    """Build table 2 of the semi-annual report from the projects dataframe.

    Duplicate rows are dropped, projects are aggregated per award year
    (project count plus summed award/allocated/expended amounts), three ratio
    rows are derived, and the result is transposed so award years are columns
    and metrics are rows. A ``Grand_Total`` column is appended: sums for the
    count and amount rows, and ratios of those sums for the percentage rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Projects dataframe with the ``project_``-prefixed columns used below.

    Returns
    -------
    pandas.DataFrame
        Metrics (rows, in report order) by award year plus ``Grand_Total``.
    """
    aggregations = {
        'project_project_#': 'count',
        'project_tircp_award_amount__$_': 'sum',
        'project_allocated_amount': 'sum',
        'project_expended_amount': 'sum',
    }
    # Column names as they appear in the report.
    report_names = {
        'project_project_#': 'Number_of_Awarded_Projects',
        'project_tircp_award_amount__$_': 'Award_Amount',
        'project_allocated_amount': 'Amount_Allocated',
        'project_expended_amount': 'Expended_Amount',
        'project_award_year': 'Award_Year',
    }
    # Percentage row -> (numerator row, denominator row).
    ratios = {
        'Expended_Percent_of_Awarded': ('Expended_Amount', 'Award_Amount'),
        'Expended_Percent_of_Allocated': ('Expended_Amount', 'Amount_Allocated'),
        'Percent_Allocated': ('Amount_Allocated', 'Award_Amount'),
    }

    # One row per award year.
    deduped = df.drop_duplicates()
    by_year = deduped.groupby(['project_award_year']).agg(aggregations).reset_index()
    by_year = by_year.rename(columns=report_names)
    for ratio_name, (numerator, denominator) in ratios.items():
        by_year[ratio_name] = by_year[numerator] / by_year[denominator]

    # Award years become columns, metrics become rows.
    table = by_year.set_index('Award_Year').T

    # Grand totals only make sense for the count and amount rows ...
    summed_rows = ['Award_Amount', 'Amount_Allocated', 'Expended_Amount',
                   'Number_of_Awarded_Projects']
    table['Grand_Total'] = table.loc[summed_rows, :].sum(axis=1)

    # ... the percentage rows are recomputed from those totals.
    totals = {row: table.at[row, 'Grand_Total'] for row in summed_rows}
    for ratio_name, (numerator, denominator) in ratios.items():
        table.at[ratio_name, 'Grand_Total'] = totals[numerator] / totals[denominator]

    # Row order used by the report.
    row_order = [
        'Number_of_Awarded_Projects',
        'Award_Amount',
        'Amount_Allocated',
        'Percent_Allocated',
        'Expended_Amount',
        'Expended_Percent_of_Awarded',
        'Expended_Percent_of_Allocated',
    ]
    return table.reindex(row_order)


In [None]:
'''
def sar():
    #Load in raw sheets
    df_project = A1_data_prep.clean_project()
    df_allocation = A1_data_prep.clean_allocation()
    previous_sar = A1_data_prep.load_previous_sar()
    
    #Function for summary table portion of the report
    summary = summary_SAR_table_two(df_project)
    
    #Only keeping certain columns
    df_project = (df_project[project_cols])
    df_allocation =(df_allocation[allocation_cols])
    
    #Join the 2 dataframes
    m1 = df_allocation.merge(df_project, how = "left", 
                                 left_on = ["allocation_ppno", "allocation_award_year"],
                                 right_on = ["project_ppno", "project_award_year"])
    #drop duplicates
    m1 = m1.drop_duplicates() 
    
    #Fill in missing dates with a fake one so it'll show up in the group by 
    missing_date = pd.to_datetime('2100-01-01')
    for i in dates:
        m1[i] = (m1[i]
                     .fillna(missing_date)
                     .apply(pd.to_datetime)
                    )
    
    #Add new columns with percentages and a new column to flag whether an allocation date is 
    #AFTER  7-31-2020 then blank, if BEFORE 7-31-2020 then X
    m1 = m1.assign(
    Percent_of_Allocation_Expended = (m1['allocation_expended_amount']/
                                      m1['allocation_allocation_amount']),
    Percent_of_Award_Fully_Allocated = (m1['allocation_allocation_amount']/
                                        m1['project_tircp_award_amount__$_']),
    Allocated_Before_July_31_2020 =   m1.apply(lambda x: ' ' if x.allocation_allocation_date 
                                        > pd.Timestamp(2020, 7, 31, 0) else 'X', axis=1))
    
        
    #Filter out projects that are excluded 
    m1 = (m1[(m1.allocation_allocation_amount > 0 ) & (m1.Percent_of_Allocation_Expended < 0.99)]) 
    
    #Fill in null values based on datatype of each column
    m1 = m1.fillna(m1.dtypes.replace({'float64': 0.0, 'int64': 0}))
    
    #Rename cols 
    m1 = m1.rename(columns = {'allocation_led': 'Phase_Completion_Date',
         'project_tircp_award_amount__$_': 'TIRCP_Award_Amount',
         'allocation__3rd_party_award_date':'CON_Contract_Award_Date'})

    #Pivot
    df_pivoted =m1.groupby(group_by_cols).agg({**{e:'max' for e in max_cols}, **{e:'sum' for e in sum_cols}})
    
    #Apply styling to show difference between current SAR and previous SAR
    #https://stackoverflow.com/questions/17095101/compare-two-dataframes-and-output-their-differences-side-by-side
    #Reset index from dataframe above
    df_current = df_pivoted.reset_index() 
    df_all = pd.concat([df_current, previous_sar], keys=['Current_SAR', 'Previous_SAR'], axis =1)
    df_all = df_all.swaplevel(axis='columns')[df_current.columns[1:]]
    
    def highlight_diff(data, color='pink'):
        attr = 'background-color: {}'.format(color)
        other = data.xs('Current_SAR', axis='columns', level=-1)
        return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''),
                        index=data.index, columns=data.columns)
    
    df_highlighted = df_all.style.apply(highlight_diff, axis=None)
    
    #Save to GCS
    with pd.ExcelWriter(f"{GCS_FILE_PATH}Script_Semi_Annual_Report.xlsx") as writer:
        summary.to_excel(writer, sheet_name="Summary", index=True)
        df_pivoted.to_excel(writer, sheet_name="FY", index=True)
        df_current.to_excel(writer, sheet_name="Unpivoted_Current_Version", index=False)
        df_highlighted.to_excel(writer, sheet_name="highlighted")
    return df_current, df_pivoted, summary
'''

In [None]:
# NOTE(review): `sar` is only defined inside the triple-quoted string in the cell above,
# so on a fresh kernel this call raises NameError. Re-enable that definition (or point this
# at the script's equivalent) before running the cell.
df1, df2, df3 = sar()

In [None]:
previous_sar = A1_data_prep.load_previous_sar()

In [None]:
def highlight_differences(df1, df2):
    """Return a Styler that shades the cells where ``df2`` differs from ``df1``.

    The two frames are placed side by side under the keys ``Current_SAR``
    (``df1``) and ``Previous_SAR`` (``df2``), regrouped so that each field's
    current and previous values are adjacent, and every cell that is not equal
    to the current value of the same field is given a pink background.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Current report. Its first column is left out of the comparison.
    df2 : pandas.DataFrame
        Report to compare against. (Previously this argument was ignored and
        the global ``previous_sar`` was used instead.)

    Returns
    -------
    pandas.io.formats.style.Styler
    """
    df_current = df1.reset_index()
    df_all = pd.concat([df_current, df2], keys=['Current_SAR', 'Previous_SAR'], axis=1)
    # Field name becomes the outer column level; keep every df1 column except the first.
    df_all = df_all.swaplevel(axis='columns')[df1.columns[1:]]

    def highlight_diff(data, color='pink'):
        # CSS for cells whose value is not equal to the Current_SAR value of the same field.
        attr = 'background-color: {}'.format(color)
        other = data.xs('Current_SAR', axis='columns', level=-1)
        return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''),
                            index=data.index, columns=data.columns)

    df_highlighted = df_all.style.apply(highlight_diff, axis=None)
    return df_highlighted

In [None]:
test_function = highlight_differences(df1, previous_sar)
#test_function

In [None]:
#https://stackoverflow.com/questions/17095101/compare-two-dataframes-and-output-their-differences-side-by-side
df_all = pd.concat([df1, previous_sar], keys=['Current_SAR', 'Previous_SAR'], axis =1)

In [None]:
df_all = df_all.swaplevel(axis='columns')[df1.columns[1:]]

In [None]:
def highlight_diff(data, color='pink'):
    """Return a frame of CSS strings marking cells that differ from ``Current_SAR``.

    ``data`` has two column levels: the field name and, innermost, the report
    label. Each cell is compared with the ``Current_SAR`` value of the same
    field; differing cells get ``background-color: <color>``, the rest get an
    empty string. The result has the same index and columns as ``data``.
    """
    current_values = data.xs('Current_SAR', axis='columns', level=-1)
    is_different = data.ne(current_values, level=0)
    css_rule = 'background-color: {}'.format(color)
    styles = np.where(is_different, css_rule, '')
    return pd.DataFrame(styles, index=data.index, columns=data.columns)

In [None]:
df_highlighted = df_all.style.apply(highlight_diff, axis=None)

## Tableau
* Maybe add geodataframe component?
* Project Size function not working, have to embed it in the entire function as opposed to outside. Fix.


def _project_size(tircp, p25, p50):
    """Label one TIRCP award amount relative to the 25th and 50th percentile cutoffs."""
    if (tircp > 0) and (tircp < p25):
        return "Small"
    elif (tircp > p25) and (tircp < p50):
        return "Medium"
    elif tircp > p50:
        return "Large"
    elif tircp == 0:
        return "$0 recorded for TIRCP"
    # Amounts exactly on a cutoff (and anything else, e.g. NaN) land here.
    return "Medium"


def tableau():
    """Build the project-level sheet used for Tableau and save it to GCS.

    Loads the cleaned projects dataframe, keeps the columns needed for the
    dashboard, adds percentage/unallocated columns and three categorical
    columns (expended % group, progress, project size), prettifies the column
    names and writes the result to ``Script_Tableau_Sheet.xlsx`` under
    ``GCS_FILE_PATH``.

    Returns
    -------
    pandas.DataFrame
        The dataframe that was written to the ``Data`` sheet.
    """
    df = A1_data_prep.clean_project()
    # Keeping only certain columns.
    keep_cols = ['project_award_year','project_grant_recipient',
       'project_project_title', 'project_ppno', 'project_district',
       'project_technical_assistance_calitp__y_n_',
       'project_technical_assistance_fleet__y_n_',
       'project_technical_assistance_network_integration__y_n_',
       'project_technical_assistance_priority_population__y_n_',
       'project_total_project_cost', 'project_tircp_award_amount__$_',
       'project_allocated_amount', 'project_unallocated_amount',
       'project_expended_amount', 'project_award_cycle',
       'project_estimated_tircp_ghg_reductions',
       'project_cost_per_ghg_ton_reduced', 'project_increased_ridership',
       'project_service_integration', 'project_improve_safety',
       'project_project_readiness','project_county']
    df = df[keep_cols]

    #Create new cols
    df = df.assign(
        Expended_Percent = (df["project_expended_amount"] / df["project_allocated_amount"]),
        Allocated_Percent = (df["project_allocated_amount"] / df["project_tircp_award_amount__$_"]),
        Unallocated_Amount = (df["project_tircp_award_amount__$_"] - df["project_allocated_amount"]),
        Projects_Funded_Percent = (df['project_tircp_award_amount__$_']/df['project_total_project_cost'])
    )

    # Nulls in the new columns become 0.
    new_cols_list = ["Expended_Percent", "Allocated_Percent", "Unallocated_Amount", 'Projects_Funded_Percent']
    df[new_cols_list] = df[new_cols_list].fillna(0)

    #Replace districts & counties with their full names
    df['project_district'] = df['project_district'].replace(crosswalks.full_ct_district)
    df['project_county'] = df['project_county'].replace(crosswalks.full_county)

    #Categorize projects into expended % bins
    df["Expended_Percent_Group"] = df.apply(A2_tableau.expended_percent, axis=1)

    #Categorize projects whether they are ahead/behind/0 expenditures/etc
    df["Progress"] = df.apply(A2_tableau.progress, axis=1)

    #Rename TIRCP column to something cleaner
    df = df.rename(columns={'project_tircp_award_amount__$_': "tircp"})

    #Categorize projects as small/medium/large based on TIRCP amount.
    #The cutoffs are passed to the helper explicitly, so it can live outside this function.
    #NOTE(review): "Large" is anything above the median; the 75th percentile was computed
    #but never used, so it has been dropped.
    p50 = float(df["tircp"].quantile(0.50))
    p25 = float(df["tircp"].quantile(0.25))
    df["Project_Category"] = df["tircp"].apply(_project_size, args=(p25, p50))

    #Clean up column names.
    #regex=False: since pandas 2.0 str.replace treats the pattern literally by default,
    #so the old '[_]' pattern no longer matched the underscores.
    df.columns = (df.columns
                  .str.replace('_', ' ', regex=False)
                  .str.replace('project', '', regex=False)
                  .str.title()
                  .str.strip()
                 )
    ### GCS ###
    with pd.ExcelWriter(f"{GCS_FILE_PATH}Script_Tableau_Sheet.xlsx") as writer:
        df.to_excel(writer, sheet_name="Data", index=False)
    return df


In [None]:
#nb_tableau = tableau()

In [None]:
#nb_tableau.shape

In [None]:
#nb_tableau[['Tircp','Expended Amount','Ppno','Expended Percent','Progress','Project Category']]

In [None]:
script_tableau = A2_tableau.tableau()

In [None]:
#script_tableau.shape

In [None]:
#nb_tableau.equals(script_tableau)