## GGRF Accounting Analysis
* Goal: Narrow down projects from Cycle 3 and beyond that need intervention with allocating funds from GGRF 
* Project ID as reference: 18000321

In [127]:
import A1_data_prep
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [128]:
# Function to clean up project IDS
def clean_project_ids(df, project_id_col: str):
    # Cast to string
    df[project_id_col] = df[project_id_col].astype("str")

    # Remove extra zeroes that may appear in front of ID
    # https://stackoverflow.com/questions/45923189/remove-first-character-from-pandas-column-if-the-number-1
    df[project_id_col] = df[project_id_col].apply(
        lambda x: x[2:] if x.startswith("0") else x
    )

    # Slice to 8 digits if it exceeds that number
    df[project_id_col] = df[project_id_col].str.slice(start=0, stop=8)
    return df

### Allocation Prep

In [129]:
def prep_allocation():
    alloc_subset = [
    "allocation_award_year",
    "allocation_project_id",
    "allocation_phase",
    "allocation_sb1_funding",
    "allocation_ggrf_funding",]
    
    df = A1_data_prep.clean_allocation()[alloc_subset]
    
    # Filter out project ID that are none and projects starting in 2018 
    df = (df[
    ((df.allocation_project_id != "None") & (df.allocation_award_year >= 2018))]).reset_index(drop = True)
    
    # Clean up Project ID
    df = clean_project_ids(df, "allocation_project_id")
    
    # Group multiple project ID and phase into one row: sum up SB1 and GGRF
    df = df.groupby(["allocation_award_year",'allocation_project_id','allocation_phase']).agg({'allocation_sb1_funding':'sum', 'allocation_ggrf_funding':'sum'}).reset_index()
    
    # Sum up SB1 and GGRF
    df['Sum of Allocation Amount'] = df['allocation_sb1_funding'] + df['allocation_ggrf_funding']
    
    return df

In [130]:
alloc = prep_allocation()

In [131]:
# Original allocation sheet without "None" Project IDs filtered out
alloc_original = A1_data_prep.clean_allocation()

In [133]:
alloc_original.shape

(370, 53)

In [134]:
len(alloc), alloc.allocation_project_id.nunique()

(200, 200)

### Expenditures Prep 

In [135]:
def prep_expenditures():
    # Load in original sheet
    df = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls",
        sheet_name="Download",
    )

    # First 10 columns or so are not data, drop them
    df = df.iloc[9:].reset_index(drop=True)

    # The first row contains column names - update it to the column
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    df = to_snakecase(df)
    
    # Only grab records from 2018 and beyond
    df = (df.loc[df["fy"] > "2017"]).reset_index()
    
    # Subset to only columns of interest
    subset = ["fy", "fund", "appr_unit", "project", "project_name", "tot_exp"]
    df = df[subset]
    
    # Clean up project ID
    df = clean_project_ids(df, "project")
    
    return df

In [136]:
# Search for the appr_unit code that corresponds with either SB1/GGRF
# Aggregate the results by project ID and name
def split_ggrf_sb1_expenditures(df, appr_unit_wanted: str, ):
    
    # Search through for the appr_unit
    df = df.loc[df.appr_unit.str.contains(appr_unit_wanted)].reset_index(drop=True)
    
    # Group by project ID 
    df = df.groupby(['project',]).agg({'tot_exp':'sum'}).reset_index()
    
    # Rename total expenditures column with the appr_unit
    df = df.rename(columns = {'tot_exp': f'{appr_unit_wanted}_tot_exp'}) 
    
    return df

In [137]:
# Load expenditures, split df by GGRF and Sb1, and join them bacak together for the final dataframe.
def aggregate_expenditures():
    df = prep_expenditures()
    
    # Find and sum up GGRF funds
    df_ggrf =  split_ggrf_sb1_expenditures(df, '301R')
    
    # Find and sum up SB1 funds
    df_sb1 =  split_ggrf_sb1_expenditures(df, '101')
    
    m1 = pd.merge(
    df_ggrf,
    df_sb1,
    how="outer",
    on = ["project"],
    indicator=True,)
    
    # Create sum column of both funds
    m1['Sum of Expenditure'] = m1['301R_tot_exp'] + m1['101_tot_exp'] 
    return m1 

In [138]:
expenditure =  aggregate_expenditures()

In [139]:
expenditure._merge.value_counts()

right_only    80
both          52
left_only     40
Name: _merge, dtype: int64

#### Double check project ids w/ original dataframe before groupby

In [140]:
og_expenditure = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls",
        sheet_name="Download",
    )

In [141]:
og_expenditure = og_expenditure.iloc[9:].reset_index(drop=True)

In [142]:
og_expenditure.columns =  og_expenditure.iloc[0]

In [143]:
og_expenditure =  og_expenditure.drop(og_expenditure.index[0]).reset_index(drop=True)

In [144]:
og_expenditure = (og_expenditure.loc[og_expenditure["FY"] > "2017"]).reset_index()

In [145]:
og_expenditure.FY.value_counts()

2022    279
2021    222
2020    208
2023    189
2018    180
2019    172
Name: FY, dtype: int64

In [146]:
og_expenditure = og_expenditure[
    og_expenditure["Appr Unit"].str.contains("101") | og_expenditure["Appr Unit"].str.contains("301R")
]

In [147]:
og_expenditure = clean_project_ids(og_expenditure, "Project")

In [148]:
len(expenditure), expenditure.project.nunique(), len(og_expenditure), og_expenditure.Project.nunique()

(172, 172, 423, 172)

In [149]:
# check to see if 2 columns are the same
project_id_grouped = set(expenditure.project.unique().tolist())
project_id_og = set( og_expenditure.Project.unique().tolist())
project_id_og - project_id_grouped

set()

### Project Status Prep

In [150]:
def prep_project_status():
    df = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}Project status for fund 0046 as of 12-5-22.xls",
        sheet_name="Download",
    )
    
    # First few rows are not data
    df = df.iloc[6:].reset_index(drop=True)
    
    # Cast first row as column names
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Coerce certain columns to numeric
    df[["Billed", "Reimbursements"]] = df[["Billed", "Reimbursements"]].apply(
        pd.to_numeric, errors="coerce"
    )
    # Group project id so the same ones will be on one line
    df = (
        df.groupby(["Project"])
        .agg({"Billed": "sum", "Reimbursements": "sum"})
        .reset_index()
    )

    df = to_snakecase(df)
    
    # Clean project names
    df = clean_project_ids(df, "project")
    
    return df

In [151]:
project_status = prep_project_status()

In [152]:
project_status.project.nunique()

52

#### Double check project ids w/ original dataframe before groupby

In [153]:
project_status_og = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}Project status for fund 0046 as of 12-5-22.xls",
        sheet_name="Download",
    )

In [154]:
# First few rows are not data
project_status_og = project_status_og.iloc[6:].reset_index(drop=True)
    
# Cast first row as column names
project_status_og.columns = project_status_og.iloc[0]

# Drop the first row as they are now column names
project_status_og= project_status_og.drop(project_status_og.index[0]).reset_index(drop=True)

In [155]:
project_status_og.Project.nunique()

52

## Final Analysis - DRAFT
### Merge 1

In [156]:
m1 = pd.merge(
    alloc,
    expenditure.drop(columns = ['_merge']),
    how="left",
    left_on="allocation_project_id",
    right_on="project",
    indicator=True,
)

In [157]:
m1._merge.value_counts()

both          110
left_only      90
right_only      0
Name: _merge, dtype: int64

In [158]:
m1 = m1.rename(columns = {'_merge':'Allocation_Expenditure_Merge'})

In [159]:
m1.Allocation_Expenditure_Merge = m1.Allocation_Expenditure_Merge.replace(
    {'both':'Project ID in TIRCP Tracking and 3010_3020 Expenditure for fund 0046',
     'left_only':'Project ID only in TIRCP Tracking'})

In [160]:
m1.Allocation_Expenditure_Merge.value_counts()

Project ID in TIRCP Tracking and 3010_3020 Expenditure for fund 0046    110
Project ID only in TIRCP Tracking                                        90
Name: Allocation_Expenditure_Merge, dtype: int64

In [161]:
m1 = m1.drop(columns = ['project'])

In [162]:
len(m1), len(alloc)

(200, 200)

### Merge 2

In [163]:
m2 = pd.merge(
    m1,
    project_status,
    how="left",
    left_on="allocation_project_id",
    right_on="project",
    indicator=True,
)

In [164]:
m2._merge.value_counts()

left_only     150
both           50
right_only      0
Name: _merge, dtype: int64

In [165]:
m2 = m2.drop(columns = ['project'])

In [166]:
m2 = m2.rename(columns = {'_merge':'Allocation_Project_Status_Merge'})

In [167]:
m2.Allocation_Project_Status_Merge = m2.Allocation_Project_Status_Merge.replace(
    {'both':'Project ID in TIRCP Tracking and Project_Status',
     'left_only':'Project ID only in TIRCP Tracking'})

In [168]:
(m2.isna().sum())/len(m2)

allocation_award_year             0.00
allocation_project_id             0.00
allocation_phase                  0.00
allocation_sb1_funding            0.00
allocation_ggrf_funding           0.00
Sum of Allocation Amount          0.00
301R_tot_exp                      0.73
101_tot_exp                       0.46
Sum of Expenditure                0.74
Allocation_Expenditure_Merge      0.00
billed                            0.75
reimbursements                    0.75
Allocation_Project_Status_Merge   0.00
dtype: float64

### Clean up

In [169]:
m2 = A1_data_prep.clean_up_columns(m2).fillna(0)

In [170]:
m2 = m2.rename(columns = 
    {'Id':'Project ID',
     'Sb1 Funding': 'Sum of Sb1 Funding', 
     'Ggrf Funding': 'Sum of GGRF Funding',
     '101 Tot Exp':"Sum of SB1 (0046, xx101 Expenditure)",
     '301R Tot Exp': "Sum of GGRF (0046, xx301R expenditure)",
     'Billed':'Sum Billed',
     'Reimbursements':'Sum of Reimbursements'})


In [171]:
m2['Remaining Allocation'] = m2['Sum Of Allocation Amount'] - m2['Sum Of Expenditure']

In [172]:
# Tag if a project's desc has a SHS keyword/SHS keywords or not
def keywords(row):
    
    if row["Sum Of Allocation Amount"] == row["Sum of GGRF Funding"]:
        return "100% GGRF/0046R: no action needed"
    
    elif row["Sum Of Allocation Amount"] == row["Sum of Sb1 Funding"]:
        return "100% SB1/0046: no action needed"
    
    elif (row['Allocation Expenditure Merge'] == 'Project ID only in TIRCP Tracking') & (row['Allocation Project Status Merge'] == 'Project ID only in TIRCP Tracking'): 
        return "Project ID only found in TIRCP Tracking 2.0"
    
    elif row["Sum Of Allocation Amount"] == 0: 
        return "No monetary info from TIRCP Tracking 2.0"
    
    elif row["Remaining Allocation"] == 0: 
        return "Fully expended: no action needed"
    
    elif row["Remaining Allocation"] >= 400000: 
        return "More than $400k: need to correct funding"
    
    elif row["Remaining Allocation"] < 0: 
        return "Negative Remaining Allocation"
    
    elif row["Remaining Allocation"] <= 400000: 
        return "Less than $400k: no action needed"
        
    # Everything else is not enough info
    else:
        return "Manual Comment"
    

In [173]:
# Apply the function
m2["Comments"] = m2.apply(keywords, axis=1)

In [174]:
m2.Comments.value_counts()

Project ID only found in TIRCP Tracking 2.0    86
More than $400k: need to correct funding       52
Less than $400k: no action needed              20
100% SB1/0046: no action needed                18
Fully expended: no action needed               17
Negative Remaining Allocation                   4
100% GGRF/0046R: no action needed               3
Name: Comments, dtype: int64

In [175]:
right_col_order = ['Award Year', 'Project ID', 'Phase', 'Sum of Sb1 Funding',
       'Sum of GGRF Funding', 'Sum Of Allocation Amount',
       'Sum of GGRF (0046, xx301R expenditure)',
       'Sum of SB1 (0046, xx101 Expenditure)', 'Sum Of Expenditure',
       'Sum Billed', 'Sum of Reimbursements',
       'Remaining Allocation',  'Comments',  'Allocation Project Status Merge', 'Allocation Expenditure Merge', ]

In [176]:
m2 = m2[right_col_order]

In [177]:
200-86

114

In [180]:
# Export  
# m2.to_excel(f"{A1_data_prep.GCS_FILE_PATH}accounting_analysis.xlsx", sheet_name="accounting_analysis", index=False)