## GGRF Accounting Analysis
* Goal: Narrow down projects that need intervention with allocating funds from GGRF 
* Project ID as reference: 18000321

In [1]:
import A1_data_prep
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [2]:
# Function to clean up project IDS
def clean_project_ids(df, project_id_col: str):
    # Cast to string
    df[project_id_col] = df[project_id_col].astype("str")

    # Remove \n
    #df[project_id_col] = df[project_id_col].str.replace("\n", "")

    # Remove extra zeroes that may appear in front of ID
    # https://stackoverflow.com/questions/45923189/remove-first-character-from-pandas-column-if-the-number-1
    df[project_id_col] = df[project_id_col].apply(
        lambda x: x[2:] if x.startswith("0") else x
    )

    # Slice to 8 digits if it exceeds that number
    df[project_id_col] = df[project_id_col].str.slice(start=0, stop=8)
    return df

### Allocation Prep

In [3]:
def prep_allocation():
    alloc_subset = [
    "allocation_project_id",
    "allocation_phase",
    "allocation_sb1_funding",
    "allocation_ggrf_funding",]
    
    df = A1_data_prep.clean_allocation()[alloc_subset]
    
    # Filter out project ID that are none
    df = (df.loc[df.allocation_project_id != "None"]).reset_index(drop=True)
    
    # Clean up Project ID
    df = clean_project_ids(df, "allocation_project_id")
    
    # Group multiple project ID and phase into one row: sum up SB1 and GGRF
    df = df.groupby(['allocation_project_id','allocation_phase']).agg({'allocation_sb1_funding':'sum', 'allocation_ggrf_funding':'sum'}).reset_index()
    
    # Sum up SB1 and GGRF
    df['Sum of Allocation Amount'] = df['allocation_sb1_funding'] + df['allocation_ggrf_funding']
    
    return df

In [4]:
alloc = prep_allocation()

In [5]:
# Original allocation sheet without "None" Project IDs filtered out
alloc_original = A1_data_prep.clean_allocation()

In [6]:
alloc_original.shape

(370, 53)

In [7]:
alloc.shape

(247, 5)

In [8]:
alloc.allocation_project_id.nunique()

246

### Expenditures Prep 

In [9]:
def prep_expenditures():
    # Load in original sheet
    df = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls",
        sheet_name="Download",
    )

    # First 10 columns or so are not data, drop them
    df = df.iloc[9:].reset_index(drop=True)

    # The first row contains column names - update it to the column
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    df = to_snakecase(df)
    
    # Only grab records from 2017 and beyond
    df = (df.loc[df["fy"] > "2017"]).reset_index()
    
    # Subset to only columns of interest
    subset = ["fy", "fund", "appr_unit", "project", "project_name", "tot_exp"]
    df = df[subset]
    
    # Clean up project ID
    df = clean_project_ids(df, "project")
    
    return df

In [10]:
# Search for the appr_unit code that corresponds with either SB1/GGRF
# Aggregate the results by project ID and name
def split_ggrf_sb1_expenditures(df, appr_unit_wanted: str, ):
    
    # Search through for the appr_unit
    df = df.loc[df.appr_unit.str.contains(appr_unit_wanted)].reset_index(drop=True)
    
    # Group by project ID 
    df = df.groupby(['project',]).agg({'tot_exp':'sum'}).reset_index()
    
    # Rename total expenditures column with the appr_unit
    df = df.rename(columns = {'tot_exp': f'{appr_unit_wanted}_tot_exp'}) 
    
    return df

In [11]:
# Load expenditures, split df by GGRF and Sb1, and join them bacak together for the final dataframe.
def aggregate_expenditures():
    df = prep_expenditures()
    
    # Find and sum up GGRF funds
    df_ggrf =  split_ggrf_sb1_expenditures(df, '301R')
    
    # Find and sum up SB1 funds
    df_sb1 =  split_ggrf_sb1_expenditures(df, '101')
    
    m1 = pd.merge(
    df_ggrf,
    df_sb1,
    how="outer",
    on = ["project"],
    indicator=True,)
    
    # Create sum column of both funds
    m1['Sum of Expenditure'] = m1['301R_tot_exp'] + m1['101_tot_exp'] 
    return m1 

In [12]:
expenditure =  aggregate_expenditures()

In [13]:
expenditure._merge.value_counts()

right_only    80
both          52
left_only     40
Name: _merge, dtype: int64

#### Double check project ids w/ original dataframe before groupby

In [14]:
og_expenditure = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls",
        sheet_name="Download",
    )

In [15]:
og_expenditure = og_expenditure.iloc[9:].reset_index(drop=True)

In [16]:
og_expenditure.columns =  og_expenditure.iloc[0]

In [17]:
og_expenditure =  og_expenditure.drop(og_expenditure.index[0]).reset_index(drop=True)

In [18]:
og_expenditure = (og_expenditure.loc[og_expenditure["FY"] > "2017"]).reset_index()

In [19]:
og_expenditure = og_expenditure[
    og_expenditure["Appr Unit"].str.contains("101") | og_expenditure["Appr Unit"].str.contains("301R")
]

In [20]:
og_expenditure = clean_project_ids(og_expenditure, "Project")

In [21]:
len(expenditure), expenditure.project.nunique(), len(og_expenditure), og_expenditure.Project.nunique()

(172, 172, 423, 172)

In [22]:
# check to see if 2 columns are the same
project_id_grouped = set(expenditure.project.unique().tolist())
project_id_og = set( og_expenditure.Project.unique().tolist())
project_id_og - project_id_grouped

set()

### Project Status Prep

In [23]:
def prep_project_status():
    df = pd.read_excel(
        f"{A1_data_prep.GCS_FILE_PATH}Project status for fund 0046 as of 12-5-22.xls",
        sheet_name="Download",
    )
    
    # First few rows are not data
    df = df.iloc[6:].reset_index(drop=True)
    
    # Cast first row as column names
    df.columns = df.iloc[0]

    # Drop the first row as they are now column names
    df = df.drop(df.index[0]).reset_index(drop=True)

    # Coerce certain columns to numeric
    df[["Billed", "Reimbursements"]] = df[["Billed", "Reimbursements"]].apply(
        pd.to_numeric, errors="coerce"
    )
    # Group project id so the same ones will be on one line
    df = (
        df.groupby(["Project"])
        .agg({"Billed": "sum", "Reimbursements": "sum"})
        .reset_index()
    )

    df = to_snakecase(df)
    
    # Clean project names
    df = clean_project_ids(df, "project")
    
    return df

In [24]:
project_status = prep_project_status()

In [25]:
project_status.project.nunique()

52

## DRAFT
### Merge 1

In [26]:
m1 = pd.merge(
    alloc,
    expenditure.drop(columns = ['_merge']),
    how="left",
    left_on="allocation_project_id",
    right_on="project",
    indicator=True,
)

In [27]:
m1._merge.value_counts()

both          143
left_only     104
right_only      0
Name: _merge, dtype: int64

In [28]:
len(m1), len(alloc)

(247, 247)

### Merge 2

In [29]:
m2 = pd.merge(
    m1.drop(columns = ["_merge"]),
    project_status,
    how="left",
    on = "project",
    indicator=True,
)

In [30]:
m2._merge.value_counts()

left_only     196
both           51
right_only      0
Name: _merge, dtype: int64

In [31]:
m2 = m2.drop(columns = ['_merge', 'project'])

In [32]:
(m2.isna().sum())/len(m2)

allocation_project_id      0.00
allocation_phase           0.00
allocation_sb1_funding     0.00
allocation_ggrf_funding    0.00
Sum of Allocation Amount   0.00
301R_tot_exp               0.65
101_tot_exp                0.56
Sum of Expenditure         0.79
billed                     0.79
reimbursements             0.79
dtype: float64

### Clean up

In [33]:
m2 = A1_data_prep.clean_up_columns(m2).fillna(0)

In [34]:
m2['Remaining Allocation'] = m2['Sum Of Allocation Amount'] - m2['Sum Of Expenditure']

In [35]:
# Tag if a project's desc has a SHS keyword/SHS keywords or not
def keywords(row):
        
    if row["Remaining Allocation"] == 0: 
        return "Fully expended: no action needed"
    
    elif row["Sum Of Allocation Amount"] == 0: 
        return "No monetary info from TIRCP Tracking 2.0"
    
    elif row["Sum Of Allocation Amount"] == row["Ggrf Funding"]:
        return "100% GGRF/0046R No action needed"
    
    elif row["Sum Of Allocation Amount"] == row["Sb1 Funding"]:
        return "100% SB1/0046 No action needed"
    
    elif row["Remaining Allocation"] >= 400000: 
        return "More than $400k: need to correct funding"
          
    elif row["Remaining Allocation"] <= 400000: 
        return "Less than $400k: no action needed"
        
    # Everything else is not enough info
    else:
        return "Manual Comment"
    

In [36]:
# Apply the function
m2["Comments"] = m2.apply(keywords, axis=1)
    

In [37]:
m2.Comments.value_counts()

More than $400k: need to correct funding    123
100% GGRF/0046R No action needed             45
Less than $400k: no action needed            39
Fully expended: no action needed             21
100% SB1/0046 No action needed               19
Name: Comments, dtype: int64

In [38]:
monetary_cols = ['Sb1 Funding', 'Ggrf Funding',
       'Sum Of Allocation Amount', '301R Tot Exp', '101 Tot Exp',
       'Sum Of Expenditure', 'Billed', 'Reimbursements',
       'Remaining Allocation']

In [39]:
m2 = A1_data_prep.currency_format(m2, monetary_cols)