## GGRF Accounting Analysis
* Goal: Narrow down the projects from Cycle 3 and beyond that need intervention in how their GGRF funds are allocated
* Project ID as reference: 18000321

In [37]:
import A1_data_prep
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [38]:
# Function to clean up project IDs
def clean_project_ids(df: pd.DataFrame, project_id_col: str) -> pd.DataFrame:
    """Normalize a project ID column to strings of at most 8 characters.

    The column is cast to string, every leading "0" is stripped, and the
    result is truncated to its first 8 characters.

    Args:
        df: Frame holding the ID column. The column is overwritten on this
            same object (the frame is not copied).
        project_id_col: Name of the column to clean.

    Returns:
        The same ``df`` object, with ``project_id_col`` replaced by the
        cleaned strings.
    """
    # Cast to string so the .str accessor works even if IDs were read as numbers
    ids = df[project_id_col].astype("str")

    # Strip all leading zeroes. Slicing off a fixed two characters whenever the
    # value starts with "0" would also remove a significant digit from an ID
    # that has only one leading zero (e.g. "018000321" -> "8000321").
    ids = ids.str.lstrip("0")

    # Slice to 8 characters if the ID exceeds that length
    df[project_id_col] = ids.str.slice(start=0, stop=8)
    return df

### Allocation Prep

In [39]:
# Original allocation sheet without "None" Project IDs filtered out
# alloc_original = A1_data_prep.clean_allocation()

In [40]:
# alloc_original.shape

In [41]:
def prep_allocation():
    """Build the allocation table with SB1 and GGRF funding summed per group.

    Loads the cleaned allocation sheet from ``A1_data_prep``, keeps rows whose
    project ID is not the string "None" and whose award year is 2018 or
    later, normalizes the project IDs, and collapses rows that share the same
    identifying columns into one row with summed SB1 and GGRF funding.

    Returns:
        DataFrame with the identifying columns, the two summed funding
        columns, and "Sum of Allocation Amount" (SB1 + GGRF).
    """
    id_col = "allocation_project_id"
    funding_cols = ["allocation_sb1_funding", "allocation_ggrf_funding"]

    # Identifying columns; they double as the groupby keys below
    key_cols = [
        "allocation_project_id",
        "allocation_ppno",
        "allocation_ea",
        "allocation_project_#",
        "allocation_components",
        "allocation_award_year",
        "allocation_phase",
    ]

    allocation = A1_data_prep.clean_allocation()[key_cols + funding_cols]

    # Keep rows that have a project ID and were awarded in 2018 or later
    has_project_id = allocation[id_col] != "None"
    awarded_2018_onward = allocation["allocation_award_year"] >= 2018
    allocation = allocation[has_project_id & awarded_2018_onward].reset_index(
        drop=True
    )

    allocation = clean_project_ids(allocation, id_col)

    # One row per unique combination of the identifying columns
    summed = (
        allocation.groupby(key_cols)
        .agg({col: "sum" for col in funding_cols})
        .reset_index()
    )

    # Total allocation is SB1 plus GGRF
    summed["Sum of Allocation Amount"] = (
        summed["allocation_sb1_funding"] + summed["allocation_ggrf_funding"]
    )

    return summed

In [42]:
# prep_alloc = prep_allocation()

In [43]:
# len(prep_alloc)

In [44]:
# project_id_more_than_1 = ["19000021", "19000073", "19000120", "20000194"]

In [45]:
# alloc[alloc.allocation_project_id.isin(project_id_more_than_1)]

In [46]:
def final_allocation():
    """Attach project titles to the prepared allocation table.

    The prepared allocation rows are left-merged with the project sheet
    (award year after 2017) on both PPNO and project number. The project-side
    join keys and award year are dropped afterwards; the project title and
    the ``_merge`` indicator column remain.

    Returns:
        The allocation DataFrame plus "project_project_title" and "_merge".
    """
    allocation = prep_allocation()

    # Only the columns needed to look up a title
    title_lookup = A1_data_prep.clean_project()[
        [
            "project_award_year",
            "project_project_#",
            "project_project_title",
            "project_ppno",
        ]
    ]

    # Restrict the project sheet to award years after 2017
    title_lookup = title_lookup.loc[title_lookup.project_award_year > 2017]
    title_lookup = title_lookup.reset_index(drop=True)

    # Every allocation row is kept; the match is on PPNO and project number
    merged = allocation.merge(
        title_lookup,
        how="left",
        left_on=["allocation_ppno", "allocation_project_#"],
        right_on=["project_ppno", "project_project_#"],
        indicator=True,
    )

    project_only_cols = ["project_award_year", "project_project_#", "project_ppno"]
    return merged.drop(columns=project_only_cols)

In [47]:
# Allocation rows with project titles attached (output of final_allocation)
alloc = final_allocation()

In [48]:
# check to see if project ID match across all the original vs. grouped allocation data sets
# project_id_merged = set(alloc.allocation_project_id.unique().tolist())
# project_id_og = set(prep_alloc.allocation_project_id.unique().tolist())
# project_id_og - project_id_merged
# project_id_merged - project_id_og

In [49]:
# len(alloc) == len(prep_alloc)

### Expenditures Prep 

In [50]:
def prep_expenditures():
    """Load the fund 0046 expenditure workbook and trim it to usable rows.

    Reads the "Download" sheet, discards the leading non-data rows, promotes
    the next row to column headers, snake-cases the column names, keeps
    records whose ``fy`` compares greater than "2017", and returns only the
    columns of interest with normalized project IDs.

    Returns:
        DataFrame with columns fy, fund, appr_unit, project, project_name
        and tot_exp.
    """
    source_path = f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls"
    sheet = pd.read_excel(source_path, sheet_name="Download")

    # The first nine rows of the sheet are not data, drop them
    sheet = sheet.iloc[9:].reset_index(drop=True)

    # The row now on top holds the column names
    sheet.columns = sheet.iloc[0]

    # That row is not a record itself, so remove it
    sheet = sheet.drop(sheet.index[0]).reset_index(drop=True)

    sheet = to_snakecase(sheet)

    # fy is compared as a string; keep 2018 and beyond
    recent = sheet.loc[sheet["fy"] > "2017"].reset_index()

    # Only the columns of interest are carried forward
    wanted_cols = ["fy", "fund", "appr_unit", "project", "project_name", "tot_exp"]
    recent = recent[wanted_cols]

    return clean_project_ids(recent, "project")

In [51]:
# Keep the rows whose appr_unit matches the wanted SB1/GGRF code and
# total their expenditures per project ID
def split_ggrf_sb1_expenditures(
    df: pd.DataFrame,
    appr_unit_wanted: str,
) -> pd.DataFrame:
    """Sum ``tot_exp`` per project for rows whose ``appr_unit`` matches a code.

    Args:
        df: Expenditure frame with "appr_unit", "project" and "tot_exp"
            columns.
        appr_unit_wanted: Pattern searched for inside ``appr_unit``
            (``str.contains`` treats it as a regular expression). It is also
            used as the prefix of the output column name.

    Returns:
        DataFrame with one row per project and two columns: "project" and
        "<appr_unit_wanted>_tot_exp".
    """
    # Rows with a missing appr_unit count as non-matches (na=False); without
    # it the mask contains NaN and .loc refuses to index with it.
    matches = df.appr_unit.str.contains(appr_unit_wanted, na=False)
    df = df.loc[matches].reset_index(drop=True)

    # Group by project ID
    df = (
        df.groupby(
            [
                "project",
            ]
        )
        .agg({"tot_exp": "sum"})
        .reset_index()
    )

    # Rename total expenditures column with the appr_unit
    df = df.rename(columns={"tot_exp": f"{appr_unit_wanted}_tot_exp"})

    return df

In [52]:
# Load expenditures, total them separately for GGRF and SB1, and join the two
# totals back together for the final dataframe.
def aggregate_expenditures():
    """Return per-project GGRF and SB1 expenditure totals side by side.

    GGRF totals come from rows whose appr_unit matches "301R"; SB1 totals
    come from rows matching "101". The two are outer-merged on "project", so
    a project found under only one code still appears; the ``_merge`` column
    records which side(s) each project came from.

    Returns:
        DataFrame with "project", "301R_tot_exp", "101_tot_exp" and "_merge".
    """
    expenditures = prep_expenditures()

    # GGRF funds
    ggrf_totals = split_ggrf_sb1_expenditures(expenditures, "301R")

    # SB1 funds
    sb1_totals = split_ggrf_sb1_expenditures(expenditures, "101")

    return ggrf_totals.merge(
        sb1_totals,
        how="outer",
        on=["project"],
        indicator=True,
    )

In [53]:
# Per-project GGRF (301R) and SB1 (101) expenditure totals
expenditure = aggregate_expenditures()

In [54]:
# Count projects found only in the GGRF totals (left_only), only in the SB1
# totals (right_only), or in both
expenditure._merge.value_counts()

right_only    80
both          52
left_only     40
Name: _merge, dtype: int64

#### Double check project ids w/ original dataframe before groupby

In [55]:
# Re-read the same workbook and sheet that prep_expenditures() reads, so the
# ungrouped rows can be compared against the grouped result
og_expenditure = pd.read_excel(
    f"{A1_data_prep.GCS_FILE_PATH}3010_3020 Expenditure for fund 0046.xls",
    sheet_name="Download",
)

In [56]:
# Drop the first nine rows of the sheet and renumber the index from zero
og_expenditure = og_expenditure.iloc[9:].reset_index(drop=True)

In [57]:
# Use the values of the first remaining row as the column names
og_expenditure.columns = og_expenditure.iloc[0]

In [58]:
# Remove that first row now that its values serve as the column names
og_expenditure = og_expenditure.drop(og_expenditure.index[0]).reset_index(drop=True)

In [59]:
# Keep rows whose FY compares greater than the string "2017". reset_index()
# is called without drop=True, so the old index is kept as an "index" column.
og_expenditure = (og_expenditure.loc[og_expenditure["FY"] > "2017"]).reset_index()

In [60]:
# Number of remaining rows per fiscal year
og_expenditure.FY.value_counts()

2022    279
2021    222
2020    208
2023    189
2018    180
2019    172
Name: FY, dtype: int64

In [61]:
# Keep rows whose "Appr Unit" contains either code that
# aggregate_expenditures() searches for ("101" or "301R")
og_expenditure = og_expenditure[
    og_expenditure["Appr Unit"].str.contains("101")
    | og_expenditure["Appr Unit"].str.contains("301R")
]

In [62]:
# Normalize the project IDs with the same helper used on the grouped data.
# NOTE(review): og_expenditure is a filtered selection at this point, so the
# in-place column assignment inside clean_project_ids may trigger pandas'
# SettingWithCopyWarning; confirm and add .copy() upstream if it does.
og_expenditure = clean_project_ids(og_expenditure, "Project")

In [63]:
# Row count and distinct project-ID count for the grouped frame, followed by
# the same two numbers for the ungrouped frame
len(expenditure), expenditure.project.nunique(), len(
    og_expenditure
), og_expenditure.Project.nunique()

(172, 172, 423, 172)

In [64]:
# Check that every project ID in the ungrouped data also appears in the
# grouped data; an empty set means none were lost.
# Only one direction (ungrouped minus grouped) is checked here.
project_id_grouped = set(expenditure.project.unique().tolist())
project_id_og = set(og_expenditure.Project.unique().tolist())
project_id_og - project_id_grouped

set()

### Project Status Prep

In [65]:
def prep_project_status():
    """Load the project status workbook and total billing per project.

    Reads the "Download" sheet, discards the leading non-data rows, promotes
    the next row to column headers, coerces "Billed" and "Reimbursements" to
    numbers (values that cannot be parsed become NaN), sums both per
    "Project", snake-cases the column names, and normalizes the project IDs.

    Returns:
        DataFrame with columns project, billed and reimbursements.
    """
    # NOTE(review): rows are grouped before the IDs are normalized, so IDs
    # that differ only by leading zeroes would stay on separate rows.
    # Confirm this cannot happen in the source sheet.
    source_path = (
        f"{A1_data_prep.GCS_FILE_PATH}Project status for fund 0046 as of 12-5-22.xls"
    )
    sheet = pd.read_excel(source_path, sheet_name="Download")

    # The first six rows of the sheet are not data
    sheet = sheet.iloc[6:].reset_index(drop=True)

    # The row now on top holds the column names
    sheet.columns = sheet.iloc[0]

    # That row is not a record itself, so remove it
    sheet = sheet.drop(sheet.index[0]).reset_index(drop=True)

    # Monetary columns must be numeric before they can be summed
    money_cols = ["Billed", "Reimbursements"]
    sheet[money_cols] = sheet[money_cols].apply(pd.to_numeric, errors="coerce")

    # Put each project ID on a single line
    totals = (
        sheet.groupby(["Project"])
        .agg({"Billed": "sum", "Reimbursements": "sum"})
        .reset_index()
    )

    totals = to_snakecase(totals)

    # Normalize the project IDs
    return clean_project_ids(totals, "project")

In [66]:
# Billed and reimbursement totals per project ID
project_status = prep_project_status()

In [67]:
# Number of distinct project IDs after grouping
project_status.project.nunique()

52

#### Double check project ids w/ original dataframe before groupby

In [68]:
# Re-read the same workbook and sheet that prep_project_status() reads, to
# compare the ungrouped project IDs against the grouped result
project_status_og = pd.read_excel(
    f"{A1_data_prep.GCS_FILE_PATH}Project status for fund 0046 as of 12-5-22.xls",
    sheet_name="Download",
)

In [69]:
# Drop the first six rows of the sheet, which are not data
project_status_og = project_status_og.iloc[6:].reset_index(drop=True)

# Use the values of the first remaining row as the column names
project_status_og.columns = project_status_og.iloc[0]

# Drop that first row now that its values serve as the column names
project_status_og = project_status_og.drop(project_status_og.index[0]).reset_index(
    drop=True
)

In [70]:
# Number of distinct project IDs before grouping; should match the grouped count
project_status_og.Project.nunique()

52

## Final Analysis

In [71]:
def merge_allocation_expenditures():
    """Left-merge the allocation table with the expenditure totals.

    Every allocation row is kept. The merge indicator is stored as
    "Allocation_Expenditure_Merge" with descriptive labels, the
    expenditure-side "project" key is dropped, and all remaining missing
    values are filled with 0.

    Returns:
        The merged DataFrame, one row per allocation row.
    """
    # The indicator columns from the earlier merges would clash with this one
    allocation = final_allocation().drop(columns=["_merge"])
    expenditures = aggregate_expenditures().drop(columns=["_merge"])

    merged = allocation.merge(
        expenditures,
        how="left",
        left_on="allocation_project_id",
        right_on="project",
        indicator=True,
    )
    merged = merged.rename(columns={"_merge": "Allocation_Expenditure_Merge"})

    # Spell out which source(s) each project ID was found in
    source_labels = {
        "both": "Project ID in TIRCP Tracking and 3010_3020 Expenditure for fund 0046",
        "left_only": "Project ID only in TIRCP Tracking",
    }
    merged["Allocation_Expenditure_Merge"] = merged[
        "Allocation_Expenditure_Merge"
    ].replace(source_labels)

    merged = merged.drop(columns=["project"])

    # Fill every remaining missing value with 0, whatever the column
    return merged.fillna(0)

In [72]:
# m1_function = merge_allocation_expenditures()

In [73]:
# len(alloc), len(m1_function)

In [74]:
def merge2_project_status():
    """Left-merge the allocation/expenditure table with project status totals.

    Every row of the allocation/expenditure table is kept. The merge
    indicator is stored as "Allocation_Project_Status_Merge" with descriptive
    labels and the status-side "project" key is dropped. Missing billed and
    reimbursement values are NOT filled here.

    Returns:
        The merged DataFrame with "billed", "reimbursements" and the
        indicator column added.
    """
    allocation_expenditures = merge_allocation_expenditures()
    status_totals = prep_project_status()

    merged = allocation_expenditures.merge(
        status_totals,
        how="left",
        left_on="allocation_project_id",
        right_on="project",
        indicator=True,
    )
    merged = merged.rename(columns={"_merge": "Allocation_Project_Status_Merge"})

    # Spell out which source(s) each project ID was found in
    source_labels = {
        "both": "Project ID in TIRCP Tracking and Project_Status",
        "left_only": "Project ID only in TIRCP Tracking",
    }
    merged["Allocation_Project_Status_Merge"] = merged[
        "Allocation_Project_Status_Merge"
    ].replace(source_labels)

    return merged.drop(columns=["project"])

In [75]:
# Allocation, expenditure, and project status data merged into one frame
m2 = merge2_project_status()

In [76]:
# Fraction of missing values in each column of the merged frame
(m2.isna().sum()) / len(m2)

allocation_project_id             0.00
allocation_ppno                   0.00
allocation_ea                     0.00
allocation_project_#              0.00
allocation_components             0.00
allocation_award_year             0.00
allocation_phase                  0.00
allocation_sb1_funding            0.00
allocation_ggrf_funding           0.00
Sum of Allocation Amount          0.00
project_project_title             0.00
301R_tot_exp                      0.00
101_tot_exp                       0.00
Allocation_Expenditure_Merge      0.00
billed                            0.74
reimbursements                    0.74
Allocation_Project_Status_Merge   0.00
dtype: float64

### Final Clean Up

In [99]:
# Label a row with the follow-up action implied by its merge flags and balances
def comments(row):
    """Return the comment string for one row of the accounting analysis.

    Checks run in order and the first one that applies wins:
    project ID absent from both the expenditure and project status data,
    zero total allocation, allocation entirely GGRF, allocation entirely SB1,
    then the size of the remaining allocation. "Manual Comment" is returned
    when no check applies (for example when the remaining allocation is NaN).

    Args:
        row: Mapping-like row with the merge flag, funding, allocation and
            "Remaining Allocation" entries.

    Returns:
        One of the fixed comment strings.
    """
    only_in_tracking = "Project ID only in TIRCP Tracking"

    # Both flags are looked up before they are combined with `&`
    absent_from_expenditures = row["Allocation Expenditure Merge"] == only_in_tracking
    absent_from_status = row["Allocation Project Status Merge"] == only_in_tracking
    if absent_from_expenditures & absent_from_status:
        return "No expenditures: correct in AMS"

    allocated = row["Sum Of Allocation Amount"]
    if allocated == 0:
        return "Component Split: correct in AMS"
    if allocated == row["Sum of GGRF Funding"]:
        return "100% GGRF/0046R: no action needed"
    if allocated == row["Sum of Sb1 Funding"]:
        return "100% SB1/0046: no action needed"

    remaining = row["Remaining Allocation"]
    if remaining == 0:
        return "Fully expended: no action needed"
    if remaining < 0:
        return "Negative Remaining Allocation"
    # Exactly 400000 lands in the "More than" bucket because >= is tested first
    if remaining >= 400000:
        return "More than $400k: need to correct funding"
    if remaining <= 400000:
        return "Less than $400k: no action needed"

    # Everything else is not enough info
    return "Manual Comment"

In [100]:
# Column order of the final report; final_accounting_analysis() selects exactly
# these columns, in this order, before adding "Comments"
right_col_order = [
    "Award Year",
    "#",
    "Title",
    "Project ID",
    "Ea",
    "Components",
    "Phase",
    "Sum of Sb1 Funding",
    "Sum of GGRF Funding",
    "Sum Of Allocation Amount",
    "Sum of GGRF (0046, xx301R expenditure)",
    "Sum of SB1 (0046, xx101 Expenditure)",
    "Sum of Expenditure",
    "Sum Billed",
    "Sum of Reimbursements",
    "Remaining Allocation",
    "Allocation Expenditure Merge",
    "Allocation Project Status Merge",
]

In [114]:
def final_accounting_analysis():
    """Build the final accounting report, write it to Excel, and return it.

    Starting from the fully merged table, this renames columns to their
    report labels, adds "Sum of Expenditure" (SB1 + GGRF expenditures) and
    "Remaining Allocation" (allocation minus expenditure), orders the columns
    per ``right_col_order``, tags each row with ``comments``, and fills
    missing billed/reimbursement values with 0. A per-comment summary (row
    count and total remaining allocation) goes on a second sheet.

    Side effects:
        Writes "accounting_analysis.xlsx" under ``A1_data_prep.GCS_FILE_PATH``
        with sheets "accounting_analysis" and "summary".

    Returns:
        The detailed report DataFrame (not the summary).
    """
    # NOTE(review): clean_up_columns lives in A1_data_prep; judging by the
    # rename keys below it presumably strips the table prefixes and
    # title-cases the column names. Confirm there.
    report = A1_data_prep.clean_up_columns(merge2_project_status())

    report_labels = {
        "Id": "Project ID",
        "Sb1 Funding": "Sum of Sb1 Funding",
        "Ggrf Funding": "Sum of GGRF Funding",
        "101 Tot Exp": "Sum of SB1 (0046, xx101 Expenditure)",
        "301R Tot Exp": "Sum of GGRF (0046, xx301R expenditure)",
        "Billed": "Sum Billed",
        "Reimbursements": "Sum of Reimbursements",
    }
    report = report.rename(columns=report_labels)

    # Derived totals
    sb1_spent = report["Sum of SB1 (0046, xx101 Expenditure)"]
    ggrf_spent = report["Sum of GGRF (0046, xx301R expenditure)"]
    report["Sum of Expenditure"] = sb1_spent + ggrf_spent
    report["Remaining Allocation"] = (
        report["Sum Of Allocation Amount"] - report["Sum of Expenditure"]
    )

    # Put the columns in report order
    report = report[right_col_order]

    # Tag every row with its follow-up comment
    report["Comments"] = report.apply(comments, axis=1)

    # Billing columns were left unfilled by the project status merge
    for money_col in ["Sum Billed", "Sum of Reimbursements"]:
        report[money_col] = report[money_col].fillna(0)

    # Per-comment row count and total remaining allocation, largest group first
    summary = (
        report.groupby("Comments")
        .agg({"Project ID": "count", "Remaining Allocation": "sum"})
        .reset_index()
    )
    summary = summary.sort_values("Project ID", ascending=False)
    summary = summary.rename(columns={"Project ID": "Total Projects"})
    summary["Remaining Allocation"] = summary["Remaining Allocation"].map(
        "${:,.2f}".format
    )

    # Export both sheets to one workbook
    output_path = f"{A1_data_prep.GCS_FILE_PATH}accounting_analysis.xlsx"
    with pd.ExcelWriter(output_path) as writer:
        report.to_excel(writer, sheet_name="accounting_analysis", index=False)
        summary.to_excel(writer, sheet_name="summary", index=True)

    return report

In [115]:
# Builds the report and, as a side effect, writes accounting_analysis.xlsx
accounting_analysis = final_accounting_analysis()

In [103]:
# Per comment category: count of non-null project IDs and total remaining allocation
agg1 = accounting_analysis.groupby('Comments').agg({'Project ID':'count','Remaining Allocation':'sum',}).reset_index()

In [104]:
# Format the totals as dollar strings (the column is no longer numeric afterwards)
agg1['Remaining Allocation'] = agg1['Remaining Allocation'].map("${:,.2f}".format)

In [105]:
# Show the categories with the most projects first
agg1.sort_values('Project ID', ascending = False)

Unnamed: 0,Comments,Project ID,Remaining Allocation
6,No expenditures: correct in AMS,90,"$617,434,000.00"
4,More than $400k: need to correct funding,49,"$659,115,031.78"
3,Less than $400k: no action needed,22,"$3,803,122.25"
2,Fully expended: no action needed,18,$0.00
0,100% SB1/0046: no action needed,16,"$7,698,372.45"
5,Negative Remaining Allocation,9,"$-63,125,367.60"
1,Component Split: correct in AMS,3,"$-27,822,763.48"


In [106]:
# Columns to show when previewing rows of accounting_analysis
# (used by the commented-out preview cell further down)
preview_cols = ["Award Year",
    "Title",
    "Project ID",
    "Ea",
    "Comments",
    "Sum of Sb1 Funding",
    "Sum of GGRF Funding",
    "Sum Of Allocation Amount",
    "Sum of GGRF (0046, xx301R expenditure)",
    "Sum of SB1 (0046, xx101 Expenditure)",
    "Sum of Expenditure",
    "Sum Billed",
    "Sum of Reimbursements",
    "Remaining Allocation",
    "Allocation Expenditure Merge",
    "Allocation Project Status Merge"]

In [107]:
# accounting_analysis['Project ID'].nunique() == alloc.allocation_project_id.nunique()

In [108]:
# accounting_analysis[preview_cols].loc[accounting_analysis.Comments == "More than $400k: need to correct funding"]

In [109]:
# accounting_analysis.loc[accounting_analysis["Project ID"] == "22000241"]