In [1]:
import pandas as pd
from pathlib import Path
import openpyxl
# Load the approved operating budget summary for each year (2019-2023) so the
# tables can be cleaned, checked for missing values and standardised below.
#
# The 2019 and 2021 workbooks name their data sheet after the year, but later
# cells in this notebook rename those sheets to 'Open Data' on disk. Accepting
# either name keeps this cell working on a fresh "Restart & Run All" whether or
# not the rename has already been written to the file.
def _read_budget_sheet(path, sheet_candidates):
    """Read the sheet of the workbook at `path` matching one of `sheet_candidates`.

    Parameters
    ----------
    path : Path
        Location of the Excel workbook.
    sheet_candidates : sequence of str
        Acceptable sheet names in priority order. Names are compared
        case-insensitively and ignoring surrounding whitespace.

    Returns
    -------
    pandas.DataFrame
        Contents of the first candidate sheet found in the workbook.

    Raises
    ------
    ValueError
        If the workbook contains none of the candidate sheets.
    """
    with pd.ExcelFile(path) as workbook:
        available = list(workbook.sheet_names)
        # Normalised name -> real name; setdefault keeps the first sheet on a clash.
        by_normalised = {}
        for sheet in available:
            by_normalised.setdefault(str(sheet).strip().casefold(), sheet)
        for candidate in sheet_candidates:
            match = by_normalised.get(candidate.strip().casefold())
            if match is not None:
                return workbook.parse(match)
    raise ValueError(
        f"None of the sheets {list(sheet_candidates)} found in {path}; "
        f"available sheets: {available}"
    )


budget_path_2019 = Path("Resources/approved-operating-budget-summary-2019.xlsx") #2019
budget2019_df = _read_budget_sheet(budget_path_2019, ['2019', 'Open Data'])
budget_path_2020 = Path("Resources/approved-operating-budget-summary-2020.xlsx") #2020
budget2020_df = _read_budget_sheet(budget_path_2020, ['open data'])
budget_path_2021 = Path("Resources/approved-operating-budget-summary-2021.xlsx") #2021
budget2021_df = _read_budget_sheet(budget_path_2021, ['2021', 'Open Data'])
budget_path_2022 = Path("Resources/approved-operating-budget-summary-2022.xlsx") #2022
budget2022_df = _read_budget_sheet(budget_path_2022, ['Open Data'])
budget_path_2023 = Path("Resources/approved-operating-budget-summary-2023.xlsx") #2023
budget2023_df = _read_budget_sheet(budget_path_2023, ['Open Data'])

In [28]:
# Rename the 2019 data sheet to 'Open Data' so the sheet name is consistent
# across the yearly workbooks.
# The change is saved back to the original file, so there is only work to do
# the first time this runs. The membership check turns every later run into a
# no-op instead of a KeyError on the no-longer-existing '2019' sheet.
workbook2019 = openpyxl.load_workbook(budget_path_2019)  # open the 2019 budget workbook
if '2019' in workbook2019.sheetnames:
    rename_2019_sheet = workbook2019['2019']  # sheet still carries its original name
    rename_2019_sheet.title = 'Open Data'
    workbook2019.save(budget_path_2019)  # overwrite the file at its original path
workbook2019.close()

In [29]:
# Rename the 2021 data sheet to 'Open Data', but only if it still has its
# original name, so this cell can be re-run after the file was already updated.
workbook2021 = openpyxl.load_workbook(budget_path_2021) # open the 2021 approved budget summary workbook
sheet_names = workbook2021.sheetnames # names of all sheets currently in the workbook
if '2021' in sheet_names: # skip the rename when the '2021' sheet no longer exists
    rename_2021_sheet = workbook2021['2021']  # select the sheet to rename
    rename_2021_sheet.title = 'Open Data'  # give it the new title
    # Write the change back to the original file path (modifies the file on disk)
    workbook2021.save(budget_path_2021)
# Close the workbook whether or not it was modified
workbook2021.close()

In [None]:
#Checking for null values

In [94]:
# Boolean missing-value mask for each year's budget table
# (True where a cell is null).
null_2019, null_2020, null_2021, null_2022, null_2023 = (
    yearly_budget.isnull()
    for yearly_budget in (
        budget2019_df,
        budget2020_df,
        budget2021_df,
        budget2022_df,
        budget2023_df,
    )
)

# Summing a mask column-wise gives the number of nulls in each column.
(
    null_per_column_2019,
    null_per_column_2020,
    null_per_column_2021,
    null_per_column_2022,
    null_per_column_2023,
) = (
    null_mask.sum()
    for null_mask in (null_2019, null_2020, null_2021, null_2022, null_2023)
)

In [95]:
# Print the number of null values in each column of the 2019 budget table.
print(null_per_column_2019)

Program              0
Service              0
Activity             0
Expense/Revenue      0
Category Name        0
Sub-Category Name    0
Commitment item      0
2019                 0
dtype: int64


In [96]:
# Print the number of null values in each column of the 2020 budget table.
print(null_per_column_2020)

Program              0
Service              0
Activity             0
Expense/Revenue      0
Category Name        0
Sub-Category Name    0
Commitment item      0
2020                 0
dtype: int64


In [97]:
# Print the number of null values in each column of the 2021 budget table.
print(null_per_column_2021)

Program              0
Service              0
Activity             0
Expense/Revenue      0
Category Name        0
Sub-Category Name    0
Commitment item      0
2021                 0
dtype: int64


In [98]:
# Print the number of null values in each column of the 2022 budget table.
print(null_per_column_2022)

Program              0
Service              0
Activity             0
Expense/Revenue      0
Category Name        0
Sub-Category Name    0
Commitment item      0
2022                 0
dtype: int64


In [99]:
# Print the number of null values in each column of the 2023 budget table.
print(null_per_column_2023)

Program              0
Service              0
Activity             0
Expense/Revenue      0
Category Name        0
Sub-Category Name    0
Commitment item      0
2023                 0
dtype: int64


In [100]:
# Yearly budget tables in chronological order (index 0 is 2019), so the same
# cleaning step can be applied to all of them in a loop.
df_list = [
    budget2019_df,
    budget2020_df,
    budget2021_df,
    budget2022_df,
    budget2023_df,
]

In [101]:
# Give every table a common amount column: the column labelled with the table's
# own year (2019 for the first list entry, counting up by one) is renamed to
# 'Budgeted amount'. The rename is done in place so the budgetYYYY_df names
# see the change as well.
for year, yearly_budget in enumerate(df_list, start=2019):
    yearly_budget.rename(columns={year: 'Budgeted amount'}, inplace=True)

In [None]:
#Dropping duplicates

In [102]:
# Remove rows that are exact copies of an earlier row.
# Duplicates are judged on ALL columns: two different budget line items can
# legitimately carry the same amount, so de-duplicating on 'Budgeted amount'
# alone would silently delete valid rows.
# Done in place so the budgetYYYY_df names refer to the de-duplicated tables.
for yearly_budget in df_list:
    yearly_budget.drop_duplicates(inplace=True)

In [None]:
#Showing the budgeted amounts in scientific notation with six decimal places

In [103]:
# Show budgeted amounts in scientific notation with six decimal places.
# The formatter is registered as a pandas display option instead of being
# applied to the data: writing the formatted strings into the column would
# truncate every amount to 7 significant digits, turn the column into text
# (no sums or comparisons), and make this cell fail when run a second time.
format_sci_notation = '{:.6e}'.format  # callable: number -> e.g. '1.234568e+07'
pd.set_option('display.float_format', format_sci_notation)
for df in df_list:
    # The display option only affects float columns, so store amounts as float.
    df['Budgeted amount'] = df['Budgeted amount'].astype(float)

In [None]:
#Separating expenses and revenues into separate dataframes

In [105]:
# One table per year holding only the rows whose 'Expense/Revenue' value is
# exactly 'Expenses'.
(
    budget2019_df_exp,
    budget2020_df_exp,
    budget2021_df_exp,
    budget2022_df_exp,
    budget2023_df_exp,
) = (
    yearly_budget.loc[yearly_budget['Expense/Revenue'].eq('Expenses')]
    for yearly_budget in (
        budget2019_df,
        budget2020_df,
        budget2021_df,
        budget2022_df,
        budget2023_df,
    )
)

In [106]:
# One table per year holding only the rows whose 'Expense/Revenue' value is
# exactly 'Revenues'.
(
    budget2019_df_rev,
    budget2020_df_rev,
    budget2021_df_rev,
    budget2022_df_rev,
    budget2023_df_rev,
) = (
    yearly_budget.loc[yearly_budget['Expense/Revenue'].eq('Revenues')]
    for yearly_budget in (
        budget2019_df,
        budget2020_df,
        budget2021_df,
        budget2022_df,
        budget2023_df,
    )
)