# Exploring and Cleaning Data

In the following code I will perform both exploratory analysis and cleaning of the data provided in three Excel spreadsheets. These spreadsheets contain budget information for the City of Missoula and the aligning Program Inventory information about the different programs that are funded by the City of Missoula. 

The data in the Excel files is messy and unclean; it comes both from the financial software that is used and from a separate piece of software that collects survey responses. This process will load the files into the environment and perform data cleaning functions to prepare the data for manipulation in Power BI. In Power BI, I will create a dashboard that provides information about all of the different programs currently funded and provides granular breakdowns of financial and other important information therein. 

Once the data is cleaned I will conduct some exploratory analysis on the newly cleaned data to see how it performs. 

In [None]:
### Importing Libraries
# Standard Libraries
import os

# Text Cleaning
import re

# Data Handling
import pandas as pd
import numpy as np

# Excel Handling
import openpyxl

# Additional Cleaning Utilities; each labeled below
from tqdm import tqdm  # Adds progress bars to loops
import chardet  # Detects encoding issues
import janitor

# Visualization
import missingno as msno
import matplotlib.pyplot as plt

In [None]:
## define the file paths
data_path = "data/"

## Load Files
# Revenue/expense: sheet_name=None reads every sheet of the workbook.
revenue_expense = pd.read_excel(
    os.path.join(data_path, "FY24_Revenue_Expense_Data.xlsx"),
    sheet_name=None,
    engine="openpyxl",
)

# Expenditure status: first sheet only, skipping the six rows above the table.
expenditure_status = pd.read_excel(
    os.path.join(data_path, "FY24_Expenditure_Status.xlsx"),
    sheet_name=0,
    skiprows=6,
    engine="openpyxl",
)

# Program inventory: first sheet only.
program_inventory = pd.read_excel(
    os.path.join(data_path, "Program_Inventory_Internal_Data_Collection.xlsx"),
    sheet_name=0,
    engine="openpyxl",
)

In [None]:
## Display initial previews
# Keys of the mapping returned for the multi-sheet revenue/expense workbook
print("Revenue Expense Loaded:", revenue_expense.keys())

# Title line followed by the first rows of each single-sheet frame
print("\n Expenditure Status Preview:", expenditure_status.head(), sep="\n")
print("\n Program Inventory Preview:", program_inventory.head(), sep="\n")

In [None]:
## Define Cleaning Functions

def drop_unnamed_columns(df):
    """Drop every column whose label starts with 'Unnamed'.

    Columns are selected by label only; their contents are not inspected,
    so an 'Unnamed' column is removed even when it holds data.

    Args:
        df: DataFrame to filter.

    Returns:
        A DataFrame holding only the columns whose label does not start
        with 'Unnamed', in their original order.
    """
    # Compare on str(label) so integer or other non-string labels are simply
    # kept instead of breaking the string accessor on the column index.
    keep = [not str(label).startswith("Unnamed") for label in df.columns]
    return df.loc[:, keep]

def clean_numeric_column(column):
    """Return the column as text with a trailing '.0' stripped from each value.

    Every value is first cast to text with ``astype(str)``; a ``.0`` suffix
    at the very end of that text is then removed.

    Args:
        column: pandas Series to convert.

    Returns:
        Series of the text values without the trailing '.0'.
    """
    as_text = column.astype(str)
    without_suffix = as_text.str.replace(r"\.0$", "", regex=True)
    return without_suffix

def clean_identifiers(df):
    """Standardize Fund #, Activity Code, and Dept # as text codes.

    For each of those columns that is present, the value is converted to
    text and everything from the first '.' onward is dropped (so 210.0
    becomes '210'). Activity Code is then left-padded with zeros to 6
    characters and Dept # to 3; Fund # is not padded. Missing values stay
    missing instead of turning into the literal text 'nan' / '000nan'.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, for chaining.
    """
    def _as_code(series, width=0):
        # Text before the first '.', optionally zero-padded to `width`.
        text = series.astype(str).str.split(".").str[0]
        if width:
            text = text.str.zfill(width)
        # astype(str) renders missing values as text; put them back as missing.
        return text.where(series.notna())

    for column, width in (("Fund #", 0), ("Activity Code", 6), ("Dept #", 3)):
        if column in df.columns:
            df[column] = _as_code(df[column], width)

    return df

def rename_multiline_headers(df):
    """Expand headers like 'Cost Recovery' to match multi-column layout.

    A header listed in the mapping is replaced by the first name of its
    group. The remaining names of the group are handed out, in order, to the
    'Unnamed' columns that directly follow it. Every other column keeps its
    label, so each label stays attached to the same column position and the
    number of labels always equals the number of columns.

    Args:
        df: DataFrame whose column labels are rewritten in place.

    Returns:
        The same DataFrame object with the updated column labels.
    """
    column_mappings = {
        "Cost Recovery": ["Cost Recovery E58", "Cost Recovery P24"],
        "Mandate": ["Mandate E41", "Mandate H41", "Mandate E43"],
        "Service Level": ["Service Level E47", "Service Level H47", "Service Level E49"],
        "Reliance & Interdependencies": ["Reliance E53", "Reliance E55"],
        "Strategic Goal": ["Strategic Goal E64", "Strategic Goal E66", "Strategic Goal E68", "Strategic Goal E74", "Strategic Goal E80"],
        "Trend (Demand)": ["Trend Demand E87", "Trend Demand E89"],
        "Risk": ["Risk E93", "Risk E95"]
    }

    new_columns = []
    pending = []  # group names still waiting for a following 'Unnamed' column
    for col in df.columns:
        if col in column_mappings:
            expanded = column_mappings[col]
            new_columns.append(expanded[0])
            pending = list(expanded[1:])
        elif pending and str(col).startswith("Unnamed"):
            new_columns.append(pending.pop(0))
        else:
            # A regular column ends the group; unused names are discarded
            # rather than pushed onto unrelated columns.
            pending = []
            new_columns.append(col)

    df.columns = new_columns
    return df

In [None]:
## Data Cleaning Process for Expenditure Status
# Drop the 'Unnamed' columns and continue on an independent copy
df_expenditure_status = drop_unnamed_columns(expenditure_status).copy()

# Rename first column. A fresh label list is assigned instead of writing into
# `columns.values`, because mutating an Index's backing array in place is not
# a supported way to relabel a frame.
df_expenditure_status.columns = ["Account Number", *df_expenditure_status.columns[1:]]

# Define split column names
split_cols = ["Fund #", "Dept #", "Activity Code", "Object Code", "Sub-object Code"]

# Split account numbers into structure (at most four splits -> five parts)
split_data = df_expenditure_status["Account Number"].astype(str).str.split(".", expand=True, n=4)
# expand=True only produces as many columns as the longest value has parts;
# pad to five so the label assignment below cannot fail on a length mismatch.
split_data = split_data.reindex(columns=range(len(split_cols)))
split_data.columns = split_cols

# Merge back into df_expenditure_status
df_expenditure_status = pd.concat([df_expenditure_status, split_data], axis=1)

# Convert types safely: non-numeric parts become <NA> in a nullable integer column
for col in split_cols:
    df_expenditure_status[col] = pd.to_numeric(df_expenditure_status[col], errors="coerce").astype("Int64")

# Forward fill Fund and Dept so rows without their own value take the last one seen above
df_expenditure_status[["Fund #", "Dept #"]] = df_expenditure_status[["Fund #", "Dept #"]].ffill()

# Drop rows missing Activity Code
df_expenditure_status = df_expenditure_status.dropna(subset=["Activity Code"])

# Reset index
df_expenditure_status = df_expenditure_status.reset_index(drop=True)

# ✅ Checkpoint
print("\n🧼 Cleaned Expenditure Status Columns:")
print(df_expenditure_status.columns.tolist())
print(df_expenditure_status.head())

In [None]:
## Define Mapping
# Extract the distinct, non-missing department and fund numbers from Expenditure
dept_mapping = df_expenditure_status[["Dept #"]].drop_duplicates().dropna()
fund_mapping = df_expenditure_status[["Fund #"]].drop_duplicates().dropna()

# Assign human-readable department names (optional or manual override)
dept_name_lookup = {
    210: "City Council", 220: "Mayor", 230: "Finance", 240: "Human Resources",
    250: "Legal", 260: "Police", 270: "Fire", 280: "Public Works",
    290: "Parks & Recreation", 300: "Planning & Development", 310: "Library",
    320: "IT Services", 330: "Community Development", 340: "Housing Services"
}
# Department numbers without an entry in the lookup are labelled "REDACTED"
dept_mapping["Department"] = dept_mapping["Dept #"].map(dept_name_lookup).fillna("REDACTED")

# Remove a 'Department' column left behind by an earlier run of this cell;
# otherwise the merge would emit Department_x / Department_y instead.
df_expenditure_status = pd.merge(
    df_expenditure_status.drop(columns="Department", errors="ignore"),
    dept_mapping,
    on="Dept #",
    how="left",
)

# Optional: Display mappings
print("\n🔎 Extracted Department Mapping:")
print(dept_mapping.head())

In [None]:
# Bare expression: as the last statement of a notebook cell it renders the
# cleaned expenditure frame as the cell's output.
df_expenditure_status

In [None]:
## Clean and Process Program Inventory
# Rename Org to match Expenditure 'Dept #'
df_program_inventory = program_inventory.rename(columns={"Org": "Dept #"})

# Clean identifiers
df_program_inventory = clean_identifiers(df_program_inventory)

# Apply mappings
# clean_identifiers leaves 'Dept #' as zero-padded text (e.g. '210') while
# dept_name_lookup is keyed by integers, so a direct map never matches and
# every row falls through to "REDACTED". Key the lookup the same way first.
dept_name_by_code = {str(code).zfill(3): name for code, name in dept_name_lookup.items()}
df_program_inventory["Department"] = df_program_inventory["Dept #"].map(dept_name_by_code).fillna("REDACTED")
# NOTE(review): the series used here has the expenditure fund numbers as both
# index and values, so 'Fund Name' receives the fund number itself (NaN when
# the value is not among them), not a descriptive name — confirm the intended
# source of fund names and that 'Fund' is the right column.
df_program_inventory["Fund Name"] = df_program_inventory["Fund"].map(fund_mapping.set_index("Fund #").index.to_series())

# Fix multi-line headers
df_program_inventory = rename_multiline_headers(df_program_inventory)

# Clean out empty columns
df_program_inventory = drop_unnamed_columns(df_program_inventory)

# Checkpoint
print("\n🧼 Cleaned Program Inventory Preview:")
print(df_program_inventory.head())

In [None]:
# Show the first ten rows of each cleaned dataset under its title line
print("\n✅ Final Cleaned Expenditure Status (Sample):", df_expenditure_status.head(10), sep="\n")
print("\n✅ Final Cleaned Program Inventory (Sample):", df_program_inventory.head(10), sep="\n")

In [None]:
## Documenting each function with a docstring lets you easily reference it later by using help(function)
## help(rename_multiline_headers)