In [37]:
import numpy as np
import pandas as pd

In [38]:
def _clean_cpi_dataframe(raw_df):
    """
    Helper function:
    Turn the raw, header-less CPI Excel sheet into a tidy DataFrame.

    The row at position 2 of ``raw_df`` supplies the column labels (each
    value is converted to ``str`` and stripped). The first three rows are
    then discarded, only rows whose "Year" value is a whole number are
    kept, and the twelve month columns are converted to numeric values
    (cells that cannot be parsed become NaN).

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    raw_df : pd.DataFrame
        The raw Excel table read from the CPI file, with no header
        applied. Must have at least three rows.

    Returns
    -------
    pd.DataFrame
        New DataFrame labelled by the header row, with an integer "Year"
        column and numeric "Jan".."Dec" columns. Any other columns are
        passed through unchanged. The row index is the post-header
        position and is not renumbered after non-year rows are removed.

    Raises
    ------
    KeyError
        If the header row does not contain "Year" or one of the twelve
        month abbreviations.
    """

    # Month column order
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    ]

    # Extract header row (row 2 in the Excel file)
    header = raw_df.iloc[2].astype(str).str.strip()

    # Drop the first 3 junk rows and reset. `.copy()` guarantees that the
    # relabelling below never writes through to the caller's frame.
    df = raw_df.iloc[3:].reset_index(drop=True).copy()
    df.columns = header.tolist()

    # Fail early, with a readable message, if the sheet layout is not the
    # expected one (a bare df["Year"] lookup would also raise KeyError).
    missing = [name for name in ["Year", *months] if name not in df.columns]
    if missing:
        raise KeyError(
            f"CPI header row is missing expected column(s): {missing}"
        )

    # Keep only rows where "Year" is a whole number. Parsing with
    # to_numeric (rather than str.isdigit) also accepts integer-valued
    # floats such as 2010.0, and rejects anything that is not a real
    # number instead of crashing later in astype(int).
    year_values = pd.to_numeric(df["Year"], errors="coerce")
    is_year = year_values.notna() & (year_values % 1 == 0)

    # Explicit copy so the column assignments below operate on an
    # independent frame (no SettingWithCopyWarning).
    df = df.loc[is_year].copy()
    df["Year"] = year_values.loc[is_year].astype(int)

    # Convert month values to numeric
    df[months] = df[months].apply(pd.to_numeric, errors="coerce")

    return df


def process_cpi_wide(path, outfile):
    """
    Build a Year × Month table of month-over-month CPI inflation from a
    wide-format CPI Excel file and write it to CSV.

    Steps:
    1. Read the Excel sheet without a header, skipping the first 7 rows.
    2. Clean it with ``_clean_cpi_dataframe``.
    3. Melt the twelve month columns into long format.
    4. Attach a first-of-month date to every (Year, Month) pair.
    5. Sort by date and compute the percent change of CPI versus the
       previous row.
    6. Pivot back to one row per Year with "Jan".."Dec" columns.
    7. Write the result to ``outfile`` (no index column).

    Parameters
    ----------
    path : str
        File path to the CPI Excel file.
    outfile : str
        Output CSV filename.

    Returns
    -------
    pd.DataFrame
        Wide inflation table with a "Year" column followed by
        "Jan".."Dec", values in percent.
    """

    # Calendar order of the month columns; numbers are derived from position.
    month_names = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    ]
    month_number = {
        name: position for position, name in enumerate(month_names, start=1)
    }

    # Load the sheet (first 7 rows skipped, no header) and tidy it up.
    sheet = pd.read_excel(path, skiprows=7, header=None)
    cpi_by_year = _clean_cpi_dataframe(sheet)

    # Wide -> long: one row per (Year, Month) observation.
    cpi_long = cpi_by_year.melt(
        id_vars="Year",
        value_vars=month_names,
        var_name="Month",
        value_name="CPI"
    )

    # Month abbreviation -> month number -> first-of-month timestamp.
    cpi_long["MonthNum"] = cpi_long["Month"].map(month_number)
    date_parts = {
        "year": cpi_long["Year"],
        "month": cpi_long["MonthNum"],
        "day": 1,
    }
    cpi_long["Date"] = pd.to_datetime(date_parts)

    # Chronological order is required before comparing consecutive rows.
    cpi_long = cpi_long.sort_values("Date").reset_index(drop=True)

    # Percent change relative to the preceding observation.
    previous_cpi = cpi_long["CPI"].shift(1)
    cpi_long["Inflation"] = (
        (cpi_long["CPI"] - previous_cpi) / previous_cpi
    ) * 100

    # The earliest observation has no predecessor, so no inflation value.
    cpi_long.loc[0, "Inflation"] = np.nan

    # Long -> wide (Year × Month), with months back in calendar order.
    inflation_wide = cpi_long.pivot(
        index="Year",
        columns="Month",
        values="Inflation"
    )
    inflation_wide = inflation_wide[month_names].reset_index()

    # Persist the result.
    inflation_wide.to_csv(outfile, index=False)

    return inflation_wide

In [39]:

# Run the CPI pipeline once per food item; every call also writes its CSV.
eggs = process_cpi_wide(
    path="CPI Eggs Data 2010 - Sep. 2025.xlsx",
    outfile="eggs_clean.csv",
)
coffee = process_cpi_wide(
    path="CPI Coffee Data 2010 - Sep. 2025.xlsx",
    outfile="coffee_clean.csv",
)
bread = process_cpi_wide(
    path="CPI Bread Data 2010 - Sep. 2025.xlsx",
    outfile="bread_clean.csv",
)

# Preview the first rows of each inflation table.
for label, inflation_table in (
    ("Eggs:\n", eggs),
    ("Coffee:\n", coffee),
    ("Bread:\n", bread),
):
    print(label, inflation_table.head(), "\n")


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")


Eggs:
 Month  Year       Jan       Feb       Mar       Apr        May       Jun  \
0      2010       NaN  4.639463 -2.670940 -2.360044 -14.390107 -1.904137   
1      2011  0.725042 -5.426357  1.405152 -0.288684  -2.026636 -0.531915   
2      2012  3.468517 -7.271790 -1.334816  3.100338  -7.545107 -1.241869   
3      2013 -3.687095  1.655458 -2.035623 -0.363636  -2.502607 -0.588235   
4      2014 -0.888450 -0.498008  3.153153  2.814168  -5.804625 -2.404810   

Month       Jul        Aug        Sep        Oct        Nov       Dec  
0     -3.547523   5.412908  15.404872 -16.942384  15.041209  7.044776  
1     -2.139037   3.885853  13.793103  -3.903441  -1.870657  2.069717  
2     -1.317365  14.320388   0.265393   3.758602   0.153061  2.241467  
3     -1.398601   0.272777   3.210011   1.476015   0.000000  5.246753  
4      0.102669   1.487179  -0.454775  -0.964467   4.151717  8.759843   

Coffee:
 Month  Year       Jan       Feb        Mar       Apr       May       Jun  \
0      2010      

In [36]:
# NOTE(review): `process_cpi_chained` is not defined in any cell visible here, and
# this cell's execution count (36) is lower than the cells above it — presumably it
# relies on a definition from an earlier or removed cell. Confirm this still works
# after Restart Kernel -> Run All.
help(process_cpi_chained)

Help on function process_cpi_chained in module __main__:

process_cpi_chained(path, outfile)
    Reads a CPI Excel file, cleans it, converts it to long format,
    computes monthly inflation, and saves the processed file as CSV.

    Parameters:
        path (str): Path to the raw CPI Excel file.
        outfile (str): Path where the cleaned CSV should be saved.

    Returns:
        pandas.DataFrame: Long-format CPI dataset with inflation rates.

