In [23]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('../data/original/consumermoney2013.csv')

# Function to process the data and group by categories
def process_consumer_spending(df):
    # Forward fill NaN values in first column to identify categories
    df.iloc[:, 0] = df.iloc[:, 0].fillna('')
    
    # Initialize variables
    categories = []
    current_category = None
    grouped_data = []
    
    for index, row in df.iterrows():
        item = row.iloc[0]
        
        # Skip completely empty rows
        if item == '':
            current_category = None
            continue
        
        # Check if this is a main category (no indentation and not a subcategory)
        if not item.startswith(' ') and ',' not in item and ':' not in item:
            # We found a new main category
            current_category = item
            if current_category not in categories:
                categories.append(current_category)
        
        # Create a row with category information
        new_row = row.copy()
        if current_category:
            new_row['Category'] = current_category
            grouped_data.append(new_row)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(grouped_data)
    
    # Clean up the monetary values
    for year in range(2013, 2020):
        year_col = str(year)
        if year_col in result_df.columns:
            result_df[year_col] = result_df[year_col].apply(lambda x: 
                pd.to_numeric(str(x).replace('$', '').replace(',', ''), errors='coerce')
                if isinstance(x, str) else x)
    
    # Rename columns for clarity
    result_df.rename(columns={result_df.columns[0]: 'Item'}, inplace=True)
    
    return result_df

# Process the data
processed_df = process_consumer_spending(df)

# Example: Display items in the "Food" category
food_items = processed_df[processed_df['Category'] == 'Food']
print("Food category items:")
print(food_items[['Item', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']])

# Example: Display items in the "Housing" category
housing_items = processed_df[processed_df['Category'] == 'Housing']
print("\nHousing category items:")
print(housing_items[['Item', '2013', '2014', '2015', '2016', '2017', '2018', '2019','2020']])

# Get the list of all categories
all_categories = processed_df['Category'].unique()
print("\nAll identified categories:")
print(all_categories)

# Save the processed data to a new CSV file
processed_df.to_csv('processed_consumer_spending2.csv', index=False)
print("\nProcessed data saved to 'processed_consumer_spending2.csv'")

Food category items:
     Item    2013    2014    2015    2016    2017    2018    2019   2020
2    Food  6602.0  6759.0  7023.0  7203.0  7729.0  7923.0  8169.0  7,316
149  Food    78.0    75.0    91.0    74.0    88.0    91.0    98.0     c/

Housing category items:
        Item     2013     2014     2015     2016     2017     2018     2019  \
32   Housing  17148.0  17798.0  18409.0  18886.0  19884.0  20091.0  20679.0   
151  Housing    201.0    222.0    244.0    238.0    233.0    260.0    248.0   

       2020  
32   21,409  
151      c/  

All identified categories:
['Average annual expenditures' 'Food' 'Food at home'
 'Cereals and bakery products' 'Cereals and cereal products'
 'Bakery products' 'Beef' 'Pork' 'Other meats' 'Poultry'
 'Fish and seafood' 'Eggs' 'Dairy products' 'Fresh milk and cream'
 'Other dairy products' 'Fruits and vegetables' 'Fresh fruits'
 'Fresh vegetables' 'Processed fruits' 'Processed vegetables'
 'Other food at home' 'Sugar and other sweets' 'Fats and oils'
 