In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Function to read and clean the CSV files
def read_and_clean_csv(filename):
    """Load a CSV file and tidy its header and its first column.

    Surrounding whitespace and double-quote characters are removed from
    every column name and, when the first column holds text, from every
    value of that column. All other columns are returned exactly as
    ``pandas.read_csv`` parsed them.

    Parameters
    ----------
    filename : str, path object or file-like object
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed frame with cleaned column names and first column.
    """
    def _strip_quotes_and_space(text):
        # Whitespace goes first: read_csv keeps quotes literally when a blank
        # precedes them (e.g. ' "2006"'), and stripping '"' before the blank
        # would leave that quote in place. The final strip() removes blanks
        # that were inside the quotes.
        return text.str.strip().str.strip('"').str.strip()

    df = pd.read_csv(filename)

    df.columns = _strip_quotes_and_space(df.columns)

    first_column = df.iloc[:, 0]
    # Accept both classic object columns and the dedicated pandas string
    # dtype; numeric first columns are left untouched.
    holds_text = (
        pd.api.types.is_object_dtype(first_column.dtype)
        or isinstance(first_column.dtype, pd.StringDtype)
    )
    if holds_text:
        df.iloc[:, 0] = _strip_quotes_and_space(first_column)

    return df

# Function to extract main categories
def extract_main_categories(df):
    """Return the rows that open a group of items.

    A row is treated as a main category when its first-column value is not
    blank and the row directly above it is blank (NaN or the empty string).
    The very first row also counts when it is not blank.

    The scan is purely positional, so it works for any index (filtered,
    non-unique, non-integer) and for a frame with no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column carries the item labels.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, original index preserved.
    """
    labels = df.iloc[:, 0]
    is_blank = (labels.isna() | (labels == '')).to_numpy(dtype=bool)

    # previous_is_blank[i] describes row i-1; the slot for row 0 stays True
    # so that a non-blank first row is selected as well.
    previous_is_blank = np.ones(len(is_blank), dtype=bool)
    previous_is_blank[1:] = is_blank[:-1]

    starts_group = previous_is_blank & ~is_blank

    return df.iloc[np.flatnonzero(starts_group)].copy()

# Load both raw exports with the shared reader/cleaner
df1, df2 = (
    read_and_clean_csv(csv_path)
    for csv_path in (
        '../data/original/consumer06csv.csv',
        '../data/original/consumer13csv.csv',
    )
)

# Keep only the main-category rows of each frame
main_categories_df1, main_categories_df2 = map(extract_main_categories, (df1, df2))

# Convert dollar values to numeric, removing $ and commas
def clean_dollar_values(df):
    """Convert currency-formatted text columns to numbers, in place.

    Every column after the first is inspected. Text columns (classic object
    dtype or the dedicated pandas string dtype) have ``$`` and ``,`` removed
    and are then parsed with ``pandas.to_numeric``; values that still cannot
    be parsed become NaN. Columns that are already numeric are not touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is a label column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    for col in df.columns[1:]:  # the first column holds the item labels
        dtype = df[col].dtype
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue

        # Going through object dtype makes the parsed result a plain numpy
        # numeric column regardless of which string backend the input used.
        stripped = df[col].astype(object).str.replace(r'[$,]', '', regex=True)
        df[col] = pd.to_numeric(stripped, errors='coerce')

    return df

# Parse the currency strings of both frames into numbers
main_categories_df1, main_categories_df2 = (
    clean_dollar_values(frame)
    for frame in (main_categories_df1, main_categories_df2)
)

# Index both frames by 'Item' so they can be lined up label by label
main_categories_df1 = main_categories_df1.set_index('Item')
main_categories_df2 = main_categories_df2.set_index('Item')

# Place the year columns of both frames side by side, then turn 'Item'
# back into an ordinary column
combined_df = pd.concat(
    [main_categories_df1, main_categories_df2],
    axis=1,
).reset_index()

# Show the result
print("Combined data for main spending categories (2006-2020):")
print(combined_df)

# Persist the combined table without the positional index
combined_df.to_csv('combined_consumer_spending_2006_2020.csv', index=False)
print("\nData saved to 'combined_consumer_spending_2006_2020.csv'")

Combined data for main spending categories (2006-2020):
                                              Item     2006     2007     2008  \
0                       Average Annual Expenditure  48400.0  49638.0  50486.0   
1                              Alcoholic beverages    497.0    457.0    444.0   
2                                          Housing  16366.0  16920.0  17109.0   
3                             Apparel and services   1874.0   1881.0   1801.0   
4                                   Transportation   8508.0   8758.0   8604.0   
5                                      Health care   2766.0   2853.0   2976.0   
6                                    Entertainment   2377.0   2698.0   2835.0   
7              Personal care products and services    585.0    588.0    616.0   
8                                          Reading    117.0    118.0    116.0   
9                                        Education    888.0    945.0   1046.0   
10           Tobacco products and smoking supplies   

In [9]:
# Show the dtype of every column of the combined frame.
combined_df.dtypes

Item     object
2006    float64
2007    float64
2008    float64
2009    float64
2010    float64
2011    float64
2012    float64
2013    float64
2014    float64
2015    float64
2016    float64
2017    float64
2018    float64
2019    float64
2020    float64
dtype: object

In [11]:
# Summary statistics for every column; include='all' also covers the
# non-numeric 'Item' column (count / unique / top / freq).
# NOTE(review): the recorded output reports 29 'Item' rows but only 18-19
# non-null values per year column, and negative minima for 2006-2012 --
# presumably some labels appear in only one source file; verify the data.
combined_df.describe(include='all')

Unnamed: 0,Item,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
count,29,18.0,18.0,18.0,18.0,18.0,18.0,18.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
unique,29,,,,,,,,,,,,,,,
top,Average Annual Expenditure,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,
mean,,11372.388889,12044.777778,12088.111111,11781.166667,11699.611111,12138.722222,12354.666667,12693.315789,13061.894737,13586.736842,14351.157895,14806.052632,15389.368421,15933.947368,15523.947368
std,,21005.255021,21644.197289,21987.30081,21671.92215,21488.303191,21816.884694,22589.218261,20352.596583,21264.630114,22131.195215,23367.363813,23375.902831,24613.734019,25909.071358,26586.878952
min,,-7053.0,-2520.0,-4072.0,-5416.0,-4458.0,-1826.0,-5092.0,102.0,103.0,114.0,118.0,110.0,108.0,92.0,114.0
25%,,650.25,643.0,672.0,651.0,648.75,669.25,678.25,891.5,1009.0,1093.0,1144.0,1250.5,1200.0,1171.0,1089.0
50%,,2125.5,2057.0,1795.0,1914.5,1734.5,1876.0,2069.5,3631.0,4290.0,4342.0,4612.0,4928.0,4968.0,5193.0,3584.0
75%,,7698.5,7902.5,7854.25,7111.25,7101.0,7575.75,8146.25,10204.0,8793.0,9340.5,9769.0,12008.0,12036.5,11167.5,9614.0
