In [2]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('../data/original/consumerspending-justmoney.csv')

# Function to process the data and group by categories
def process_consumer_spending(df):
    # Forward fill NaN values in first column to identify categories
    df.iloc[:, 0] = df.iloc[:, 0].fillna('')
    
    # Initialize variables
    categories = []
    current_category = None
    grouped_data = []
    
    for index, row in df.iterrows():
        item = row.iloc[0]
        
        # Skip completely empty rows
        if item == '':
            current_category = None
            continue
        
        # Check if this is a main category (no indentation and not a subcategory)
        if not item.startswith(' ') and ',' not in item and ':' not in item:
            # We found a new main category
            current_category = item
            if current_category not in categories:
                categories.append(current_category)
        
        # Create a row with category information
        new_row = row.copy()
        if current_category:
            new_row['Category'] = current_category
            grouped_data.append(new_row)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(grouped_data)
    
    # Clean up the monetary values
    for year in range(2006, 2013):
        year_col = str(year)
        if year_col in result_df.columns:
            result_df[year_col] = result_df[year_col].apply(lambda x: 
                pd.to_numeric(str(x).replace('$', '').replace(',', ''), errors='coerce')
                if isinstance(x, str) else x)
    
    # Rename columns for clarity
    result_df.rename(columns={result_df.columns[0]: 'Item'}, inplace=True)
    
    return result_df

# Process the data
processed_df = process_consumer_spending(df)

# Example: Display items in the "Food" category
food_items = processed_df[processed_df['Category'] == 'Food']
print("Food category items:")
print(food_items[['Item', '2006', '2007', '2008', '2009', '2010', '2011', '2012']])

# Example: Display items in the "Housing" category
housing_items = processed_df[processed_df['Category'] == 'Housing']
print("\nHousing category items:")
print(housing_items[['Item', '2006', '2007', '2008', '2009', '2010', '2011', '2012']])

# Get the list of all categories
all_categories = processed_df['Category'].unique()
print("\nAll identified categories:")
print(all_categories)

# Function to analyze spending trends by category
def analyze_category_spending(df, category):
    category_data = df[df['Category'] == category]
    
    # Get the main category total (should be the first row)
    main_item = category_data.iloc[0]
    
    print(f"\nAnalysis for {category}:")
    print(f"Total spending in 2006: ${main_item['2006']:,.0f}")
    print(f"Total spending in 2012: ${main_item['2012']:,.0f}")
    
    change = (main_item['2012'] - main_item['2006']) / main_item['2006'] * 100
    print(f"Percent change from 2006 to 2012: {change:.1f}%")
    
    # Get top subcategories by 2012 spending
    subcategories = category_data.iloc[1:]  # Skip the main category row
    top_items = subcategories.nlargest(3, '2012')
    
    print(f"Top 3 subcategories by 2012 spending:")
    for idx, item in top_items.iterrows():
        print(f"  {item['Item']}: ${item['2012']:,.0f}")

# Example usage of the analysis function
analyze_category_spending(processed_df, 'Food')
analyze_category_spending(processed_df, 'Housing')

# Save the processed data to a new CSV file
processed_df.to_csv('processed_consumer_spending.csv', index=False)
print("\nProcessed data saved to 'processed_consumer_spending.csv'")

Food category items:
     Item    2006    2007  2008    2009  2010    2011    2012
1    Food  6111.0  6133.0  6443  6372.0  6129  6458.0  6599.0
145  Food   117.0    93.0    97    96.0    89    84.0   104.0

Housing category items:
        Item     2006     2007   2008     2009   2010     2011     2012
31   Housing  16366.0  16920.0  17109  16895.0  16557  16803.0  16887.0
147  Housing    264.0    225.0    229    202.0    199    194.0    191.0

All identified categories:
['Average Annual Expenditure ' 'Food' 'Food at home'
 'Cereals and bakery products' 'Cereals and cereal products'
 'Bakery products' 'Beef' 'Pork' 'Other meats' 'Poultry'
 'Fish and seafood' 'Eggs' 'Dairy products' 'Fresh milk and cream'
 'Other dairy products' 'Fruits and vegetables' 'Fresh fruits'
 'Fresh vegetables' 'Processed fruits' 'Processed vegetables'
 'Other food at home' 'Sugar and other sweets' 'Fats and oils'
 'Miscellaneous foods' 'Nonalcoholic beverages'
 'Food prepared by consumer unit on out-of-town tr

In [3]:
six_df = pd.read_csv('../data/processed/processed_consumer_spending.csv')

In [4]:
six_df.info

<bound method DataFrame.info of                                        Item     2006     2007   2008     2009  \
0               Average Annual Expenditure   48400.0  49638.0  50486  49067.0   
1                                      Food   6111.0   6133.0   6443   6372.0   
2                              Food at home   3417.0   3465.0   3744   3753.0   
3               Cereals and bakery products    446.0    460.0    507    506.0   
4               Cereals and cereal products    143.0    143.0    170    173.0   
..                                      ...      ...      ...    ...      ...   
141                     Other entertainment     52.0     68.0     59     57.0   
142  Personal care products and services d/     16.0     18.0     12     12.0   
143                              Reading d/      1.0      1.0      1      1.0   
144                               Education    210.0    283.0    324    229.0   
145                      All other gifts d/     94.0     93.0     99     76.0

In [5]:
six_df.dtypes

Item         object
2006        float64
2007        float64
2008          int64
2009        float64
2010          int64
2011        float64
2012        float64
Category     object
dtype: object

In [6]:
year_columns = [str(year) for year in range(2006, 2013)]

# Check which columns exist in the dataframe
existing_year_columns = [col for col in year_columns if col in df.columns]

# Convert float columns to integers
for col in existing_year_columns:
    # Check if the column has float values
    if df[col].dtype == 'float64':
        # Convert to integer, handling NaN values if present
        df[col] = df[col].fillna(0).astype(int)

# Display the updated data types
print("Updated data types after conversion:")
print(df.dtypes)
print("\n")

# Display a sample of the data to verify the conversion
print("Sample data after conversion:")
print(df.head())

# Save the cleaned data to a new CSV file
df.to_csv('cleaned_consumer_spending.csv', index=False)
print("\nCleaned data saved to 'cleaned_consumer_spending.csv'")

Updated data types after conversion:
Item    object
2006    object
2007    object
2008    object
2009    object
2010    object
2011    object
2012    object
dtype: object


Sample data after conversion:
                          Item     2006     2007     2008     2009     2010  \
0  Average Annual Expenditure   $48,400  $49,638  $50,486  $49,067  $48,109   
1                         Food    6,111    6,133    6,443    6,372    6,129   
2                 Food at home    3,417    3,465    3,744    3,753    3,624   
3  Cereals and bakery products      446      460      507      506      502   
4  Cereals and cereal products      143      143      170      173      165   

      2011     2012  
0  $49,705  $51,442  
1    6,458    6,599  
2    3,838    3,921  
3      531      538  
4      175      182  

Cleaned data saved to 'cleaned_consumer_spending.csv'


In [7]:
sixclean_df = pd.read_csv('cleaned_consumer_spending.csv')

In [8]:
sixclean_df.dtypes

Item    object
2006    object
2007    object
2008    object
2009    object
2010    object
2011    object
2012    object
dtype: object

In [9]:
pd.set_option('display.max_rows', None)

In [35]:
sixclean_df.duplicated(keep=False)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
31     False
32     False
33     False
34     False
35     False
36     False
37     False
38     False
39     False
40     False
41     False
42     False
43     False
44     False
45     False
46     False
47     False
48     False
49     False
50     False
51     False
52     False
53     False
54     False
55     False
56     False
57     False
58     False
60     False
61     False
62     False
63     False
64     False
65     False
66     False
67     False
68     False
69     False
71     False
72     False
73     False
74     False
75     False
76     False
77     False
78     False
79     False

In [37]:
sixclean_df = sixclean_df.drop_duplicates(keep='first')

In [45]:
sixclean_df[sixclean_df.duplicated(keep=False)]

Unnamed: 0,Item,2006,2007,2008,2009,2010,2011,2012


In [47]:
# Define the main sections to keep
main_sections = [
    "Food",
    "Housing",
    "Apparel and services",
    "Transportation",
    "Health care",
    "Entertainment", 
    "Education",
    "Personal insurance and pensions"
]

# Filter the DataFrame to only include the main sections
main_df = df[df['Item'].isin(main_sections)]

# Save the new DataFrame to a CSV file
main_df.to_csv('consumer_expenditure_main_sections.csv', index=False)

print(f"Extracted {len(main_df)} main sections to a new CSV file.")
print(main_df)

Extracted 15 main sections to a new CSV file.
                                Item    2006    2007    2008    2009    2010  \
1                               Food   6,111   6,133   6,443   6,372   6,129   
31                           Housing  16,366  16,920  17,109  16,895  16,557   
60              Apparel and services   1,874   1,881   1,801   1,725   1,700   
71                    Transportation   8,508   8,758   8,604   7,658   7,677   
84                       Health care   2,766   2,853   2,976   3,126   3,157   
90                     Entertainment   2,377   2,698   2,835   2,693   2,504   
100                        Education     888     945   1,046   1,068   1,074   
108  Personal insurance and pensions   5,270   5,336   5,605   5,471   5,373   
145                             Food     117      93      97      96      89   
147                          Housing     264     225     229     202     199   
155             Apparel and services     247     241     223     237     2

In [49]:
main_df

Unnamed: 0,Item,2006,2007,2008,2009,2010,2011,2012
1,Food,6111,6133,6443,6372,6129,6458,6599
31,Housing,16366,16920,17109,16895,16557,16803,16887
60,Apparel and services,1874,1881,1801,1725,1700,1740,1736
71,Transportation,8508,8758,8604,7658,7677,8293,8998
84,Health care,2766,2853,2976,3126,3157,3313,3556
90,Entertainment,2377,2698,2835,2693,2504,2572,2605
100,Education,888,945,1046,1068,1074,1051,1207
108,Personal insurance and pensions,5270,5336,5605,5471,5373,5424,5591
145,Food,117,93,97,96,89,84,104
147,Housing,264,225,229,202,199,194,191


In [63]:
main=main_df.drop([145,147,155,162,163,164,169])

In [65]:
main

Unnamed: 0,Item,2006,2007,2008,2009,2010,2011,2012
1,Food,6111,6133,6443,6372,6129,6458,6599
31,Housing,16366,16920,17109,16895,16557,16803,16887
60,Apparel and services,1874,1881,1801,1725,1700,1740,1736
71,Transportation,8508,8758,8604,7658,7677,8293,8998
84,Health care,2766,2853,2976,3126,3157,3313,3556
90,Entertainment,2377,2698,2835,2693,2504,2572,2605
100,Education,888,945,1046,1068,1074,1051,1207
108,Personal insurance and pensions,5270,5336,5605,5471,5373,5424,5591
