In [53]:
import pandas as pd

# Location of each year's Excel workbook, keyed by the year as a string.
file_paths = {
    "2019": 'excel files/2019.xlsx',
    "2020": 'excel files/2020.xlsx',
    "2021": 'excel files/2021.xlsx',
    "2022": 'excel files/2022.xlsx'
}

# Read every workbook into its own DataFrame. The generator yields the frames
# in the order of the year keys listed below, which matches the unpacking order.
data_2019, data_2020, data_2021, data_2022 = (
    pd.read_excel(file_paths[year_key])
    for year_key in ("2019", "2020", "2021", "2022")
)

In [54]:
# Tag each yearly DataFrame with its year so rows stay attributable after the
# frames are concatenated. The frames are modified in place.
for year_value, yearly_frame in (
    (2019, data_2019),
    (2020, data_2020),
    (2021, data_2021),
    (2022, data_2022),
):
    yearly_frame['Year'] = year_value

In [55]:
# Stack the four yearly frames, in chronological order, into one table.
# ignore_index=True discards the per-file row labels and renumbers the rows.
all_data = pd.concat(
    objs=(data_2019, data_2020, data_2021, data_2022),
    ignore_index=True,
)

all_data

Unnamed: 0,Item,All\nconsumer\nunits,Birth year\nof 1997\nor later,Birth year\nfrom 1981\nto 1996,Birth year\nfrom 1965\nto 1980,Birth year\nfrom 1946\nto 1964,Birth year\nof 1945\nor earlier,Year
0,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0,2019
1,,,,,,,,2019
2,Consumer unit characteristics:,,,,,,,2019
3,,,,,,,,2019
4,Income before taxes,,,,,,,2019
...,...,...,...,...,...,...,...,...
2835,,,,,,,,2022
2836,,,,,,,,2022
2837,,,,,,,,2022
2838,,,,,,,,2022


In [56]:
# Keep only the rows of 'all_data' that have a value in the 'Item' column;
# rows whose 'Item' is missing are discarded. Original row labels are kept.
cleaned_data = all_data.loc[all_data['Item'].notna()]
cleaned_data.head(n=10)

Unnamed: 0,Item,All\nconsumer\nunits,Birth year\nof 1997\nor later,Birth year\nfrom 1981\nto 1996,Birth year\nfrom 1965\nto 1980,Birth year\nfrom 1946\nto 1964,Birth year\nof 1945\nor earlier,Year
0,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0,2019
2,Consumer unit characteristics:,,,,,,,2019
4,Income before taxes,,,,,,,2019
5,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0,2019
6,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26,2019
7,CV(%),2.38,9.35,2.23,1.64,5.69,5.01,2019
8,Income after taxes,,,,,,,2019
9,Mean,71487.0,26565.0,70565.0,90964.0,72201.0,40323.0,2019
10,SE,1312.27,2364.52,1449.17,1339.89,3109.51,1620.96,2019
11,CV(%),1.84,8.9,2.05,1.47,4.31,4.02,2019


In [57]:
# Map the multi-line birth-year column headers to short generation labels.
generation_labels = {
    'All\nconsumer\nunits': 'All Consumer Units',
    'Birth year\nof 1997\nor later': 'Gen Z',
    'Birth year\nfrom 1981\nto 1996': 'Millennials',
    'Birth year\nfrom 1965\nto 1980': 'Gen X',
    'Birth year\nfrom 1946\nto 1964': 'Baby Boomers',
    'Birth year\nof 1945\nor earlier': 'Silent Generation',
}

# rename() returns a new DataFrame; 'cleaned_data' keeps its original headers.
all_data_columns_renamed = cleaned_data.rename(columns=generation_labels)

all_data_columns_renamed

Unnamed: 0,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation,Year
0,Number of consumer units (in thousands),132242.00,3698.00,33033.00,35498.00,43148.00,16865.00,2019
2,Consumer unit characteristics:,,,,,,,2019
4,Income before taxes,,,,,,,2019
5,Mean,82852.00,27779.00,79514.00,106173.00,86251.00,43680.00,2019
6,SE,1973.48,2597.40,1771.38,1737.66,4904.74,2188.26,2019
...,...,...,...,...,...,...,...,...
2827,Estimated monthly rental value of owned home,,,,,,,2022
2828,Mean,1401.00,355.00,1098.00,1644.00,1585.00,1526.00,2022
2829,SE,16.87,48.82,28.08,29.18,29.65,47.52,2022
2830,RSE,1.20,13.75,2.56,1.78,1.87,3.11,2022


In [58]:
# Move 'Year' to the front; all other columns keep their existing relative order.
column_order = ['Year', *all_data_columns_renamed.columns.drop('Year')]
all_data_reordered = all_data_columns_renamed.loc[:, column_order]

# Preview the first rows of the reordered DataFrame (row labels are unchanged).
all_data_reordered.head()


Unnamed: 0,Year,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
2,2019,Consumer unit characteristics:,,,,,,
4,2019,Income before taxes,,,,,,
5,2019,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0
6,2019,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26


In [59]:
# Columns that take part in the emptiness check: everything except 'Year'.
subset_columns = all_data_reordered.columns.drop('Year').tolist()

# Remove rows in which every one of those columns is null, then renumber the
# rows 0..n-1. Because 'Item' is part of the subset, a row that has an 'Item'
# label is kept even when all of its value columns are null.
all_data_drop_blanks = (
    all_data_reordered
    .dropna(how='all', subset=subset_columns)
    .reset_index(drop=True)
)

# Preview the cleaned data
all_data_drop_blanks.head(n=25)


Unnamed: 0,Year,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
1,2019,Consumer unit characteristics:,,,,,,
2,2019,Income before taxes,,,,,,
3,2019,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0
4,2019,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26
5,2019,CV(%),2.38,9.35,2.23,1.64,5.69,5.01
6,2019,Income after taxes,,,,,,
7,2019,Mean,71487.0,26565.0,70565.0,90964.0,72201.0,40323.0
8,2019,SE,1312.27,2364.52,1449.17,1339.89,3109.51,1620.96
9,2019,CV(%),1.84,8.9,2.05,1.47,4.31,4.02


In [60]:
import numpy as np

# Derive a 'Main Category' and a 'Subcategory' label for every row of
# 'all_data_drop_blanks'.
#
# A row whose 'All Consumer Units' value is null is treated as a header row:
#   * it becomes the main category when no main category is active yet, or
#     when the row immediately after it is also a header row;
#   * otherwise it becomes the current subcategory.
# A row with no active subcategory uses its own 'Item' as its subcategory.
# Rows that appear before the first header of a year get NaN as main category.
#
# The labels are collected in plain Python lists and written once as
# object-dtype columns. Creating float NaN columns and then writing strings
# into them cell by cell relies on implicit upcasting, which pandas has
# deprecated, and is slow.
#
# The tracking state is reset whenever 'Year' changes, so the first rows of one
# year's table never inherit the last category/subcategory of the previous year.
#
# NOTE(review): a valued row that follows a Mean/SE/CV group (for example
# "Age of reference person") still inherits the preceding subcategory, because
# only header rows change the tracking state. Confirm whether that is intended.
item_labels = all_data_drop_blanks['Item'].tolist()
is_header_row = all_data_drop_blanks['All Consumer Units'].isna().tolist()
row_years = all_data_drop_blanks['Year'].tolist()
row_count = len(item_labels)

main_category_labels = []
subcategory_labels = []

main_category = None
current_subcategory = None
active_year = None
for i in range(row_count):
    if row_years[i] != active_year:
        # A new year's table starts here: forget the previous table's state.
        active_year = row_years[i]
        main_category = None
        current_subcategory = None

    if is_header_row[i]:
        next_is_header = i + 1 < row_count and is_header_row[i + 1]
        if main_category is None or next_is_header:
            # Current row is identified as a main category
            main_category = item_labels[i]
            current_subcategory = None  # a new main category starts with no subcategory
        else:
            # Current row is identified as a subcategory
            current_subcategory = item_labels[i]

    main_category_labels.append(np.nan if main_category is None else main_category)
    subcategory_labels.append(
        item_labels[i] if current_subcategory is None else current_subcategory
    )

# Build the columns with an explicit object dtype and the frame's own index so
# the assignment is positional and never triggers a dtype upcast.
all_data_drop_blanks['Main Category'] = pd.Series(
    main_category_labels, index=all_data_drop_blanks.index, dtype=object
)
all_data_drop_blanks['Subcategory'] = pd.Series(
    subcategory_labels, index=all_data_drop_blanks.index, dtype=object
)

In [61]:
# Wherever 'Subcategory' is still missing, fall back to the row's own 'Item'.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which under pandas Copy-on-Write acts
# on a temporary object and leaves the DataFrame unchanged.
all_data_drop_blanks['Subcategory'] = all_data_drop_blanks['Subcategory'].fillna(
    all_data_drop_blanks['Item']
)

In [62]:
# Optional clean-up: forward-fill remaining NaNs in 'Main Category'.
# - ffill() replaces the deprecated fillna(method='ffill').
# - The result is assigned back instead of using inplace=True on a selected
#   column, which under pandas Copy-on-Write would not update the DataFrame.
# - Filling is done within each 'Year' so a row never inherits a category from
#   a different year's table; leading NaNs of a year therefore stay NaN.
all_data_drop_blanks['Main Category'] = (
    all_data_drop_blanks.groupby('Year')['Main Category'].ffill()
)

In [63]:
# Column subset for checking the new category columns. Only the last expression
# of a cell is rendered, so this selection is evaluated but not displayed.
all_data_drop_blanks.loc[
    :, ['Year', 'Item', 'All Consumer Units', 'Main Category', 'Subcategory']
]
# First 25 rows with every column: this is the output the cell shows.
all_data_drop_blanks.head(n=25)

Unnamed: 0,Year,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation,Main Category,Subcategory
0,2019,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0,,Number of consumer units (in thousands)
1,2019,Consumer unit characteristics:,,,,,,,Consumer unit characteristics:,Consumer unit characteristics:
2,2019,Income before taxes,,,,,,,Consumer unit characteristics:,Income before taxes
3,2019,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0,Consumer unit characteristics:,Income before taxes
4,2019,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26,Consumer unit characteristics:,Income before taxes
5,2019,CV(%),2.38,9.35,2.23,1.64,5.69,5.01,Consumer unit characteristics:,Income before taxes
6,2019,Income after taxes,,,,,,,Consumer unit characteristics:,Income after taxes
7,2019,Mean,71487.0,26565.0,70565.0,90964.0,72201.0,40323.0,Consumer unit characteristics:,Income after taxes
8,2019,SE,1312.27,2364.52,1449.17,1339.89,3109.51,1620.96,Consumer unit characteristics:,Income after taxes
9,2019,CV(%),1.84,8.9,2.05,1.47,4.31,4.02,Consumer unit characteristics:,Income after taxes


In [66]:
# Target column layout: year and category labels first, then the item label,
# then the value columns.
new_order = [
    'Year',
    'Main Category',
    'Subcategory',
    'Item',
    'All Consumer Units',
    'Gen Z',
    'Millennials',
    'Gen X',
    'Baby Boomers',
    'Silent Generation',
]

# Select the columns in that order and rebind the name to the reordered frame.
all_data_drop_blanks = all_data_drop_blanks.loc[:, new_order]

all_data_drop_blanks.head(n=30)

Unnamed: 0,Year,Main Category,Subcategory,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,,Number of consumer units (in thousands),Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
1,2019,Consumer unit characteristics:,Consumer unit characteristics:,Consumer unit characteristics:,,,,,,
2,2019,Consumer unit characteristics:,Income before taxes,Income before taxes,,,,,,
3,2019,Consumer unit characteristics:,Income before taxes,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0
4,2019,Consumer unit characteristics:,Income before taxes,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26
5,2019,Consumer unit characteristics:,Income before taxes,CV(%),2.38,9.35,2.23,1.64,5.69,5.01
6,2019,Consumer unit characteristics:,Income after taxes,Income after taxes,,,,,,
7,2019,Consumer unit characteristics:,Income after taxes,Mean,71487.0,26565.0,70565.0,90964.0,72201.0,40323.0
8,2019,Consumer unit characteristics:,Income after taxes,SE,1312.27,2364.52,1449.17,1339.89,3109.51,1620.96
9,2019,Consumer unit characteristics:,Income after taxes,CV(%),1.84,8.9,2.05,1.47,4.31,4.02


In [67]:
# Keep only the rows of 'all_data_drop_blanks' that have a value in
# 'All Consumer Units'; header rows, whose value is null, are removed.
# Original row labels are kept, so the index has gaps afterwards.
Final_Data = all_data_drop_blanks.loc[
    all_data_drop_blanks['All Consumer Units'].notna()
]
Final_Data.head(n=30)

Unnamed: 0,Year,Main Category,Subcategory,Item,All Consumer Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,,Number of consumer units (in thousands),Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
3,2019,Consumer unit characteristics:,Income before taxes,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0
4,2019,Consumer unit characteristics:,Income before taxes,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26
5,2019,Consumer unit characteristics:,Income before taxes,CV(%),2.38,9.35,2.23,1.64,5.69,5.01
7,2019,Consumer unit characteristics:,Income after taxes,Mean,71487.0,26565.0,70565.0,90964.0,72201.0,40323.0
8,2019,Consumer unit characteristics:,Income after taxes,SE,1312.27,2364.52,1449.17,1339.89,3109.51,1620.96
9,2019,Consumer unit characteristics:,Income after taxes,CV(%),1.84,8.9,2.05,1.47,4.31,4.02
10,2019,Consumer unit characteristics:,Income after taxes,Age of reference person,51.6,20.2,30.7,46.2,63.1,80.8
12,2019,Consumer unit characteristics:,Average number in consumer unit:,People,2.5,1.8,2.8,3.1,2.1,1.6
13,2019,Consumer unit characteristics:,Average number in consumer unit:,Children under 18,0.6,0.3,1.0,1.0,0.2,0.0
