In [48]:
import pandas as pd

# Location of each year's Excel workbook, keyed by the year as a string
file_paths = {
    "2019": 'excel files/2019.xlsx',
    "2020": 'excel files/2020.xlsx',
    "2021": 'excel files/2021.xlsx',
    "2022": 'excel files/2022.xlsx'
}

# Read each workbook into its own DataFrame, oldest year first
data_2019, data_2020, data_2021, data_2022 = (
    pd.read_excel(file_paths[year]) for year in ("2019", "2020", "2021", "2022")
)

In [49]:
# Tag every row with the year of the workbook it was loaded from.
# The scalar assignment broadcasts the same year to all rows of a frame.
for year, frame in (
    (2019, data_2019),
    (2020, data_2020),
    (2021, data_2021),
    (2022, data_2022),
):
    frame['Year'] = year

In [50]:
# Stack the four yearly DataFrames on top of each other.
# Each frame keeps its own row labels here, so index values repeat across years.
yearly_frames = [data_2019, data_2020, data_2021, data_2022]
all_data = pd.concat(yearly_frames)

all_data

Unnamed: 0,Item,All\nconsumer\nunits,Birth year\nof 1997\nor later,Birth year\nfrom 1981\nto 1996,Birth year\nfrom 1965\nto 1980,Birth year\nfrom 1946\nto 1964,Birth year\nof 1945\nor earlier,Year
0,Number of consumer units (in thousands),132242,3698,33033,35498,43148,16865,2019
1,,,,,,,,2019
2,Consumer unit characteristics:,,,,,,,2019
3,,,,,,,,2019
4,Income before taxes,,,,,,,2019
...,...,...,...,...,...,...,...,...
684,d No data reported.,,,,,,,2022
685,,,,,,,,2022
686,,,,,,,,2022
687,,,,,,,,2022


In [51]:
# Short labels for the multi-line column headers read from the workbooks;
# columns not listed here (e.g. 'Item', 'Year') keep their current names.
generation_labels = {
    'All\nconsumer\nunits': 'All_Consumer_Units',
    'Birth year\nof 1997\nor later': 'Gen Z',
    'Birth year\nfrom 1981\nto 1996': 'Millennials',
    'Birth year\nfrom 1965\nto 1980': 'Gen X',
    'Birth year\nfrom 1946\nto 1964': 'Baby Boomers',
    'Birth year\nof 1945\nor earlier': 'Silent Generation',
}

# rename() returns a new DataFrame; all_data itself is left unchanged
all_data_columns_renamed = all_data.rename(columns=generation_labels)

all_data_columns_renamed

Unnamed: 0,Item,All_Consumer_Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation,Year
0,Number of consumer units (in thousands),132242,3698,33033,35498,43148,16865,2019
1,,,,,,,,2019
2,Consumer unit characteristics:,,,,,,,2019
3,,,,,,,,2019
4,Income before taxes,,,,,,,2019
...,...,...,...,...,...,...,...,...
684,d No data reported.,,,,,,,2022
685,,,,,,,,2022
686,,,,,,,,2022
687,,,,,,,,2022


In [52]:
# Put 'Year' first and keep every other column in its existing order
column_order = ['Year'] + [col for col in all_data_columns_renamed.columns if col != 'Year']

# Reorder the columns and replace the existing row labels with a fresh
# 0..n-1 index (drop=True discards the old labels instead of keeping them as
# a column). Built as one chained expression rather than
# reset_index(inplace=True), so the frame is never mutated after assignment.
all_data_reordered = (
    all_data_columns_renamed[column_order]
    .reset_index(drop=True)
)

# Preview the first rows of the reordered, reindexed DataFrame
all_data_reordered.head()


Unnamed: 0,Year,Item,All_Consumer_Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
1,2019,,,,,,,
2,2019,Consumer unit characteristics:,,,,,,
3,2019,,,,,,,
4,2019,Income before taxes,,,,,,


In [54]:
# Every column except 'Year' takes part in the blank-row check; 'Year' is
# excluded so that it alone does not decide whether a row counts as blank
subset_columns = [name for name in all_data_reordered.columns if name != 'Year']

# Discard rows that have no value in any of those columns
all_data__drop_blanks = all_data_reordered.dropna(subset=subset_columns, how='all')

# Preview the first rows after the blank ones are removed
all_data__drop_blanks.head()


Unnamed: 0,Year,Item,All_Consumer_Units,Gen Z,Millennials,Gen X,Baby Boomers,Silent Generation
0,2019,Number of consumer units (in thousands),132242.0,3698.0,33033.0,35498.0,43148.0,16865.0
2,2019,Consumer unit characteristics:,,,,,,
4,2019,Income before taxes,,,,,,
5,2019,Mean,82852.0,27779.0,79514.0,106173.0,86251.0,43680.0
6,2019,SE,1973.48,2597.4,1771.38,1737.66,4904.74,2188.26
