In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ------------------------------------------------
# 1) Load raw data
# ------------------------------------------------
df = pd.read_csv('data/Covid_Deaths.csv')

# ------------------------------------------------
# 2) Pivot to a wide count matrix
#    - one row per (jurisdiction_residence, year, month)
#    - one column per distinct subgroup1 value
#    - each cell is the summed COVID_deaths; cells with no data become 0
# ------------------------------------------------
df_counts = pd.pivot_table(
    df,
    values='COVID_deaths',
    index=['jurisdiction_residence', 'year', 'month'],
    columns='subgroup1',
    aggfunc='sum',
    fill_value=0,
)

# ------------------------------------------------
# 3) Keep only subgroup columns whose variance is strictly positive
#    Constant columns are removed; a NaN variance compares False against 0,
#    so such a column is removed as well.
# ------------------------------------------------
df_counts = df_counts.loc[:, lambda counts: counts.var().gt(0)]

# ------------------------------------------------
# 4) Build preprocessing + PCA pipeline
#    - log1p transform to tame large counts
#    - standardize each subgroup column (zero mean, unit variance)
#    - PCA to 2 components
# ------------------------------------------------
pipeline = Pipeline([
    ('log1p', FunctionTransformer(np.log1p, validate=False)),
    ('scale', StandardScaler()),
    # random_state is pinned for reproducibility: with the default
    # svd_solver='auto', scikit-learn may select a randomized SVD solver
    # depending on the data shape and library version, in which case the
    # components could differ between runs of this cell.
    ('pca', PCA(n_components=2, random_state=0))
])

# ------------------------------------------------
# 5) Fit & transform
#    X_pca has one row per row of df_counts (same order) and one column
#    per principal component.
# ------------------------------------------------
X_pca = pipeline.fit_transform(df_counts)

# ------------------------------------------------
# 6) Extract and display subgroup loadings for each PC
#    pca.components_ has one row per component and one column per input
#    feature; transposing gives one row per subgroup. These are the
#    unit-length principal-axis weights as stored by scikit-learn (they are
#    not rescaled by the explained variance).
# ------------------------------------------------
pca = pipeline.named_steps['pca']
loadings = pca.components_.T  # shape: [n_subgroups, n_components]
loadings_df = pd.DataFrame(
    loadings,
    index=df_counts.columns,
    # Label columns from the actual number of fitted components so that
    # changing n_components in the pipeline does not cause a shape mismatch
    # here. With two components this yields ['PC1', 'PC2'].
    columns=[f'PC{i}' for i in range(1, loadings.shape[1] + 1)]
)
print("Subgroup PCA Loadings:\n", loadings_df)

# ------------------------------------------------
# 7) Top contributors for each principal component
#    Subgroups are ranked by the absolute value of their weight; the values
#    printed are those magnitudes, so the sign of each weight is not shown.
# ------------------------------------------------
n_top = 10
for pc, weights in loadings_df.items():
    print(f"\nTop {n_top} subgroups driving {pc}:")
    strongest = weights.abs().nlargest(n_top)
    print(strongest)

Subgroup PCA Loadings:
                                                          PC1       PC2
subgroup1                                                             
0-4 years                                           0.174124  0.466103
12-17 years                                         0.176159  0.470207
18-29 years                                         0.239282  0.082999
30-39 years                                         0.249790 -0.029962
40-49 years                                         0.254226 -0.094006
5-11 years                                          0.161058  0.492232
50-64 years                                         0.260892 -0.146720
65-74 years                                         0.265175 -0.137776
75 years and over                                   0.255493 -0.137069
Female                                              0.261675 -0.137262
Hispanic                                            0.249992 -0.124665
Male                                                0