In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# PCA-based feature ranking: standardize the complete rows of
# `combined_features`, fit a full PCA, keep the leading components needed to
# reach the cumulative-variance threshold, then rank the original columns by
# their summed squared loadings on those components.

# Step 1: Drop rows with any NaN values
features = combined_features.dropna(axis=0, how="any")
if features.empty:
    # Fail early with a clear message instead of passing an empty table on to
    # the scaler and PCA.
    raise ValueError(
        "No data left after dropping rows with NaN values; "
        "cannot run PCA on an empty feature table."
    )

# Step 2: No imputation, just keep the clean rows.
# The `features_imputed` name is kept as-is in case later cells refer to it.
features_imputed = features.copy()

# Step 3: Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_imputed)

# Step 4: Run PCA (no n_components given, so the full decomposition is fitted)
pca = PCA()
pca.fit(X_scaled)

# Cumulative explained-variance ratio; entry i covers components 0..i.
explained_var = np.cumsum(pca.explained_variance_ratio_)

# Step 5: Choose number of components (≥90% variance explained)
threshold = 0.9
# `explained_var` is non-decreasing, so searchsorted(side="left") yields the
# first index whose cumulative ratio is >= threshold. Unlike argmax on a
# boolean mask (which returns 0 when nothing matches and would silently keep a
# single component), searchsorted returns len(explained_var) when the
# threshold is never reached (e.g. threshold=1.0 with rounding error), so the
# result is clamped to keep every component in that case.
n_components = min(
    int(np.searchsorted(explained_var, threshold, side="left")) + 1,
    len(explained_var),
)
print(f"Keeping {n_components} components to explain ≥{threshold*100}% variance.")

# Step 6: Loadings (rows = original features, columns = retained components)
loadings = pd.DataFrame(
    pca.components_[:n_components].T,
    index=features.columns,
    columns=[f"PC{i+1}" for i in range(n_components)],
)

# Importance = sum of squared loadings over the retained components.
# NOTE: every retained component contributes equally here; the score is not
# weighted by each component's explained variance.
importance = (loadings**2).sum(axis=1).sort_values(ascending=False)

# Step 7: Select top features
# head() returns all rows when fewer than n_top_features columns exist.
n_top_features = 39
top_features = importance.head(n_top_features)
print("\nTop features by PCA importance:")
print(top_features)

selected_features = features[top_features.index]