In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the dataset and keep only its numeric columns as a 2-D array
# (rows are samples, columns are features).
df: pd.DataFrame = pd.read_csv("./datasets/organizations.csv")
X: np.ndarray = df.select_dtypes(include=["number"]).values
n_samples, n_features = X.shape

# Subtract each column's mean so every feature is centered on zero.
X_centered: np.ndarray = X - np.mean(X, axis=0)

# Covariance of the centered data. The divisor is n_samples (not
# n_samples - 1), i.e. the population form of the covariance.
cov_matrix: np.ndarray = np.matmul(X_centered.T, X_centered) / n_samples

# The covariance matrix is symmetric, so the symmetric eigen-solver is used.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# eigh reports eigenvalues in ascending order; reverse that ordering so the
# largest eigenvalue comes first, and reorder the eigenvector columns with the
# same permutation so each column still matches its eigenvalue.
idx: np.ndarray = np.argsort(eigenvalues)[::-1]
eigenvalues, eigenvectors = eigenvalues[idx], eigenvectors[:, idx]

# Share of the total variance carried by each component, plus its running sum.
variance_explained: np.ndarray = eigenvalues / np.sum(eigenvalues)
cumulative_variance: np.ndarray = variance_explained.cumsum()

# Two side-by-side panels: per-component variance on the left, running total
# on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: scree plot, one bar per principal component (numbered from 1),
# with heights expressed as percentages.
ax1.bar(range(1, n_features + 1), 100 * variance_explained)
ax1.set(
    xlabel="Principal Component",
    ylabel="Variance Explained (%)",
    title="Scree Plot",
)

# Right panel: cumulative variance curve with dashed reference lines at the
# 80% and 95% levels.
ax2.plot(range(1, n_features + 1), 100 * cumulative_variance, "bo-")
ax2.axhline(80, linestyle="--", color="r", label="80%")
ax2.axhline(95, linestyle="--", color="g", label="95%")
ax2.set(
    xlabel="Number of Components",
    ylabel="Cumulative Variance (%)",
    title="Cumulative Variance Explained",
)
ax2.legend()
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics
# Number of leading components needed to reach each cumulative-variance
# threshold. np.argmax on a boolean array yields the index of the first True;
# adding 1 turns that 0-based index into a component count.
# NOTE: argmax also yields 0 when no entry is True, so a threshold that is
# never reached (e.g. NaN in the cumulative series) would be reported as 1.
n_80: int = int(np.argmax(cumulative_variance >= 0.80) + 1)
n_95: int = int(np.argmax(cumulative_variance >= 0.95) + 1)

# One row per principal component, with both shares expressed in percent.
summary: pd.DataFrame = pd.DataFrame({
    "PC": range(1, n_features + 1),
    "Variance (%)": variance_explained * 100,
    "Cumulative (%)": cumulative_variance * 100,
})

print(summary.round(2))
print(f"\nComponents for 80% variance: {n_80}")
print(f"Components for 95% variance: {n_95}")
# The arrow was previously stored as mis-decoded UTF-8 bytes and printed as
# garbage; the \u2192 escape (RIGHTWARDS ARROW) keeps the source ASCII-only so
# it cannot be corrupted by a re-encoding of the file again.
# The percentage is the fraction of original features dropped when keeping
# only the components needed for 80% of the variance.
print(f"Compression: {n_features} \u2192 {n_80} ({(1-n_80/n_features)*100:.1f}%)")