In [None]:
# Cell 1: Manual PCA on Iris dataset (following the 5 steps)

import numpy as np
from sklearn.datasets import load_iris

# Load data
iris = load_iris()
X = iris.data   # shape (150, 4)

# 1. Data standardization: Z = (X - μ) / σ
mu = X.mean(axis=0)
# Sample std (ddof=1) so the normalisation agrees with np.cov's default
# (N - 1) divisor used in step 2.
sigma = X.std(axis=0, ddof=1)
Z = (X - mu) / sigma

# 2. Covariance matrix of the standardized data
#    (columns are variables, hence rowvar=False).
C = np.cov(Z, rowvar=False)    # shape (4, 4)

# 3. Eigenvalues and eigenvectors of the covariance matrix.
#    C is symmetric, so use eigh: it guarantees real eigenvalues and
#    orthonormal eigenvectors, whereas the general-purpose eig can return
#    complex dtype / non-orthogonal vectors because of round-off.
#    Note that eigh returns the eigenvalues in ASCENDING order.
eigvals, eigvecs = np.linalg.eigh(C)

#    An eigenvector is only defined up to sign. Fix the sign so that the
#    largest-magnitude loading of every eigenvector is positive; this makes
#    W and Z_reduced reproducible regardless of the LAPACK build.
largest_loading_rows = np.argmax(np.abs(eigvecs), axis=0)
eigvec_signs = np.sign(eigvecs[largest_loading_rows, np.arange(eigvecs.shape[1])])
eigvecs = eigvecs * eigvec_signs

# 4. Sort eigenvalues (descending) and build the feature transformation
#    matrix W from the k leading eigenvectors (k = 2).
idx = np.argsort(eigvals)[::-1]     # indices of eigenvalues sorted descending
k = 2
W = eigvecs[:, idx[:k]]             # shape (4, 2)

# 5. Transform data: Z_reduced = Z · W (projection onto the top-k components)
Z_reduced = Z @ W                   # shape (150, 2)

print("Mean (μ):", mu)
print("Std (σ):", sigma)
print("\nCovariance matrix C:\n", C)
print("\nEigenvalues:\n", eigvals)
print("\nEigenvectors (columns):\n", eigvecs)
print("\nSorted eigenvalues (desc):\n", eigvals[idx])
print("\nTransformation matrix W (top 2 eigenvectors):\n", W)
print("\nReduced data Z_reduced shape:", Z_reduced.shape)
print("\nFirst 5 rows of Z_reduced:\n", Z_reduced[:5])


Mean (μ): [5.84333333 3.05733333 3.758      1.19933333]
Std (σ): [0.82806613 0.43586628 1.76529823 0.76223767]

Covariance matrix C:
 [[ 1.         -0.11756978  0.87175378  0.81794113]
 [-0.11756978  1.         -0.4284401  -0.36612593]
 [ 0.87175378 -0.4284401   1.          0.96286543]
 [ 0.81794113 -0.36612593  0.96286543  1.        ]]

Eigenvalues:
 [2.91849782 0.91403047 0.14675688 0.02071484]

Eigenvectors (columns):
 [[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]

Sorted eigenvalues (desc):
 [2.91849782 0.91403047 0.14675688 0.02071484]

Transformation matrix W (top 2 eigenvectors):
 [[ 0.52106591 -0.37741762]
 [-0.26934744 -0.92329566]
 [ 0.5804131  -0.02449161]
 [ 0.56485654 -0.06694199]]

Reduced data Z_reduced shape: (150, 2)

First 5 rows of Z_reduced:
 [[-2.25714118 -0.47842383]
 [-2.07401302  0.67188269]
 [-2.35633511 

In [None]:
# Cell 2: PCA using sklearn (for quick check)

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize using sklearn (same idea as manual).
# NOTE: StandardScaler divides by the population std (ddof=0), while the
# manual cell divides by the sample std (ddof=1). The sklearn scores are
# therefore larger than the manual ones by a constant factor sqrt(n / (n - 1)).
scaler = StandardScaler()
Z_sklearn = scaler.fit_transform(X)

# PCA with 2 components
pca = PCA(n_components=2)
Z_pca = pca.fit_transform(Z_sklearn)

print("Sklearn PCA – Explained variance ratio:", pca.explained_variance_ratio_)
print("Sklearn PCA result shape:", Z_pca.shape)
print("\nFirst 5 rows of sklearn PCA output:\n", Z_pca[:5])

# Actual check against the manual result (Z_reduced from the previous cell).
# Two harmless differences are removed before comparing:
#   * the ddof scaling factor described above, and
#   * the sign of each component, which is arbitrary for any eigenvector.
n_samples = X.shape[0]
ddof_scale = np.sqrt((n_samples - 1) / n_samples)
n_common = min(Z_reduced.shape[1], Z_pca.shape[1])
manual_scores = Z_reduced[:, :n_common]
sklearn_scores = Z_pca[:, :n_common] * ddof_scale
# Per-component sign alignment: the dot product of matching columns is
# positive when they point the same way and negative when flipped.
col_signs = np.sign(np.sum(manual_scores * sklearn_scores, axis=0))
print(
    "\nManual PCA matches sklearn (up to sign and ddof scaling):",
    np.allclose(manual_scores, sklearn_scores * col_signs),
)


Sklearn PCA – Explained variance ratio: [0.72962445 0.22850762]
Sklearn PCA result shape: (150, 2)

First 5 rows of sklearn PCA output:
 [[-2.26470281  0.4800266 ]
 [-2.08096115 -0.67413356]
 [-2.36422905 -0.34190802]
 [-2.29938422 -0.59739451]
 [-2.38984217  0.64683538]]
