# Ch 8 - Dimensionality Reduction

This is the Python code for this chapter. See the associated notes for more details.

## Principal Component Analysis

As a simple example, we can use NumPy to compute the principal components for us.

In [1]:
import numpy as np

# Build a small 3-D toy data set (m instances x 3 features).
# The third feature is a noisy linear combination of the first two.

np.random.seed(4)   # fixed seed so the data below is reproducible
m = 60              # number of instances
w1, w2 = 0.1, 0.3   # weights used to build the third feature
noise = 0.1         # scale of the Gaussian noise

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# NOTE: the three randn() draws must stay in this order (feature 0, 1, 2),
# otherwise the generated values change.
x0 = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

# Assemble the feature vectors as the columns of X, shape (m, 3)
X = np.column_stack((x0, x1, x2))

PCA requires the data set to be centered about the origin, so we center it manually here by subtracting each feature's mean. In Scikit-Learn, the PCA class does this for us automatically.

In [9]:
# Subtract each column's mean (broadcast over the rows) so every feature is centered on zero
X_centered = X - np.mean(X, axis=0)

In [22]:
# Display the centered data matrix (one row per instance, one column per feature)
X_centered

array([[-1.03976771e+00, -7.60238460e-01, -3.32880482e-01],
       [-3.17841939e-02,  3.90260570e-01, -3.64766659e-02],
       [-9.77238797e-01, -6.73862060e-01, -3.20757101e-01],
       [-9.44190485e-01,  7.70779228e-04, -4.97304144e-02],
       [-7.87164831e-01, -5.10641487e-02,  1.19970744e-01],
       [ 1.09409378e+00,  1.15762056e-01,  2.45551498e-01],
       [-1.04665623e+00, -8.53165791e-01, -2.05241169e-01],
       [ 6.49452398e-01, -4.82750342e-01, -7.94325731e-02],
       [ 9.92128132e-01,  3.06140931e-01,  3.96278747e-01],
       [ 5.25509785e-01,  4.67955007e-01,  1.62461684e-01],
       [-1.01367188e+00, -2.00458976e-01, -1.93074561e-01],
       [ 1.10841362e+00,  7.29745189e-02, -1.82449496e-03],
       [-1.01744457e+00, -4.77653389e-01, -2.29165228e-01],
       [-9.71704237e-01, -7.08910047e-01, -2.10833327e-01],
       [ 1.07688965e+00, -3.86770525e-02,  2.63501050e-02],
       [-3.70113351e-01,  2.44018985e-01, -7.21578839e-03],
       [ 6.66958762e-01, -4.82702763e-01

In [18]:
# First feature only: column 0 of the centered data, one value per training instance

X_centered[:, 0]

array([-1.03976771, -0.03178419, -0.9772388 , -0.94419049, -0.78716483,
        1.09409378, -1.04665623,  0.6494524 ,  0.99212813,  0.52550978,
       -1.01367188,  1.10841362, -1.01744457, -0.97170424,  1.07688965,
       -0.37011335,  0.66695876,  0.65896157,  0.85333565, -1.08496872,
        0.49929899, -0.96826148, -0.99569625, -1.12967405,  1.15080196,
        0.88930378, -0.31095574,  1.07840518, -0.95988339,  0.63765426,
        0.07489424, -1.10509902,  0.00597516,  0.95439882,  0.94850707,
       -0.09331005, -0.00716575, -0.42951228,  0.61756082, -0.664188  ,
        0.70857275, -0.47406479, -1.0086781 ,  1.09532478,  0.76034321,
        0.14229753, -0.69778492,  1.09022918, -0.1378834 , -0.4787847 ,
       -1.18197222,  0.81087359, -0.49347043,  1.12439206,  0.21633176,
        1.08160954, -1.03558753,  0.50112667, -1.11982458,  0.56285699])

In [29]:
# Sanity check: after centering, the mean of every feature column should be 0
# (up to floating-point round-off, i.e. values around 1e-17)

for i in (0, 1, 2):
    print(X_centered[:, i].mean())

-5.551115123125783e-18
-4.625929269271485e-18
-1.850371707708594e-17


Now we use NumPy to compute the SVD for us:

In [10]:
# Decompose the centered data; NumPy returns the left singular vectors (U),
# the singular values (sigma) and the transposed right singular vectors (Vt).
# full_matrices=True is NumPy's default, spelled out here for clarity.
U, sigma, Vt = np.linalg.svd(X_centered, full_matrices=True)

Finally, we can extract the principal components from the $V$ matrix. Note that NumPy's SVD returns $V^T$ rather than $V$, so we must transpose it back: the principal components are the columns of $V$ (equivalently, the rows of $V^T$).

In [25]:
# Transpose Vt back to V; the k-th column of V is the k-th principal component
c1, c2, c3 = (Vt.T[:, k] for k in range(3))

In [26]:
# Print each principal component (unit vector in feature space) with its label
pc_labels = ("First PC: ", "Second PC: ", "Third PC: ")
for label, component in zip(pc_labels, (c1, c2, c3)):
    print(label, component)

First PC:  [0.93636116 0.29854881 0.18465208]
Second PC:  [-0.34027485  0.90119108  0.2684542 ]
Third PC:  [-0.08626012 -0.31420255  0.94542898]
