In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.preprocessing import scale
# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed later on. Prefer the new name
# when this matplotlib knows it and fall back to the old one otherwise.
plt.style.use(
    'seaborn-v0_8-notebook'
    if 'seaborn-v0_8-notebook' in plt.style.available
    else 'seaborn-notebook'
)
%matplotlib notebook

In [None]:
# Load the iris data set and collect it in a DataFrame: one column per
# measured feature plus the species code in the 'target' column.
raw_data = load_iris()
variables = raw_data['feature_names']
data = pd.DataFrame(
    raw_data['data'],
    columns=variables,
).assign(target=raw_data['target'])
data

In [None]:
# Names of the measured features; these are the column labels used in the cells below.
print(variables)

In [None]:
# Which codes occur in the target column?
print(set(data['target'].to_numpy()))
# Species name for each target code (used for the plot legends).
flower_names = dict(enumerate(('setosa', 'versicolor', 'virginica')))

In [None]:
# Raw measurements in 3D, one colour per species.
fig1 = plt.figure(constrained_layout=True)
ax1 = fig1.add_subplot(111, projection='3d')

# One sub-frame per species; later cells reuse class0/class1/class2.
class0, class1, class2 = (
    data.loc[data['target'] == code] for code in (0, 1, 2)
)

# The three features shown on the x, y and z axes.
x, y, z = 'sepal length (cm)', 'petal length (cm)', 'petal width (cm)'

for klass in (class0, class1, class2):
    # All rows of a sub-frame share one target code; look up its name once.
    species = flower_names[klass['target'].values[0]]
    ax1.scatter(klass[x].values, klass[y].values, klass[z].values, s=150, label=species)

ax1.legend()
ax1.set_xlabel(x, labelpad=10)
ax1.set_ylabel(y, labelpad=10)
ax1.set_zlabel(z, labelpad=10);

In [None]:
# Principal Component Analysis on the scaled feature matrix.
X = scale(data[variables].to_numpy())
pca = PCA()
# "scores" are the samples expressed in the new (principal component) coordinates.
scores = pca.fit_transform(X)

In [None]:
# Scatter plot of the scores: first principal component against the second.
fig2, ax2 = plt.subplots(constrained_layout=True)
ax2.scatter(*scores[:, :2].T)
ax2.set_xlabel('Principal component 1')
ax2.set_ylabel('Principal component 2');

In [None]:
# The same score plot, now coloured by the original species.
fig3, ax3 = plt.subplots(constrained_layout=True)
# Index labels of the samples of each species, used as row positions into
# `scores` (later cells reuse idx0/idx1/idx2).
idx0, idx1, idx2 = (
    data.index[data['target'] == code].tolist() for code in (0, 1, 2)
)
for code, rows in enumerate((idx0, idx1, idx2)):
    ax3.scatter(scores[rows, 0], scores[rows, 1], label=flower_names[code])
ax3.legend()
ax3.set(xlabel='Principal component 1', ylabel='Principal component 2');

In [None]:
# The loadings represent the transformation from the old axes to the new ones.
# We can get them from the fitted pca object: each row of `components_` is one
# principal component and each entry in a row belongs to one original variable.
loadings = pca.components_
loadings

In [None]:
# Every row of the loadings matrix is one principal component; the first row is PC1:
pc1 = pca.components_[0]
print(pc1)

In [None]:
# The numbers printed for pc1 are the weights of the original variables in the
# first principal component. Write that out as a formula:
terms = [f'{weight:4.2f} * ({name})' for weight, name in zip(pc1, variables)]
text = ' + '.join(terms)
print(f'pc1 = {text}')

In [None]:
# Draw the first principal component as a line in a 3D plot of the data.
# Note that X is the scaled feature matrix fed to the PCA, so the plotted
# values are the scaled measurements, not the raw ones in cm.
fig4 = plt.figure(constrained_layout=True)
ax4 = fig4.add_subplot(111, projection='3d')

x = 'sepal length (cm)'
y = 'petal length (cm)'
z = 'petal width (cm)'
# Column positions of the three plotted variables in X (and in the loadings),
# looked up by name so that they cannot get out of step with the axis labels.
columns = [variables.index(name) for name in (x, y, z)]

for i, klass in enumerate((idx0, idx1, idx2)):
    ax4.scatter(
        X[klass, columns[0]],
        X[klass, columns[1]],
        X[klass, columns[2]],
        s=150,
        label=flower_names[i],
    )

# Direction of PC1 restricted to the three plotted variables. It is read from
# the fitted model instead of being typed in as rounded numbers, so it always
# matches the PCA that was actually run.
direction = pca.components_[0, columns]
origin = np.array([0, 0, 0])

# End points of a line through the origin, three units to either side.
vector1 = origin + 3*direction
vector2 = origin - 3*direction

ax4.plot(
    [vector2[0], vector1[0]],
    [vector2[1], vector1[1]],
    [vector2[2], vector1[2]],
    color='k', lw=4
)
ax4.set_xlabel(x, labelpad=10);
ax4.set_ylabel(y, labelpad=10);
ax4.set_zlabel(z, labelpad=10);
ax4.legend();

In [None]:
# The first two principal components, first as raw loadings and then written
# out as weighted sums of the original variables:
pc1, pc2 = pca.components_[:2]
print(f'PC1 = {pc1}')
print(f'PC2 = {pc2}')
text1, text2 = (
    ' + '.join(f'{weight:4.2f} * ({name})' for weight, name in zip(component, variables))
    for component in (pc1, pc2)
)
print(f'PC1 = {text1}')
print(f'PC2 = {text2}')

In [None]:
# Loadings plot: where each original variable sits in the (PC1, PC2) plane.
fig5, ax5 = plt.subplots(constrained_layout=True)
markers = ['o', 'X', 's', '>']
# One point per original variable, each with its own marker.
for name, marker, load1, load2 in zip(variables, markers, pc1, pc2):
    ax5.scatter(load1, load2, s=150, label=name, marker=marker, edgecolor='k', linewidth=1.25)
ax5.set(xlim=(-1, 1), ylim=(-1, 1))
# Dotted guide lines through the origin.
ax5.axhline(y=0, ls=':', color='k')
ax5.axvline(x=0, ls=':', color='k')
ax5.set_xlabel('PC1')
ax5.set_ylabel('PC2')
ax5.legend();

From this plot we can learn the following:

* We see that the petal length and the petal width are close together. This means that they are correlated. The practical implication of this is: *we only need one of them for describing the variation in the data (not both)*.

* Along PC2, the sepal width has a much larger value than the others. This means that PC2 is largely determined by the sepal width.

* Along PC1, the sepal length, the petal length, and the petal width have almost the same contribution (i.e. their PC1 loadings are nearly equal, so they line up when viewed along PC1). That means that they are all correlated here and we can probably explain a lot of the variance using just one of them.

Based on this we expect that we can explain a lot of the variance in the data using just two variables,
for instance, **sepal width** and **petal length**. Of course, this is something we could have figured out just by plotting all pairs of possible variables. Here there are only four variables and the number of such plots would then be $\binom{4}{2} = 6$. This is something we can manage. But if we have many variables, it will be tedious to do!

Let us test this conclusion by plotting the original data using the two original variables petal length and sepal width:

In [None]:
# Raw data in the plane of two original variables, one colour per species.
fig6, ax6 = plt.subplots(constrained_layout=True)

x, y = 'petal length (cm)', 'sepal width (cm)'

for i, klass in enumerate((class0, class1, class2)):
    ax6.scatter(klass[x].values, klass[y].values, s=100, label=flower_names[i])
ax6.legend()
ax6.set(xlabel=x, ylabel=y);

In [None]:
# The principal components are not equally important for describing the
# variance in the original data. We can check their importance by plotting the
# fraction of the variance explained by each principal component:
fig7, ax7 = plt.subplots(constrained_layout=True)
explained = pca.explained_variance_ratio_
# Number the components 1..n from the fitted model instead of hard-coding
# [1, 2, 3, 4], so the plot still works if the number of variables changes.
component_numbers = np.arange(1, len(explained) + 1)
ax7.plot(component_numbers, explained, marker='X')
# Components are counted in whole numbers, so put the ticks exactly on them.
ax7.set_xticks(component_numbers)
ax7.set(xlabel='Principal component', ylabel='Fraction of explained variance');
# Also store the figure as 'variance.pdf' (a relative path).
fig7.savefig('variance.pdf', bbox_inches='tight')