# Week 6: Principal Component Analysis

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy import linalg as LA

# Read the comma-separated data file (rows = samples, columns = variables)
data = np.genfromtxt("assp.csv", delimiter=",")

# The columns have no meaningful names here, so label them arbitrarily
collumns = list("abcdef")
print(collumns)

num_variables = len(collumns)

# Sanity check: there must be exactly one label per data column
assert len(data[0]) == num_variables

## Examine data
* As usual, step 1 is to apply uni/bi-variate methods:

In [None]:
# Scatter every remaining column against the first column
for i in range(1, num_variables):
    plt.plot(data[:, 0], data[:, i], ".", label=collumns[i])

plt.title("Each variable as a function of a")
plt.xlabel(collumns[0])
plt.yscale("log")  # the y axis is shown on a logarithmic scale
plt.legend(loc="lower left")

In [None]:
# Two side-by-side panels; the left one is twice as wide (4 violins vs 2)
fig, (ax1, ax2) = plt.subplots(ncols=2, width_ratios=[2, 1])

# Left panel: the first four columns
ax1.violinplot(data[:, :4], showmeans=True)
ax1.set_xticks(list(range(1, 5)), collumns[:4])

# Right panel: the remaining columns
ax2.violinplot(data[:, 4:], showmeans=True)
ax2.set_xticks(list(range(1, 3)), collumns[4:])

fig.suptitle("Violin plot, showing means")
plt.show()

In [None]:
# Prints a corner plot, with histograms along edges:
def corner_plot(data, labels, title):
    """Draw a corner plot: pairwise scatter plots with distributions on the diagonal.

    Only the lower triangle of the grid is drawn; the panels above the
    diagonal are hidden. Off-diagonal panel (i, j) shows column ``i`` against
    column ``j``. Each diagonal panel shows a normalised histogram of the
    column with a Gaussian kernel-density estimate on top, and is titled with
    the column's label, mean and standard error of the mean.

    Parameters
    ----------
    data : 2-D array
        Indexed as ``data[:, k]``; column ``k`` is plotted under ``labels[k]``.
    labels : sequence of str
        One label per plotted column; its length sets the grid size.
    title : str
        Super-title of the figure.

    Returns
    -------
    fig, axs
        The figure and the 2-D array of axes (2-D even for a single variable).
    """
    num_variables = len(labels)

    # squeeze=False keeps ``axs`` two-dimensional for a 1x1 grid, so the
    # ``axs[i, j]`` / ``axs[:, 0]`` indexing below works for a single variable.
    fig, axs = plt.subplots(
        nrows=num_variables, ncols=num_variables, figsize=(7, 7), squeeze=False
    )

    for i in range(num_variables):
        column = data[:, i]
        for j in range(num_variables):
            ax = axs[i, j]

            # Only plot the unique lower triangle
            if j > i:
                ax.set_visible(False)
                continue

            if i == j:
                # x == y would just give a straight line, so show the
                # distribution of the column instead: histogram + KDE
                ax.hist(column, density=True, alpha=0.4)
                x = np.linspace(np.min(column), np.max(column), 100)
                density = stats.gaussian_kde(column)(x)  # evaluate the KDE once
                ax.plot(x, density, "b")
                ax.fill_between(x, density, alpha=0.6)
                ax.set_yticks([])

                # Title (mean and sem) above the diagonal elements
                mean = np.mean(column)
                sem = stats.sem(column, ddof=1)
                ax.set_title(f"{labels[j]}\n{mean:.1f}$\\pm${sem:.1f}", fontsize=11)
            else:
                # Scatter plot of column i against column j
                ax.plot(data[:, j], column, ".")

            # x labels only on the last row
            if i == num_variables - 1:
                ax.set_xlabel(labels[j], fontsize=12)
            else:
                ax.set_xticks([])

            # y labels only on the first column
            if j == 0:
                ax.set_ylabel(labels[i], fontsize=12)
            else:
                ax.set_yticks([])

    fig.suptitle(title)
    fig.align_ylabels(axs[:, 0])
    return fig, axs


# Corner plot of the raw data, using the column labels defined earlier
fig, axs = corner_plot(data, collumns, "Corner plot (raw data)")
plt.show()

In [None]:
# Limit the floating-point precision used when printing arrays
np.set_printoptions(precision=3)

# Mean of every column (axis 0 runs over the rows)
column_means = data.mean(axis=0)
print(column_means)

## Principal Component analysis

### 1. Calculate covariance matrix

**Reminder**

Variance (sample variance):

$$
  \sigma^2(X) = \langle{(X-\bar X)^2}\rangle = \frac{1}{N-1}\sum_i (X_i-\bar X)^2
$$

**Co**-variance

$$
  {\rm cov}(X,Y) = \langle{(X-\bar X)(Y-\bar Y)}\rangle = \frac{1}{N-1}\sum_i (X_i-\bar X)(Y_i-\bar Y)
$$


$$
  {\rm cov}(X,X) = \sigma^2(X)
$$


**Normalised** covariance (also called the correlation coefficient)

$$
  {\rm ncov}(X,Y) = \frac{{\rm cov}(X,Y)}{\sqrt{\sigma^2(X)\,\sigma^2(Y)}}
$$

Note that the normalised covariance is related to the autocorrelation function:
$$
  {\rm ncov}\Big(X(t),\,X(t+\Delta t)\Big) = {\rm ACF}(\Delta t)
$$

In [None]:
# Covariance matrix of the columns.
# np.cov treats each *row* as a variable by default (rowvar=True), and our
# variables are stored in columns, so pass rowvar=False.
# https://numpy.org/doc/stable/reference/generated/numpy.cov.html
cov = np.cov(data, rowvar=False)
print("Covariance matrix:", cov, sep="\n")

# Sample variance (ddof=1) of each column, printed for comparison
column_variances = data.var(axis=0, ddof=1)
print("\nVariances of each collumn (a,b,c..):", column_variances, sep="\n")

In [None]:
# "normalised" covariance (or correlation coeficient)
# corrcoef = R_ij := C_ij / Sqrt( C_ii*C_jj )
# Variables are stored in columns, hence rowvar=False
ncov = np.corrcoef(data, rowvar=False)
print("\nNormalised Covariance matrix:", ncov, sep="\n")

### 2. Find principal components: eigenvalue problem

In [None]:
# The (normalised) covariance matrix is symmetric, so the symmetric
# solver eigh can be used instead of the general eig
evals, evecs = LA.eigh(ncov)

# Eigenvalues in the order returned by the solver:
print(evals)

# Order the components from largest to smallest eigenvalue.
# argsort gives the permutation itself, so exactly the same reordering
# can be applied to the eigenvector columns.
idx = np.argsort(evals)[::-1]
print("order:", idx)

evals, evecs = evals[idx], evecs[:, idx]

print("\nEigenvalues:", evals, sep="\n")

# Relative contribution of each component = eigenvalue / sum of eigenvalues
contributions = evals / evals.sum()

print("\nComponent contributions:", contributions, sep="\n")
print("\nCumulative component contributions:", contributions.cumsum(), sep="\n")

In [None]:
print("Eigenvectors:", evecs, sep="\n")

# Column n of evecs holds the weights of the n-th principal component;
# write each one out as a linear combination of the original variables
print("\nPrincipal components:")
for n in range(num_variables):
    terms = "".join(
        f" {weight:+.2f}*{name}" for weight, name in zip(evecs[:, n], collumns)
    )
    print(f"PCA({n}): ", terms)


# Transform the data to the PCA axes (matrix product with the eigenvectors)
pca_data = data @ evecs

In [None]:
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.tight_layout(pad=2)  # leave some space between the panels

# Left panel: second component against the first
ax1.plot(pca_data[:, 0], pca_data[:, 1], "x")
ax1.set(xlabel="PC1", ylabel="PC2", title="First two princ. components")

# Right panel: fourth component against the third
ax2.plot(pca_data[:, 2], pca_data[:, 3], "x")
ax2.set(xlabel="PC3", ylabel="PC4", title="Next two princ. components")

plt.show()

------
### Put it all together in one function

In [None]:
def pca(t_data, normalise=True, sub_mean=False):
    """Perform a principal component analysis of ``t_data``.

    The principal components are the eigenvectors of the covariance matrix
    of the columns of ``t_data``, sorted by decreasing eigenvalue.

    Parameters
    ----------
    t_data : array-like
        2-D data with one variable per column (``rowvar=False``).
    normalise : bool, default True
        If True, use the normalised covariance (``np.corrcoef``);
        otherwise use the plain covariance (``np.cov``).
    sub_mean : bool, default False
        If True, subtract the mean of each column before projecting the
        data. This only affects the returned transformed data; the
        eigen-decomposition itself does not depend on it.

    Returns
    -------
    pca_data : ndarray
        The (optionally mean-subtracted) data projected onto the eigenvectors.
    contributions : ndarray
        Each eigenvalue as a fraction of the sum of all eigenvalues.
    evecs : ndarray
        Eigenvectors; column ``n`` is the n-th principal component.
    eval_sum : float
        Sum of the eigenvalues.
    """
    import numpy as np
    from scipy import linalg as la

    # Accept any array-like input: a plain list of rows would otherwise
    # fail at ``t_data - mean`` when sub_mean is False.
    t_data = np.asarray(t_data)

    # Calculate the (normalised) covariance matrix
    cov = (
        np.corrcoef(t_data, rowvar=False) if normalise else np.cov(t_data, rowvar=False)
    )

    # Eigenvalues + eigenvectors (eigh: the matrix is symmetric)
    evals, evecs = la.eigh(cov)

    # Sort eigenvalues/vectors from largest to smallest eigenvalue
    idx = np.argsort(evals)[::-1]
    evals = evals[idx]
    evecs = evecs[:, idx]

    # Contributions to the variance (eigenvalues as a fraction of the total)
    eval_sum = np.sum(evals)
    contributions = evals / eval_sum

    # Optionally subtract the column means before projecting
    mean = np.mean(t_data, axis=0) if sub_mean else 0

    # PCA transform: project the data onto the eigenvectors
    pca_data = np.dot(t_data - mean, evecs)

    return pca_data, contributions, evecs, eval_sum

In [None]:
# Labels for the principal components
pc_labels = ["p1", "p2", "p3", "p4", "p5", "p6"]

# Bind the second return value to `contributions`, the name printed below
# (a misspelt name here would leave a stale value from an earlier cell)
pca_data, contributions, _, _ = pca(data, normalise=True, sub_mean=False)

print("\nComponent contributions:")
print(contributions)

print("\nCumulative component contributions:")
print(np.cumsum(contributions))

fig, [ax1, ax2] = plt.subplots(ncols=2)

# Left panel: every other component against the first
for i in range(1, num_variables):
    ax1.plot(pca_data[:, 0], pca_data[:, i], ".", label=pc_labels[i])
ax1.set_xlabel(pc_labels[0])
ax1.legend(loc="lower left")

# Right panel: only components 2-4 against the first
for i in range(1, 4):
    ax2.plot(pca_data[:, 0], pca_data[:, i], ".", label=pc_labels[i])
ax2.set_xlabel(pc_labels[0])
ax2.legend(loc="lower left")

plt.suptitle("Each variable as a function of p1")

plt.show()

In [None]:
# Corner plot of the PCA-transformed data, one row/column per component label
fig, axs = corner_plot(pca_data, pc_labels, "Corner plot (PCA components)")
plt.show()

### We can also use sklearn package (scikit-learn)

* sk_PCA = sklearn.decomposition.PCA() 
* sk_PCA.fit_transform(data)
* a) it doesn't normalise the covariance by default
    * we can achieve the same result by scaling the data first ourselves
    * scaled_data = StandardScaler().fit_transform(data)
    * (or just manually divide each column by its standard deviation)
* b) it subtracts the mean by default

In [None]:
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler


# Scale the columns without centring them (with_mean=False).
# (Use `scaled_data = data` instead to skip the scaling step.)
scaled_data = StandardScaler(with_mean=False).fit_transform(data)

sk_pca = decomposition.PCA()
sk_pca_data = sk_pca.fit_transform(scaled_data)
pc_labels = ["p1", "p2", "p3", "p4", "p5", "p6"]

print(
    "sklearn: Explained variance fraction:",
    sk_pca.explained_variance_ratio_,
    sep="\n",
)

# Every other component against the first
for i in range(1, num_variables):
    plt.plot(sk_pca_data[:, 0], sk_pca_data[:, i], ".", label=pc_labels[i])

plt.title("Each variable as a function of p1")
plt.xlabel(pc_labels[0])
plt.legend(loc="lower left")
plt.show()

fig, axs = corner_plot(sk_pca_data, pc_labels, "Corner plot (PCA components)")
plt.show()

--------------
### Application to simplest case: 2D data

In [None]:
# Load the two-column (x, y) data set and show it as a scatter plot
data_xy = np.genfromtxt("data_xy.csv", delimiter=",")

plt.plot(data_xy[:, 0], data_xy[:, 1], "x")
plt.gca().set(xlabel="x", ylabel="y")
# Optional: plt.gca().set_aspect("equal") for equal axis scaling
plt.show()

In [None]:
# PCA of the 2D data using the plain (un-normalised) covariance
pc_data_xy, p_contribs, p_components, evalsum = pca(
    data_xy, normalise=False, sub_mean=False
)
print(p_contribs)

# Axis labels spell out each component as a combination of x and y
# (column k of p_components holds the weights of component k + 1)
axis_labels = [
    f"pc{k + 1} = {p_components[0, k]:.2f}x + {p_components[1, k]:.2f}y"
    for k in (0, 1)
]

plt.plot(pc_data_xy[:, 0], pc_data_xy[:, 1], "x")
plt.xlabel(axis_labels[0])
plt.ylabel(axis_labels[1])
plt.show()

In [None]:
plt.plot(data_xy[:, 0], data_xy[:, 1], "x")

# Each eigenvector is drawn as a line segment from the mean of the data;
# its length is sqrt(eigenvalue), recovered as sqrt(evalsum * contribution)
means = data_xy.mean(axis=0)
v1 = np.vstack(
    [means, means + np.sqrt(evalsum * p_contribs[0]) * p_components[:, 0]]
)
v2 = np.vstack(
    [means, means + np.sqrt(evalsum * p_contribs[1]) * p_components[:, 1]]
)

plt.plot(means[0], means[1], "ro", label="mean", markersize=6)
plt.plot(v1[:, 0], v1[:, 1], "k-", label="PC 1", linewidth=3)
plt.plot(v2[:, 0], v2[:, 1], "k--", label="PC 2", linewidth=3)

# Optional: plt.gca().set_aspect('equal') for equal axis scaling

plt.gca().set(
    xlabel="x", ylabel="y", title="Showing eigenvectors (scaled by sqrt[eval])"
)
plt.legend()
plt.show()

### A highly-redundant 4D data set:
* Demonstrates dimensionality reduction

In [None]:
# Load the four-column data set; the columns are labelled w, x, y, z
data_wxyz = np.genfromtxt("data_wxyz.csv", delimiter=",")
labels = ["w", "x", "y", "z"]

# 3D stem plot of columns w, x and z (y is left out), using every 5th row
ax = plt.axes(projection="3d")
ax.set_box_aspect(aspect=None, zoom=0.83)
ax.stem(data_wxyz[::5, 0], data_wxyz[::5, 1], data_wxyz[::5, 3], basefmt=" ")
ax.set_xlabel("w")
ax.set_ylabel("x")
ax.set_zlabel("z")
ax.set_title("W,X,Z (no y), every 5th")
plt.show()

In [None]:
# Corner plot of the raw four-column data
fig, axs = corner_plot(data_wxyz, labels, "WXYZ Data")
plt.show()

In [None]:
# PCA of the 4D data using the plain (un-normalised) covariance
pc_data_wxyz, p_contribs, p_components, _ = pca(data_wxyz, normalise=False)

print(p_contribs)

# Flip the sign of the transformed data and of the components together,
# so the two stay consistent with each other
pc_data_wxyz *= -1
p_components *= -1

plt.plot(pc_data_wxyz[:, 0], pc_data_wxyz[:, 1], "x")
# Axis labels: each component written as a combination of w, x, y, z
plt.xlabel(
    " + ".join(f"{p_components[k, 0]:.2f}{name}" for k, name in enumerate("wxyz"))
)
plt.ylabel(
    " + ".join(f"{p_components[k, 1]:.2f}{name}" for k, name in enumerate("wxyz"))
)
plt.show()

fig, axs = corner_plot(pc_data_wxyz, ["p1", "p2", "p3", "p4"], "4D - PCA")
plt.show()

### Inverse transform: dimensionality reduction

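* Inverse transform: simply invert the eigenvector matrix (it is orthogonal, so its inverse is just its transpose)
* Dimensionality reduction: keep only the first few principal components (those with the largest eigenvalues) and discard the rest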
* Inverse is slightly more tricky to code ourselves
* Simple with library

In [None]:
# Number of principal components to keep
n_keep = 2
sk_pca = decomposition.PCA(n_components=n_keep)

# Fit and transform: the result has only n_keep columns
sk_pca_data = sk_pca.fit_transform(data_wxyz)

print(
    sk_pca.explained_variance_ratio_,
    sk_pca.n_components_,
    sk_pca.components_,
    sep="\n",
)

fig, axs = corner_plot(sk_pca_data, pc_labels[:n_keep], "PCA Data")


# Map the reduced data back to the original four variables
data2 = sk_pca.inverse_transform(sk_pca_data)

fig, axs = corner_plot(data2, labels, "WXYZ Data, after D reduction")
plt.show()