In [None]:
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as stats
from scipy import linalg as LA

# Read the comma-separated table from disk into a numpy array
data = np.genfromtxt("assp.csv", delimiter=",")

# The columns are given arbitrary single-letter names
collumns = list("abcdef")
print(collumns)

num_variables = len(collumns)

# Sanity check: the first data row must hold one entry per column name
assert len(data[0]) == num_variables

## Examine data
* As usual, step 1 is to apply uni/bi-variate methods:

In [None]:
# Draw every other column against the first column on a single set of axes
first_column = data[:, 0]
for col in range(1, num_variables):
    plt.plot(first_column, data[:, col], ".", label=collumns[col])

plt.title("Each variable as a function of a")
plt.xlabel(collumns[0])
plt.legend(loc="lower left")
# Logarithmic y axis
plt.yscale("log")

In [None]:
# Two side-by-side panels, the left one twice as wide as the right one:
# columns 0-3 go on the left, the remaining columns on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, width_ratios=[2, 1])

for panel, first, last, ticks in (
    (ax1, 0, 4, [1, 2, 3, 4]),
    (ax2, 4, None, [1, 2]),
):
    panel.violinplot(data[:, first:last], showmeans=True)
    # Violins are drawn at positions 1, 2, ...; label them with the column names
    panel.set_xticks(ticks, collumns[first:last])

plt.suptitle("Violin plot, showing means")
plt.show()

In [None]:
def corner_plot(data, labels, title):
    """Draw a lower-triangle "corner plot" of the columns of ``data``.

    Panel (i, j) with j < i is a scatter plot of column j (x axis) against
    column i (y axis). Diagonal panel (i, i) shows a density-normalised
    histogram of column i with a Gaussian kernel density estimate drawn on
    top; its title gives the column label, the mean and the standard error
    of the mean. Panels above the diagonal are hidden.

    Parameters
    ----------
    data : 2-D array
        Indexed as ``data[:, k]``; the first ``len(labels)`` columns are drawn.
    labels : sequence of str
        One label per plotted column; its length sets the grid size.
    title : str
        Figure-level title.

    Returns
    -------
    fig : matplotlib Figure
    axs : 2-D array of Axes with shape ``(len(labels), len(labels))``
    """
    num_variables = len(labels)

    # squeeze=False keeps `axs` two-dimensional even for a single variable;
    # otherwise plt.subplots returns a bare Axes and axs[i, j] fails.
    fig, axs = plt.subplots(
        nrows=num_variables, ncols=num_variables, figsize=(7, 7), squeeze=False
    )
    last_row = num_variables - 1

    for i in range(num_variables):
        for j in range(num_variables):
            ax = axs[i, j]

            # Only plot the unique lower triangle (diagonal included)
            if j > i:
                ax.set_visible(False)
                continue

            if i == j:
                # On the diagonal x == y, so a scatter would just be a
                # straight line; show the distribution of the column instead.
                column = data[:, i]
                ax.hist(column, density=True, alpha=0.4)
                x = np.linspace(np.min(column), np.max(column), 100)
                # Evaluate the KDE once and reuse it for the line and the fill
                density = stats.gaussian_kde(column)(x)
                ax.plot(x, density, "b")
                ax.fill_between(x, density, alpha=0.6)
                ax.set_yticks([])

                # Title above the diagonal panel: label, mean and sem
                mean = np.mean(column)
                sem = stats.sem(column, ddof=1)
                ax.set_title(
                    f"{labels[j]}\n{mean:.1f}$\\pm${sem:.1f}", fontsize=11
                )
            else:
                ax.plot(data[:, j], data[:, i], ".")

            # x labels only on the last row; other rows lose their x ticks
            if i == last_row:
                ax.set_xlabel(labels[j], fontsize=12)
            else:
                ax.set_xticks([])

            # y labels only on the first column; other columns lose their y ticks
            if j == 0:
                ax.set_ylabel(labels[i], fontsize=12)
            else:
                ax.set_yticks([])

    fig.suptitle(title)
    fig.align_ylabels(axs[:, 0])
    return fig, axs


# Corner plot of the untransformed data, labelled with the column names
fig, axs = corner_plot(data=data, labels=collumns, title="Corner plot (raw data)")
plt.show()

In [None]:
# Print floats with 3 digits of precision (applies to all later array printing)
np.set_printoptions(precision=3)

# Mean of each column: average over the rows (axis 0)
column_means = data.mean(axis=0)
print(column_means)

## Principal Component analysis

### 1. Calculate covariance matrix

**Reminder**

Variance (sample variance):

$$
  \sigma^2(X) = \langle{(X-\bar X)^2}\rangle = \frac{1}{N-1}\sum_{i=1}^{N} (X_i-\bar X)^2
$$

**Co**-variance

$$
  {\rm cov}(X,Y) = \langle{(X-\bar X)(Y-\bar Y)}\rangle = \frac{1}{N-1}\sum_{i=1}^{N} (X_i-\bar X)(Y_i-\bar Y)
$$


$$
  {\rm cov}(X,X) = \sigma^2(X)
$$


**Normalised** covariance (also called the correlation coefficient)

$$
  {\rm ncov}(X,Y) = \frac{{\rm cov}(X,Y)}{\sqrt{\sigma^2(X)\,\sigma^2(Y)}}
$$

Note that the normalised covariance is related to the autocorrelation function:
$$
  {\rm ncov}\Big(X(t),\,X(t+\Delta t)\Big) = {\rm ACF}(\Delta t)
$$

In [None]:
# Covariance matrix of the columns.
# https://numpy.org/doc/stable/reference/generated/numpy.cov.html
# numpy's default (rowvar=True) treats each row as a variable; here the
# variables are the columns, hence rowvar=False.
cov = np.cov(data, rowvar=False)
print("Covariance matrix:", cov, sep="\n")

# Sample variance (ddof=1) of each column, printed next to the covariance
# matrix: cov(X, X) is the variance of X.
column_variances = data.var(axis=0, ddof=1)
print("\nVariances of each collumn (a,b,c..):", column_variances, sep="\n")

In [None]:
# Normalised covariance, i.e. the matrix of correlation coefficients:
#   R_ij := C_ij / sqrt(C_ii * C_jj)
# As for np.cov, rowvar=False marks the columns as the variables.
ncov = np.corrcoef(data, rowvar=False)

print("\nNormalised Covariance matrix:", ncov, sep="\n")

### 2. Find principal components: eigenvalue problem

In [None]:
# The normalised covariance matrix is symmetric, so the symmetric solver
# eigh can be used in place of the general eig.
evals, evecs = LA.eigh(ncov)

# Eigenvalues exactly as returned by the solver
# (scipy documents eigh as returning them in ascending order)
print(evals)

# We want the largest eigenvalue first. Use argsort rather than sort so the
# same permutation can be applied to the eigenvector columns.
idx = np.argsort(evals)[::-1]
print("order:", idx)

evals, evecs = evals[idx], evecs[:, idx]

print("\nEigenvalues:", evals, sep="\n")

# Each eigenvalue as a fraction of the eigenvalue sum gives the relative
# contribution of its component
contributions = evals / evals.sum()

print("\nComponent contributions:", contributions, sep="\n")
print("\nCumulative component contributions:", np.cumsum(contributions), sep="\n")

In [None]:
print("Eigenvectors:")
print(evecs)

# Column n of evecs holds the weights of principal component n, one weight
# per data column.
print("\nPrincipal components:")
for n in range(num_variables):
    terms = "".join(f" {evecs[i, n]:+.2f}*{name}" for i, name in enumerate(collumns))
    print(f"PCA({n}): ", terms)


# Transform the data to the PCA axes by matrix multiplication.
# evecs diagonalise the *normalised* covariance, which is the covariance of
# the columns after each is divided by its sample standard deviation. The
# data must be scaled the same way before projecting; with the raw columns
# the projections are neither uncorrelated nor ordered by variance.
pca_data = np.dot(data / np.std(data, axis=0, ddof=1), evecs)

In [None]:
# Scatter of the data projected onto the first two columns of pca_data
plt.plot(pca_data[:, 0], pca_data[:, 1], "x")
plt.gca().set(xlabel="PC1", ylabel="PC2", title="First two princ. components")
plt.show()

### Put it all together in one function

In [None]:
def pca(t_data, normalise=True, sub_mean=False):
    """Performs PCA. Returns transformed data, and contributions.

    Parameters
    ----------
    t_data : array_like, shape (n_samples, n_variables)
        One row per observation, one column per variable. Not modified.
    normalise : bool, default True
        If True, the normalised covariance (correlation) matrix is
        diagonalised and every column is divided by its sample standard
        deviation (ddof=1) before projection, so that the projected data are
        the principal components of that matrix. If False, the plain
        covariance matrix is used and the columns are left unscaled.
    sub_mean : bool, default False
        If True, subtract the mean of each column before projecting. This
        only shifts every component by a constant.

    Returns
    -------
    pca_data : ndarray, shape (n_samples, n_variables)
        Data projected onto the eigenvectors, ordered by decreasing eigenvalue.
    contributions : ndarray, shape (n_variables,)
        Eigenvalues divided by their sum, in the same (descending) order.
    """
    t_data = np.asarray(t_data, dtype=float)

    cov = (
        np.corrcoef(t_data, rowvar=False) if normalise else np.cov(t_data, rowvar=False)
    )

    # eigh: symmetric-matrix solver (cov is symmetric)
    evals, evecs = LA.eigh(cov)

    # Sort eigenvalues/eigenvectors, largest eigenvalue first
    idx = np.argsort(evals)[::-1]
    evals = evals[idx]
    evecs = evecs[:, idx]

    # Contributions to variance:
    contributions = evals / np.sum(evals)

    # Work on a new array so the caller's data is never modified in place
    projected_input = t_data - np.mean(t_data, axis=0) if sub_mean else t_data
    if normalise:
        # The correlation matrix is the covariance of the columns in units of
        # their standard deviation, so project the data in those same units.
        projected_input = projected_input / np.std(t_data, axis=0, ddof=1)

    # PCA transform:
    pca_data = np.dot(projected_input, evecs)

    return pca_data, contributions

In [None]:
pc_labels = ["p1", "p2", "p3", "p4", "p5", "p6"]

# Bind the result to `contributions` (the original spelling `contibutions`
# left the prints below showing a stale value from an earlier cell).
pca_data, contributions = pca(data, normalise=True, sub_mean=False)

print("\nComponent contributions:")
print(contributions)

print("\nCumulative component contributions:")
print(np.cumsum(contributions))

fig, [ax1, ax2] = plt.subplots(ncols=2)

# Left panel: every higher component against p1.
# Right panel: only p2-p4 against p1.
for ax, stop in ((ax1, num_variables), (ax2, min(4, num_variables))):
    for i in range(1, stop):
        ax.plot(pca_data[:, 0], pca_data[:, i], ".", label=pc_labels[i])
    ax.set_xlabel(pc_labels[0])
    ax.legend(loc="lower left")

plt.suptitle("Each variable as a function of p1")

plt.show()

In [None]:
# Corner plot of the PCA-transformed data, labelled p1..p6
fig, axs = corner_plot(
    data=pca_data, labels=pc_labels, title="Corner plot (PCA components)"
)
plt.show()

In [None]:
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler

# Cross-check with scikit-learn: rescale the columns with StandardScaler
# (centring switched off via with_mean=False), then run its PCA.
scaled_data = StandardScaler(with_mean=False).fit_transform(data)

sk_pca = decomposition.PCA()
sk_pca_data = sk_pca.fit_transform(scaled_data)

print("sklearn: Explained variance fraction:")
print(sk_pca.explained_variance_ratio_)

# Plot the scikit-learn components (not the hand-rolled pca_data, which an
# earlier cell has already shown).
for i in range(1, num_variables):
    plt.plot(sk_pca_data[:, 0], sk_pca_data[:, i], ".", label=pc_labels[i])
plt.xlabel(pc_labels[0])
plt.legend(loc="lower left")

plt.title("Each variable as a function of p1")

plt.show()

fig, axs = corner_plot(sk_pca_data, pc_labels, "Corner plot (PCA components)")
plt.show()