In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy as sp

import utilityfunctions as uf

## Review

The steps to calculate PCA are:
* (If appropriate) normalize the variables to be in the range 0-1
* Center the data
* Compute the covariance matrix
* Compute the eigenvectors and eigenvalues of the covariance matrix; the eigenvectors (the principal components) tell us the directions of variance, and each eigenvalue tells us the amount of variance along its eigenvector
* Get an ordering over the eigenvalues
* Sort the eigenvalues and eigenvectors accordingly
* Compute the proportional variance accounted for by each principal component: its eigenvalue divided by the sum of all the eigenvalues (how much bigger is it than the others?)
* Compute the cumulative sum of the proportional variance (tells us how many eigenvectors we need to explain a desired amount of variance)
* Examine the principal components. Select v' of them.
* Project the data into PCA space
* Reconstruct the data

## Today

Today, we are going to look at what happens when we preprocess the data in different ways before fitting a PCA.

So we will look again at the summary statistics for the data.

We will preprocess the data in four ways:
* center only
* minmax global
* minmax local
* zscore

In [None]:
# Things we need from Day 19

# Let's split off the labels
def split(data, ycol):
    """Separate one column of ``data`` (the labels) from the remaining columns.

    Parameters
    ----------
    data : 2-D numpy array
        Indexed as ``data[:, ycol]``, so columns are addressed on axis 1.
    ycol : int
        Index of the column to pull out. Negative indices count from the
        end, as in normal numpy indexing.

    Returns
    -------
    tuple ``(x, y)``
        ``x`` is ``data`` with column ``ycol`` removed; ``y`` is that column
        as a 1-D array.

    Raises
    ------
    IndexError
        If ``ycol`` is out of range for ``data``.
    """
    y = data[:, ycol]
    # np.delete handles negative indices correctly. Slicing around ycol by
    # hand does not: for ycol == -1, ``ycol + 1`` is 0 and the second slice
    # would re-add every column.
    x = np.delete(data, ycol, axis=1)
    return (x, y)

# center
def center(data):
    """Subtract each column's mean from that column.

    The mean is taken along axis 0, so after centering every column of the
    result averages to zero (up to floating-point error).
    """
    column_means = np.mean(data, axis=0)
    return data - column_means

# preprocess
def preprocess(data, minmax=False, local=False, zscore=False):
    """Prepare ``data`` for PCA using one of four preprocessing modes.

    Parameters
    ----------
    data : numpy array
        The data to preprocess; passed straight to ``center`` and the
        ``uf`` helpers.
    minmax : bool
        If true, scale with ``uf.minmaxGlobal`` (or ``uf.minmaxLocal`` when
        ``local`` is true) and then center the result.
    local : bool
        Only consulted when ``minmax`` is true; selects ``uf.minmaxLocal``
        instead of ``uf.minmaxGlobal``.
    zscore : bool
        If true, return ``uf.zScore(data)``.

    Returns
    -------
    The preprocessed data. With no flag set the data is centered only.
    If both ``minmax`` and ``zscore`` are requested, a message is printed
    and ``data`` is returned unchanged.
    """
    if minmax and zscore:
        # The two scalings are mutually exclusive; refuse rather than guess.
        print("Nope, won't do that!")
        return data

    if zscore:
        # NOTE(review): no explicit centering here -- presumably uf.zScore
        # already subtracts the column means; confirm against utilityfunctions.
        return uf.zScore(data)

    if minmax:
        scaled = uf.minmaxLocal(data) if local else uf.minmaxGlobal(data)
        return center(scaled)

    # No scaling requested: this is the "center only" mode. Previously this
    # path fell off the end of the function and returned None.
    return center(data)

In [None]:
# This is most of the code from Day 19 in one function; it fits a PCA and prints out all kinds of things along the way
def pca_with_plots(data):
    """Fit a PCA to ``data`` and plot/print diagnostics along the way.

    The covariance matrix is computed as ``data.T @ data / (n - 1)``. That
    expression is only the covariance if every column of ``data`` already
    has zero mean, so center (or otherwise preprocess) the data first.

    Side effects: prints the shapes of the covariance matrix, eigenvalues
    and eigenvectors, prints the cumulative proportional variance, and shows
    a covariance heatmap, a scree graph and an elbow plot.

    Parameters
    ----------
    data : 2-D numpy array
        Rows are observations (``data.shape[0]`` is used as n), columns are
        variables.

    Returns
    -------
    tuple ``(evals_sorted, evectors_sorted)``
        Eigenvalues in descending order, and the eigenvectors as columns in
        the matching order.
    """
    # covariance (assumes the columns of data are already centered)
    covariance_matrix = (data.T @ data) / (data.shape[0] - 1)
    print("covariance matrix")
    print(covariance_matrix.shape)

    # Let's look at the covariance matrix
    plt.figure(figsize=(12, 12))
    sns.heatmap(pd.DataFrame(covariance_matrix), annot=False, cmap='PuOr')
    plt.show()

    # eigendecomposition (not an SVD). A covariance matrix is symmetric, so
    # use eigh: it always returns real eigenvalues and orthonormal
    # eigenvectors, whereas the general-purpose eig can return complex
    # values with tiny imaginary parts due to round-off.
    (evals, evectors) = np.linalg.eigh(covariance_matrix)
    print("eigenvalues, eigenvectors")
    print(evals.shape)
    print(evectors.shape)

    # sort: largest eigenvalue first; column i of evectors goes with evals[i],
    # so the same ordering is applied to the columns
    evals_order = np.argsort(evals)[::-1]
    evals_sorted = evals[evals_order]
    evectors_sorted = evectors[:, evals_order]

    # proportional variance: each eigenvalue as a share of the total
    evals_sum = np.sum(evals_sorted)
    proportional_vars = evals_sorted / evals_sum

    # cumulative sum of proportional variance
    cumulative_sum = np.cumsum(proportional_vars)
    print("cum sum prop var")
    print(cumulative_sum)

    # Let's look at the proportional variance
    plt.figure(figsize=(6, 4))
    plt.bar(range(len(proportional_vars)), proportional_vars, alpha=0.5, align='center',
            label='Proportional variance')
    plt.ylabel('Proportional variance ratio')
    plt.xlabel('Ranked Principal Components')
    plt.title("Scree Graph")
    plt.legend(loc='best')
    plt.tight_layout()

    fig = plt.figure(figsize=(6, 4))
    ax1 = fig.add_subplot(111)
    ax1.plot(cumulative_sum)
    ax1.set_ylim([0, 1.0])
    ax1.set_xlabel('Number of Principal Components')
    ax1.set_ylabel('Cumulative explained variance')
    ax1.set_title('Elbow Plot')
    plt.show()

    return evals_sorted, evectors_sorted

In [None]:
# Still our digits data!
# Read the comma-separated file as integers, then hand the last column to
# split() so it comes back separately as y.
data = np.genfromtxt('data/optdigits/optdigits.tra', delimiter=',', dtype=int)
data, y = split(data, data.shape[1] - 1)

# What are we doing here??
def sums(data):
    """Return the sum of each row of ``data`` as a 1-D array.

    ``data`` is a 2-D numpy array; the result has one entry per row.
    The reduction is done by numpy along axis 1 rather than by a Python
    loop over rows, which also keeps the input dtype when there are no rows.
    """
    return np.sum(data, axis=1)

def nonzeros(data):
    """Return, for each row of ``data``, the fraction of its entries that are zero.

    Note that despite the function's name, the value is the proportion of
    *zero* entries: (row length - number of nonzero entries) / row length.

    ``data`` is a 2-D numpy array; the result is a 1-D float array with one
    entry per row.
    """
    n_columns = data.shape[1]
    # Count per row in one vectorised call instead of looping over rows.
    zero_counts = n_columns - np.count_nonzero(data, axis=1)
    return zero_counts / n_columns

# Append two extra columns to data: the per-row sum and the per-row fraction
# of zeros (both computed from data as it is before the columns are added).
data = np.column_stack((data, sums(data), nonzeros(data)))
print("data")
print(data.shape)
print(uf.getSummaryStatistics(data))

In [None]:
# Now center the data, print summary statistics, and then fit a PCA
# What do we observe?

In [None]:
# Now preprocess the data via minmax global and center, print summary statistics, and then fit a PCA
# What do we observe?

In [None]:
# Now preprocess the data via minmax local and center, print summary statistics, and then fit a PCA
# What do we observe?

In [None]:
# Now preprocess the data via zscoring, print summary statistics, and then fit a PCA
# What do we observe?

Some interesting ties between QR decomposition, SVD and PCA:
* https://python.quantecon.org/qr_decomp.html