In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import utilityfunctions as uf

# PCA Review; PCA and Regression

Acknowledgments:
* https://charlesreid1.github.io/circe/Digit%20Classification%20-%20PCA.html

Data:
* https://archive.ics.uci.edu/ml/datasets/optical+recognition+of+handwritten+digits

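__Take a look at the data__.

*Note*: the description below refers to the handwritten-digits data linked above, while the code cells in this notebook load `data/spotify_dataset.csv` (16 numeric columns). The PCA workflow is the same either way.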

*The independent variables*: each data point (observation) is an 8-pixel by 8-pixel grayscale image. We flatten this so that each data point is represented by a one-dimensional array of 64 floating-point values.

*The dependent variable*: the label is the number in the image, 0 ... 9.

*Reason for using PCA*: if we can project from 64 dimensions down to some number less than 10, we can fit a regression (or other!) model more efficiently.

### Load, separate, and examine the training data

In [None]:
# Let's load the training data
# Labels for the 16 columns selected by `usecols` below
# (assumed to be listed in the same order as in the CSV -- verify against its header).
columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature', 'chorus_hit', 'sections', 'popularity',
]

# Read CSV columns 2..17 as floats, skipping the single header row.
data = np.genfromtxt(
    'data/spotify_dataset.csv',
    delimiter=',',
    skip_header=1,
    usecols=tuple(range(2, 18)),
    dtype=float,
    encoding='utf-8',
)

print(uf.getShapeType(data))
# The first 80% of the rows (in file order) form the training set, the rest the test set.
train, test = np.split(data, [int(0.8 * len(data))])
print(uf.getShapeType(train))
print(uf.getShapeType(test))


In [None]:
# Let's split off the labels
def split(data, ycol):
    """Separate one column of a 2-D array from the remaining columns.

    Parameters
    ----------
    data : array-like, 2-D
        Rows are observations, columns are variables.
    ycol : int
        Index of the column to pull out. Negative indices count from the
        last column, as in ordinary NumPy indexing.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is ``data`` with column ``ycol`` removed and
        ``y`` is that column as a 1-D array.
    """
    data = np.asarray(data)
    y = data[:, ycol]
    # np.delete normalizes negative indices. The earlier slice arithmetic
    # (0:ycol stacked with ycol+1:) broke for ycol == -1, because ycol+1 == 0
    # made the second slice cover every column, so y stayed inside x.
    x = np.delete(data, ycol, axis=1)
    return (x, y)

In [None]:
# Column 0 of the training rows becomes y; the remaining columns become x.
(x, y) = split(train, 0)
print(uf.getShapeType(y))
print(uf.getShapeType(x))

### Normalize and center the data

In [None]:
# normalize and center
def center(data):
    """Subtract each column's mean from that column and return the result."""
    return data - data.mean(axis=0)


def preprocess(data, minmax=False, local=False, zscore=False):
    """Normalize and center ``data`` before PCA.

    Parameters
    ----------
    data : ndarray
        Observations in rows, variables in columns.
    minmax : bool
        Apply min-max scaling (via ``uf.minmaxGlobal`` or ``uf.minmaxLocal``)
        and then center the columns.
    local : bool
        Only consulted when ``minmax`` is set: choose ``uf.minmaxLocal``
        instead of ``uf.minmaxGlobal``.
    zscore : bool
        Return ``uf.zScore(data)``.

    Returns
    -------
    ndarray
        The preprocessed data. If both ``minmax`` and ``zscore`` are requested
        the input is returned untouched after printing a message. If neither
        is requested the data is only centered.
    """
    if minmax and zscore:
        # Conflicting requests: refuse and hand the input back unchanged.
        print("Nope, won't do that!")
        return data
    if zscore:
        return uf.zScore(data)
    if minmax:
        scaled = uf.minmaxLocal(data) if local else uf.minmaxGlobal(data)
        return center(scaled)
    # No scaling requested. This path used to fall off the end and return
    # None, which only surfaced later as a confusing error; centering alone
    # keeps the result usable by code that expects zero-mean columns.
    return center(data)

In [None]:
# z-score the training features, then print their shape/type and summary statistics.
centered = preprocess(x, zscore=True)
print(uf.getShapeType(centered))
print(uf.getSummaryStatistics(centered))

### Fit the PCA

In [None]:
# From day 19

# This is most of the code from Day 19 in one function; it fits a PCA and prints out all kinds of things along the way
def pca_with_plots(data, columns):
    """Fit a PCA by eigendecomposition of the covariance matrix, plotting along the way.

    Prints the covariance matrix shape and shows three figures: a heatmap of
    the covariance matrix, a scree graph of each component's share of the
    variance, and an elbow plot of the cumulative share.

    Parameters
    ----------
    data : ndarray, shape (n_samples, n_features)
        Data whose columns are already centered; the covariance is computed as
        ``data.T @ data / (n_samples - 1)`` without subtracting the means.
    columns : sequence of str
        Tick labels for the covariance heatmap; should hold one label per
        column of ``data``.

    Returns
    -------
    evals_sorted : ndarray
        Eigenvalues of the covariance matrix, largest first.
    evectors_sorted : ndarray
        The matching eigenvectors as columns, in the same order.
    """
    # covariance
    covariance_matrix = (data.T @ data) / (data.shape[0] - 1)
    print("covariance matrix")
    print(covariance_matrix.shape)

    # Let's look at the covariance matrix
    plt.figure(figsize=(12, 12))
    sns.heatmap(pd.DataFrame(covariance_matrix), xticklabels=columns, yticklabels=columns, annot=False, cmap='PuOr')
    plt.show()

    # Eigendecomposition. The covariance matrix is symmetric, so use eigh:
    # unlike the general-purpose eig it always returns real eigenvalues and
    # orthonormal eigenvectors, even with floating-point round-off.
    (evals, evectors) = np.linalg.eigh(covariance_matrix)

    # sort from largest to smallest eigenvalue
    evals_order = np.argsort(evals)[::-1]
    evals_sorted = evals[evals_order]
    evectors_sorted = evectors[:, evals_order]

    # proportional variance: each eigenvalue's share of the total
    proportional_vars = evals_sorted / np.sum(evals_sorted)

    # cumulative sum of proportional variance
    cumulative_sum = np.cumsum(proportional_vars)

    # Let's look at the proportional variance
    plt.figure(figsize=(6, 4))
    plt.bar(range(len(proportional_vars)), proportional_vars, alpha=0.5, align='center',
            label='Proportional variance')
    plt.ylabel('Proportional variance ratio')
    plt.xlabel('Ranked Principal Components')
    plt.title("Scree Graph")
    plt.legend(loc='best')
    plt.tight_layout()

    fig = plt.figure(figsize=(6, 4))
    ax1 = fig.add_subplot(111)
    ax1.plot(cumulative_sum)
    ax1.set_ylim([0, 1.0])
    ax1.set_xlabel('Number of Principal Components')
    ax1.set_ylabel('Cumulative explained variance')
    ax1.set_title('Elbow Plot')
    # Shows both the scree graph and the elbow plot.
    plt.show()

    return evals_sorted, evectors_sorted

In [None]:
# `centered` comes from x, which no longer contains column 0 (it was split off as y),
# so drop that column's name too; passing all 16 names would shift every heatmap
# tick label by one feature.
eigenvals, eigenvecs = pca_with_plots(centered, columns[1:])

### Projecting the data

In [None]:
# Let's project the data into this PCA space
def project(data, eigenvectors, numberOfComponents):
    """Map ``data`` into PCA space.

    Multiplies ``data`` by the first ``numberOfComponents`` columns of
    ``eigenvectors`` and returns the product, which has one column per
    retained component.
    """
    basis = eigenvectors[:, :numberOfComponents]
    return np.matmul(data, basis)

In [None]:
# Looking at this elbow plot, how many principal components do you think we want to keep?
# NOTE: x has 15 columns (16 loaded, minus the one split off as y), so keep = 15
# retains every component and does not reduce the dimensionality.
keep = 15
projected = project(centered, eigenvecs, keep)
print(uf.getShapeType(projected))

### What can we do with data in PCA space? 

* Look at it!
* Fit models!

It's kind of weird to use linear regression for this dataset. Linear regression is meant for data where the labels are quantitative ordinal. How do we know this?

Although the integers 0..9 are ordered, here they are just labels. But we don't (yet!) know any other type of model to fit. What to do? 

Let's make up some y's - let's set y to the sum of the values in each row. 

But, you say, that's just a regular linear function. 

*Indeed, say I*. But if I gave you that data and didn't tell you anything, you'd have to fit a model to find that out.

In [None]:
def makeRegressionY(data):
    """Build a synthetic regression target: the sum of the values in each row.

    Parameters
    ----------
    data : array-like
        Observations along the first axis.

    Returns
    -------
    ndarray
        One value per entry along the first axis, holding the sum of
        everything in that entry.
    """
    arr = np.asarray(data)
    # Reduce over every axis except the first in one vectorized call instead
    # of looping over the rows in Python.
    return np.sum(arr, axis=tuple(range(1, arr.ndim)))
# The regression target is built from the raw (un-normalized) training features x.
yp = makeRegressionY(x)
print(yp.shape)

In [None]:
# Remember y? Let's fit a linear regression model to the projected data

def fit(projected, independent, y):
    """Fit a least-squares model of ``y`` on selected columns of ``projected``.

    ``y`` is appended to ``projected`` as an extra last column, the combined
    array's shape/type is printed, and the array is passed to ``uf.fitlstsq``
    together with ``independent`` and the index of the appended column.
    Returns whatever ``uf.fitlstsq`` returns (presumably the fitted
    coefficients -- confirm against utilityfunctions).
    """
    y_column = np.transpose(np.array([y]))
    design = np.concatenate((projected, y_column), axis=1)
    print(uf.getShapeType(design))
    y_index = design.shape[1] - 1
    return uf.fitlstsq(design, independent, y_index)

In [None]:
%%time
# Regress yp on all `keep` columns of the projected training data.
c = fit(projected, list(range(keep)), yp)
print(c)

In [None]:
# Remember, we have test data too!

In [None]:
# Separate the held-out test rows the same way as the training rows (column 0 is y)
(xt, yt) = split(test, 0)

# Make sure we preprocess the same way!
# NOTE(review): this z-scores the test rows using only the test rows; whether
# uf.zScore can reuse the training statistics is not visible here -- confirm
# that this is intended.
centeredt = preprocess(xt, zscore=True)
print(uf.getShapeType(centeredt))

# Why are we not calculating the PCA again?
# (eigenvecs was fitted on the training data and is reused here unchanged.)
projectedt = project(centeredt, eigenvecs, keep)
print(uf.getShapeType(projectedt))

In [None]:
%%time
# Predict on the projected test data with the fitted result c, using the same
# column indices that were passed to fit.
yhat = uf.predict(projectedt, list(range(keep)), c)

In [None]:
# Build the test target the same way as yp (row sums of the raw test features)
# and score the PCA-based predictions against it.
ypt = makeRegressionY(xt)
print(uf.rsquared(ypt, yhat))

### Comparing PCA to not PCA

In [None]:
# Let's compare with a regression fit on the original (un-projected) data
# We will, but don't have to, normalize and center first
# (this recomputes the same z-scored training features used for the PCA above)
centered = preprocess(x, zscore=True)
print(uf.getShapeType(centered))

In [None]:
%%time
c1 = fit(centered, [x for x in range(centered.shape[1]-1)], yp)

In [None]:
centeredt = preprocess(xt, zscore=True)
print(uf.getShapeType(centeredt))

In [None]:
%%time
yhat1 = uf.predict(centeredt, [x for x in range(centeredt.shape[1]-1)], c1)

In [None]:
# Score the predictions of the model fitted on the un-projected data.
print(uf.rsquared(ypt, yhat1))

## Reconstructing the original matrix

In [None]:
# from day 18
def projectBack(data, eigenvectors, numberOfComponents):
    """Map data from PCA space back toward the original feature space.

    Multiplies ``data`` by the transpose of the first ``numberOfComponents``
    columns of ``eigenvectors`` and returns the product, which has one column
    per row of ``eigenvectors``.
    """
    basis = eigenvectors[:, :numberOfComponents]
    return np.matmul(data, basis.T)

In [None]:
# Map the projected training data back through the same `keep` eigenvectors;
# the result has one column per original feature and can be compared with `centered`.
reconstructed = projectBack(projected, eigenvecs, keep)

print(uf.getShapeType(reconstructed))
print(uf.getSummaryStatistics(reconstructed))

## Resources

* https://www.displayr.com/8-tips-for-interpreting-r-squared/