# PCA (Principal Component Analysis)

Dimensionality reduction algorithm

In [None]:
import pycompss.interactive as ipycompss

In [None]:
# Start the PyCOMPSs runtime with task-graph generation and tracing enabled
ipycompss.start(graph=True, trace=True)

In [None]:
from pycompss.api.task import task
from pycompss.api.parameter import *
import numpy as np

## Task definitions

In [None]:
@task(returns=np.float64)
def _meanVector(sample):
    """Task: average all the values of a single sample.

    :param sample: Values to average (anything ``np.mean`` accepts).
    :return: ``np.mean(sample)``, i.e. the mean over every element.
    """
    average = np.mean(sample)
    return average

@task(returns=list)
def normalize(data, mean):
    """Task: subtract ``mean`` from every element of ``data``.

    :param data: Iterable of values to centre.
    :param mean: Value subtracted from each element.
    :return: New list with ``element - mean`` for each element, in order.
    """
    return [value - mean for value in data]

@task(returns=np.float64)
def dotProduct(P, Q):
    """Task: accumulate the pairwise products of two sequences.

    Elements are paired positionally (pairing stops at the shorter
    sequence) and each pair contributes ``p.dot(q.T)`` to the total.

    :param P: First sequence; elements must provide ``dot``.
    :param Q: Second sequence; elements must provide ``T``.
    :return: Sum of the pairwise products, or ``0`` when there are no pairs.
    """
    # Start from the integer 0 so that empty input yields 0.
    return sum((p.dot(q.T) for p, q in zip(P, Q)), 0)

#@task(returns=list)
def eigenValues(scatter_matrix):
    """Pair each eigenvalue magnitude with its eigenvector.

    :param scatter_matrix: Square matrix handed to ``np.linalg.eig``.
    :return: List of ``(abs(eigenvalue), eigenvector)`` tuples in the order
        returned by ``np.linalg.eig``; eigenvector ``k`` is column ``k`` of
        the eigenvector matrix.
    """
    values, vectors = np.linalg.eig(scatter_matrix)
    return [(np.abs(value), vectors[:, idx]) for idx, value in enumerate(values)]

#@task(returns=np.ndarray)
def transform(data, eig, dim):
    """Project ``data`` onto the ``dim - 1`` strongest eigenvectors.

    :param data: Array with ``dim`` rows (one row per original dimension).
    :param eig: Sequence of ``(eigenvalue, eigenvector)`` pairs.
    :param dim: Length of each eigenvector / number of rows in ``data``.
    :return: Array with ``dim - 1`` rows holding the projected data.
    """
    # Largest eigenvalue first; sorted() keeps ties in their input order.
    ranked = sorted(eig, key=lambda pair: pair[0], reverse=True)

    # Each kept eigenvector becomes one column of the projection matrix.
    columns = []
    for rank in range(dim - 1):
        columns.append(ranked[rank][1].reshape(dim, 1))
    projection = np.hstack(columns)

    return projection.T.dot(data)

## Functions

In [None]:
def generateData(numV, dim, K):
    """Generate ``K`` normally distributed clusters of ``dim``-D points.

    Cluster ``k`` is drawn around the point ``[k] * dim`` with an identity
    covariance. The NumPy global random seed is reset to 8 on every call,
    so the output is reproducible.

    :param numV: Requested total number of points; each cluster receives
        ``int(numV / K)`` of them, so a remainder is dropped.
    :param dim: Number of dimensions of each point.
    :param K: Number of clusters.
    :return: Array of shape ``(dim, K * int(numV / K))`` with one point per
        column, clusters laid out one after another.
    """
    per_class = int(float(numV) / K)
    np.random.seed(8)
    identity = np.eye(dim)
    blocks = [
        np.random.multivariate_normal([label] * dim, identity, per_class).T
        for label in range(K)
    ]
    return np.concatenate(blocks, axis=1)

In [None]:
def meanVector(samples):
    """Call ``_meanVector`` once for every element of ``samples``.

    :param samples: Iterable of samples (iterating a 2-D array yields rows).
    :return: List with one ``_meanVector`` result per element, in order.
    """
    return [_meanVector(sample) for sample in samples]

In [None]:
def scatterMatrix_d(data, mean, dim):
    """Build a ``dim`` x ``dim`` scatter matrix out of tasks.

    First ``normalize`` is invoked for each of the first ``dim`` rows of
    ``data``; then ``dotProduct`` is invoked for every ordered pair of
    normalised rows, row by row.

    :param data: Indexable holding at least ``dim`` rows.
    :param mean: Passed unchanged to every ``normalize`` call.
        NOTE(review): the whole ``mean`` is used for each row rather than
        ``mean[i]`` -- confirm this is intended.
    :param dim: Number of rows/columns of the resulting matrix.
    :return: List of ``dim`` lists, entry ``[i][j]`` being the result of
        ``dotProduct`` on normalised rows ``i`` and ``j``.
    """
    centered = [normalize(data[row], mean) for row in range(dim)]
    return [
        [dotProduct(centered[row], centered[col]) for col in range(dim)]
        for row in range(dim)
    ]

In [None]:
def scatterMatrix(samples, mean_vector, dim):
    """Compute the scatter matrix sequentially with NumPy.

    Every column of ``samples`` is treated as one point; the outer product
    of the centred point with itself is accumulated over all points.

    :param samples: 2-D array with one point per column.
    :param mean_vector: Subtracted from each ``(dim, 1)`` column vector.
    :param dim: Number of rows of ``samples``.
    :return: ``(dim, dim)`` array with the accumulated outer products.
    """
    total = np.zeros((dim, dim))
    for column in range(len(samples[0])):
        centered = samples[:, column].reshape(dim, 1) - mean_vector
        total += centered.dot(centered.T)
    return total

In [None]:
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import Axes3D

class Arrow3D(FancyArrowPatch):
    """Arrow patch whose two end points are given in 3-D data coordinates.

    The 3-D end points are stored at construction time and projected to
    2-D each time the arrow is positioned for drawing.
    """

    def __init__(self, xs, ys, zs, *args, **kwargs):
        """Store the 3-D end points and create the underlying 2-D arrow.

        :param xs: Pair ``(start, end)`` of x coordinates.
        :param ys: Pair ``(start, end)`` of y coordinates.
        :param zs: Pair ``(start, end)`` of z coordinates.
        :param args: Extra positional arguments for ``FancyArrowPatch``.
        :param kwargs: Extra keyword arguments for ``FancyArrowPatch``.
        """
        # The 2-D positions are placeholders; they are replaced on projection.
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def _project(self, renderer=None):
        """Project the stored 3-D end points and update the 2-D positions.

        :param renderer: Renderer, if any. Its ``M`` attribute is used when
            present; otherwise the projection matrix of the owning axes is
            used.
        :return: Smallest projected z value of the two end points.
        """
        from mpl_toolkits.mplot3d import proj3d
        # Older Matplotlib releases attach the projection matrix to the
        # renderer; newer ones only keep it on the 3-D axes.
        matrix = getattr(renderer, 'M', None)
        if matrix is None:
            matrix = self.axes.M
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, matrix)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        return min(zs)

    def do_3d_projection(self, renderer=None):
        """Hook called by recent ``Axes3D`` versions to depth-sort patches.

        :param renderer: Optional renderer (some versions pass none).
        :return: Depth value used for ordering the artists.
        """
        return self._project(renderer)

    def draw(self, renderer):
        """Re-project the arrow and draw it as a regular 2-D arrow patch.

        :param renderer: Renderer used for drawing.
        """
        self._project(renderer)
        FancyArrowPatch.draw(self, renderer)

def show(data, transformData, mean, eig, classes):
    %matplotlib inline
    from matplotlib import pyplot as plt
    fig = plt.figure(figsize=(8, 8))

    ax = fig.add_subplot(111, projection='3d')

    numPoints = len(data[0]) / classes
    obj = ['o', 'x', '^']
    for c in list(range(classes)):
        s = int(c * numPoints)
        e = int(s + numPoints)
        ax.plot(data[0][s:e], data[1][s:e], data[2][s:e], obj[c])

    ax.plot([mean[0]], [mean[1]], [mean[2]], 'o', color='red')
    for n, w in eig:
        v = w.T
        a = Arrow3D([mean[0], v[0] + mean[0]], 
                    [mean[1], v[1] + mean[1]], 
                    [mean[2], v[2] + mean[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
        ax.add_artist(a)
    #plt.savefig('PCA3dim.png')
    plt.show()
    
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    for c in range(classes):
        s = int(c * numPoints)
        e = int(s + numPoints)
        ax.plot(transformData[0][s:e], transformData[1][s:e], obj[c])

    #plt.savefig('PCA2dim.png')
    plt.show()


## MAIN Code

Parameters (that can be configured in the following cell):
* numPoints: Number of points (default: 1000)
* dim: Number of dimensions (default: 3)
* classes: Number of classes (default: 3)

In [None]:
import time
from pycompss.api.api import compss_wait_on

# Problem size. `classes` is also the number of clusters that generateData draws.
numPoints = 1000
dim = 3
classes = 3

# Generate the data set and call _meanVector once per row of `data`.
st = time.time()
data = generateData(numPoints, dim, classes)
m = meanVector(data)
# NOTE(review): `m` holds the direct results of the _meanVector task calls and is
# never passed through compss_wait_on in this cell -- confirm the mean tasks are
# actually finished when this time is printed and when `m` is used for plotting.
print("Generation time (s): {}".format(time.time() - st))

# Build the scatter matrix out of tasks, then retrieve its entries.
scatter_matrix = scatterMatrix_d(data, m, dim)
scatter_matrix = compss_wait_on(scatter_matrix)
# `st` is not reset, so this figure also includes the generation time above.
print("Elapsed time (s): {}".format(time.time() - st))

# These two are plain function calls: their @task decorators are commented out.
eig = eigenValues(scatter_matrix)
transform_dim = transform(data, eig, dim)

In [None]:
# Plot the result: the original samples with the eigenvectors, then the projected samples
show(data, transform_dim, m, eig, classes)

In [None]:
# Stop the PyCOMPSs runtime that was started at the top of the notebook
ipycompss.stop()