In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl

from sklearn import preprocessing
from sklearn.decomposition import PCA

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Util

In [2]:
def read_csv(file_name_prefix, partition=''):
  """Load ``<file_name_prefix><partition>.csv`` as a NumPy array.

  The file is parsed without a header row, so its first line is treated as data.
  ``partition`` is interpolated into the file name as-is (empty by default).
  """
  csv_path = '{}{}.csv'.format(file_name_prefix, partition)
  frame = pd.read_csv(csv_path, header=None)
  return frame.values

def read_train_data(num_partitions):
  """Load the partitioned training set.

  Reads ``trainData<k>.csv`` and ``trainLabels<k>.csv`` for k = 1..num_partitions
  (all data files first, then all label files) and returns two lists,
  ``(train_data, train_labels)``, each holding one array per partition.
  """
  partition_ids = range(1, num_partitions + 1)
  train_data = [read_csv('trainData', k) for k in partition_ids]
  train_labels = [read_csv('trainLabels', k) for k in partition_ids]
  return train_data, train_labels

def read_test_data():
  """Load the test set from ``testData.csv`` and ``testLabels.csv``.

  Returns a ``(test_data, test_labels)`` tuple of arrays as produced by ``read_csv``.
  """
  return read_csv('testData'), read_csv('testLabels')

def norm(v):
    """Scale every row of the 2-D array ``v`` to unit Euclidean length.

    Rows whose length is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaNs. ``v`` itself is not modified.
    """
    # Column vector of per-row lengths so the division broadcasts across columns.
    row_lengths = np.linalg.norm(v, axis=1).reshape(-1, 1)
    row_lengths[row_lengths == 0] = 1
    return v / row_lengths

In [3]:
def plot_pca(x, y, n_components=None, marker_size=6):
    """Fit a full-SVD PCA on ``x``, print a short summary and plot the projection.

    The projected data is drawn with ``plot_scatter_matrix``, coloured by ``y``.

    ``n_components`` is forwarded to ``PCA``; with ``svd_solver='full'`` (always
    used here) two special values apply:

    - ``'mle'``: Minka's MLE is used to guess the dimension.
    - a float with ``0 < n_components < 1``: select the number of components such
      that the explained variance is greater than that fraction.

    Returns ``(pca, x_pca)``: the fitted ``PCA`` object and the transformed data.
    """
    model = PCA(svd_solver='full', n_components=n_components)
    projected = model.fit_transform(x)

    summary = (
        f'Number of components: {model.n_components_}',
        f'Total variance: {model.explained_variance_ratio_.sum()}',
        f'x_pca.shape: {projected.shape}',
    )
    for line in summary:
        print(line)

    plot_scatter_matrix(projected, y, labels={}, marker_size=marker_size)
    return model, projected


def plot_scatter_matrix(x, y, dim=None, labels=None, height=1700, width=1700, marker_size=6):
    """Show a lower-triangle scatter matrix of the columns of ``x``, coloured by ``y``.

    Parameters
    ----------
    x : 2-D array whose columns are the variables to plot against each other.
    y : per-row values passed to plotly express as ``color``.
    dim : optional sequence of column indices to include. ``None`` or an empty
        sequence means "all columns of ``x``".
    labels : optional mapping passed to plotly express as ``labels``;
        ``None`` is treated as an empty mapping.
    height, width : figure size forwarded to ``update_layout``.
    marker_size : marker size applied to every trace.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # `dim or range(...)` would raise on a multi-element NumPy array (ambiguous
    # truth value), so test for "not given / empty" explicitly instead.
    if dim is None or len(dim) == 0:
        dim = range(x.shape[1])
    if labels is None:
        labels = {}

    fig = px.scatter_matrix(
        x,
        labels=labels,
        dimensions=dim,
        color=y,
    )
    # Hide the diagonal and the upper half of the matrix.
    fig.update_traces(
        diagonal_visible=False,
        showupperhalf=False,
        marker=dict(size=marker_size, colorscale='Rainbow'),
    )
    fig.update_layout(height=height, width=width)
    fig.show()


def plot_scatter_x_pairs(x, y, pairs, rows, cols, title=None, height=None, width=None, marker_size=4):
    """Draw one scatter subplot per ``(i, j)`` column pair of ``x``.

    For each pair, column ``i`` of ``x`` goes on the horizontal axis and column
    ``j`` on the vertical axis. Subplots fill a ``rows`` x ``cols`` grid in
    row-major order (pair ``k`` lands in row ``k // cols + 1``, column
    ``k % cols + 1``). ``y`` is used as the marker colour of every trace.
    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=[f'X[{i}] x X[{j}]' for i, j in pairs],
    )

    marker_style = dict(color=y, size=marker_size)
    for k, pair in enumerate(pairs):
        # plotly grid positions are 1-based.
        grid_row, grid_col = divmod(k, cols)
        trace = go.Scatter(x=x[:, pair[0]], y=x[:, pair[1]], mode='markers', marker=marker_style)
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(height=height, width=width, title_text=title)
    fig.show()

# Data

In [4]:
# Load the 10 training partitions and stack them into single arrays.
xtrp, ytrp = read_train_data(num_partitions=10)
xtr = np.concatenate(xtrp)
ytr = np.concatenate(ytrp)
xte, yte = read_test_data()

# Scale every sample (row) with norm().
xtr = norm(xtr)
xte = norm(xte)

# Labels are indexed as 2-D here; keep only column 0 so they become 1-D vectors.
ytr = ytr[:, 0]
yte = yte[:, 0]

In [7]:
# Sanity check: shapes of the train/test features and labels after preprocessing.
xtr.shape, ytr.shape, xte.shape, yte.shape

((1000, 64), (1000,), (110, 64), (110,))

In [8]:
# Wrap each label vector in a single-column integer DataFrame.
ytrdf = pd.DataFrame(data={'ytr': ytr}, dtype=int)
ytedf = pd.DataFrame(data={'yte': yte}, dtype=int)

In [9]:
# Number of training samples per class label.
ytrdf.groupby(['ytr']).size()

ytr
5    500
6    500
dtype: int64

In [10]:
# Number of test samples per class label, as a two-column frame (label, counts).
ytedf.groupby(['yte']).size().reset_index(name='counts')

Unnamed: 0,yte,counts
0,5,51
1,6,59


In [11]:
def _digit_name(n):
    """Return 'Five' for label 5 and 'Six' for any other label."""
    return 'Five' if n == 5 else 'Six'


# Element-wise mapping from numeric labels to readable class names (used as plot colours).
map_label = np.vectorize(_digit_name)
ytrl = map_label(ytr)
ytel = map_label(yte)

In [12]:
# Pairwise scatter matrix over all columns of the training features, coloured by class name.
plot_scatter_matrix(xtr, ytrl, marker_size=1)

In [13]:
# Close-up of feature column 42 against column 36, coloured by class name.
# NOTE(review): `size=ytr` feeds the numeric label (5 or 6) in as marker size, capped by
# size_max=4 — presumably just a way to get small markers; confirm this is intended.
px.scatter(xtr, x=42, y=36, color=ytrl, width=500, height=400, size=ytr, size_max=4).show()

In [14]:
# PCA keeping enough components to explain more than 99.9% of the variance,
# then a scatter matrix of the projected training data coloured by class name.
pca, x_pca = plot_pca(xtr, ytrl, n_components=0.999, marker_size=1)

Number of components: 64
Total variance: 0.9999999999999999
x_pca.shape: (1000, 64)


In [15]:
# Pairs of principal-component indices (horizontal axis, vertical axis) to plot against each other.
component_pairs = [(1, 0), (1, 2), (1, 3), (0, 63), (0, 2), (0, 3)]

# One subplot per pair on a 2x3 grid; the numeric labels `ytr` are used as marker colours.
plot_scatter_x_pairs(x_pca, ytr, component_pairs, rows=2, cols=3, title='PCA Analysis', height=800, width=1200)

In [16]:
# Sweep a decision threshold over column 1 of the PCA projection: samples below
# the threshold are predicted as 5, the rest as 6. Each row of `lr` holds
# [threshold, training accuracy in percent].
thresholds = np.arange(-.05, .05, .001)
# Size the result from the sweep itself: np.arange with a float step does not
# guarantee an exact element count, so a hard-coded 100 rows could overflow
# (IndexError) or leave a stale all-zero row at the end.
lr = np.zeros([len(thresholds), 2])
for i, threshold in enumerate(thresholds):
    fives = (ytr[x_pca[:, 1] < threshold] == 5).sum()
    sixes = (ytr[x_pca[:, 1] >= threshold] == 6).sum()
    lr[i] = [threshold, (fives + sixes) * 100. / ytr.shape[0]]

In [17]:
# Row of `lr` with the highest accuracy: [best threshold, accuracy in percent].
lr[lr[:, 1].argmax()]

array([-2.20e-02,  8.41e+01])

# Principal component analysis (PCA)

- If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's MLE is used to guess the dimension.
- If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.


In [18]:
# PCA with default settings (no limit on the number of components), fitted on the
# training features; `xtrpca` is the training data projected onto the components.
pca_tr = PCA()
xtrpca = pca_tr.fit_transform(xtr)

In [19]:
# Compare the shape of the data before and after the PCA projection.
xtr.shape, xtrpca.shape

((1000, 64), (1000, 64))

In [20]:
# Singular values associated with each fitted component.
pca_tr.singular_values_

array([8.60231065, 5.66831466, 4.45049682, 4.3381513 , 4.2823701 ,
       4.0696458 , 3.80463493, 3.77463667, 3.69809645, 3.65382994,
       3.49904151, 3.37824313, 3.3310908 , 3.16696442, 3.10708133,
       3.0362003 , 2.99347355, 2.85758109, 2.80357287, 2.74975129,
       2.68793264, 2.61283331, 2.60314875, 2.52966663, 2.49534525,
       2.44100342, 2.36347987, 2.33010085, 2.24214543, 2.22472475,
       2.14274428, 2.12345264, 2.11364861, 2.02974936, 1.97382797,
       1.8929321 , 1.88140965, 1.83387811, 1.7763135 , 1.70700572,
       1.68491877, 1.64295056, 1.60936126, 1.56163548, 1.52951365,
       1.47702205, 1.42945054, 1.42176799, 1.38147447, 1.33239792,
       1.26774923, 1.24312822, 1.23451006, 1.17122423, 1.10941272,
       1.09000704, 1.05897564, 1.04270391, 1.0195865 , 0.98542529,
       0.936015  , 0.92050664, 0.87312735, 0.84557944])

In [21]:
# Fraction of the total variance explained by each component.
pca_tr.explained_variance_ratio_

array([0.1578238 , 0.06852518, 0.04224344, 0.04013762, 0.03911206,
       0.03532283, 0.03087225, 0.03038733, 0.02916747, 0.02847337,
       0.02611202, 0.0243402 , 0.02366547, 0.02139088, 0.02058958,
       0.01966089, 0.01911143, 0.01741564, 0.01676355, 0.01612609,
       0.01540916, 0.01456014, 0.01445241, 0.01364799, 0.01328017,
       0.01270805, 0.01191368, 0.01157955, 0.01072185, 0.01055589,
       0.00979226, 0.00961673, 0.00952813, 0.00878673, 0.00830923,
       0.00764209, 0.00754934, 0.00717271, 0.00672948, 0.00621459,
       0.00605481, 0.00575693, 0.00552395, 0.00520118, 0.00498941,
       0.00465282, 0.00435793, 0.00431122, 0.00407031, 0.00378626,
       0.00342775, 0.0032959 , 0.00325036, 0.00292565, 0.002625  ,
       0.00253397, 0.00239174, 0.00231881, 0.00221713, 0.00207105,
       0.00186856, 0.00180716, 0.00162591, 0.00152493])

In [22]:
# Total explained variance over all kept components.
pca_tr.explained_variance_ratio_.sum()

0.9999999999999999

In [23]:
# Number of components the fitted PCA kept.
pca_tr.n_components_

64

In [24]:
# Shape of the component matrix of the fitted PCA.
pca_tr.components_.shape

(64, 64)

# Singular Value Decomposition (SVD)

Factorizes the matrix `a` into two unitary matrices ``U`` and ``Vh``, and a 1-D array ``s`` of singular values (real, non-negative) such that:
```
a == U @ S @ Vh
```
where ``S`` is a suitably shaped matrix of zeros with main diagonal ``s``.

A unitary matrix is the complex equivalent of an orthogonal matrix.

An orthogonal matrix A is a matrix whose transpose is equal to its inverse:
```
A.T = A.I
```


In [25]:
# Reduced SVD (full_matrices=False) of the training features.
# Note: the third value returned by scipy.linalg.svd is Vh (already transposed);
# it is stored under the name `V` here and is used below without a further transpose.
U, S, V = sp.linalg.svd(xtr, full_matrices=False)

In [26]:
# Shapes of the three factors; np.diag(S) expands the 1-D singular values into a square diagonal matrix.
U.shape, np.diag(S).shape, V.shape

((1000, 64), (64, 64), (64, 64))

In [27]:
# Small illustration of a diagonal matrix: the 4x4 identity.
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [28]:
# Rebuild xtr from its SVD factors and check that every entry matches within `tol`.
tol = 1e-14
xtr_svd = U @ np.diag(S) @ V
np.all(np.abs(xtr - xtr_svd) < tol)

True