# Download and visualize the data

This notebook shows how to download public data from Dropbox to the local machine and perform some basic manipulations.

### Add the src folder to the path

In [3]:
import sys
import os

# Assumes the notebook is launched from a folder directly under the repository
# root, so that "<root>/src" holds the project package -- TODO confirm.
root_path = os.path.dirname(os.getcwd())
src_path = os.path.join(root_path, "src")
# Re-running this cell must not stack duplicate entries at the front of sys.path,
# so only insert when src_path is not already the first entry.
if sys.path[:1] != [src_path]:
    sys.path.insert(0, src_path)

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from tissue_purifier.data_utils.datamodule import SlideSeqTestisDM
import argparse

# Let the datamodule register its own options on a fresh parser, then parse
# argv leniently: unrecognized entries are returned separately and ignored.
parser = SlideSeqTestisDM.add_datamodule_specific_args(argparse.ArgumentParser())
args, _ = parser.parse_known_args()

# Build the datamodule from the parsed option values and run its
# data-preparation step.
dm = SlideSeqTestisDM(**vars(args))
dm.prepare_data()

### Toy PCA example with scikit-learn

In [None]:
%matplotlib inline
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Six 2-D points, reduced to one dimension with scikit-learn's PCA.
X = np.array(
    [
        [-1, -1],
        [-2, -1],
        [-3, -2],
        [1, 1],
        [2, 1],
        [3, 2],
    ]
)
pca = PCA(n_components=1)
X_new = pca.fit_transform(X)

# Show the shape before and after the projection.
for arr in (X, X_new):
    print(arr.shape)

In [None]:
# Original 2-D points, plus the 1-D projection drawn along the line y = 1.
plt.scatter(X[:, 0], X[:, 1])
projected = X_new[:, 0]
plt.scatter(projected, np.ones_like(projected))

### Implementing PCA by hand, following https://en.wikipedia.org/wiki/Principal_component_analysis#Limitations

In [None]:
# 1. z_score
# 2. covariance
# 3. svd 
# 4. multiplication by few columns of V


# z-score step: for each feature, remove the mean and divide by the standard deviation
import numpy

def get_z_score(x: torch.Tensor, dim: int) -> torch.Tensor:
    """Standardize ``x`` along ``dim``: subtract the mean, divide by the std.

    The standard deviation is the unbiased (n - 1) estimate.

    Args:
        x: floating-point tensor to standardize.
        dim: dimension along which mean and standard deviation are computed.

    Returns:
        Tensor with the same shape as ``x``. Slices that are constant along
        ``dim`` (standard deviation exactly zero) are mapped to zeros instead
        of NaN.
    """
    std, mean = torch.std_mean(x, dim=dim, unbiased=True, keepdim=True)
    # A constant slice has std == 0 and would produce 0/0 = NaN, which then
    # poisons any downstream covariance/SVD. Dividing by 1 leaves it at zero.
    safe_std = torch.where(std == 0, torch.ones_like(std), std)
    return (x - mean) / safe_std


def pca_mine(embeddings, n_components, z_score):
    """Project ``embeddings`` onto their leading principal components.

    Steps: center (or z-score) each feature, build the feature covariance
    matrix, take its SVD, and multiply the data by the first ``n_components``
    singular vectors.

    Args:
        embeddings: float tensor of shape ``(..., n_samples, n_features)``;
            leading batch dimensions are processed independently.
        n_components: number of principal components to keep.
        z_score: if True, standardize each feature with ``get_z_score``;
            otherwise only subtract the per-feature mean.

    Returns:
        Tensor of shape ``(..., n_samples, n_components)`` with the projected
        data. The sign of each component is fixed so that its
        largest-magnitude loading is positive; without a convention the sign
        returned by the SVD is arbitrary.

    Note:
        With a single sample the ``n_samples - 1`` normalization divides by
        zero; callers are expected to pass at least two samples.
    """
    if z_score:
        embeddings = get_z_score(embeddings, dim=-2)
    else:
        embeddings = embeddings - torch.mean(embeddings, dim=-2, keepdim=True)

    # (p x p) covariance of the centered data, with samples along dim -2.
    n_samples = embeddings.shape[-2]
    cov = embeddings.transpose(-2, -1) @ embeddings / (n_samples - 1)

    # cov is symmetric positive semi-definite, so its left singular vectors are
    # the principal axes, ordered by decreasing explained variance.
    U, _, _ = torch.linalg.svd(cov, full_matrices=True)
    M = U[..., :n_components]

    # Deterministic sign: flip each axis so its largest-|.| entry is positive.
    anchor = M.abs().argmax(dim=-2, keepdim=True)
    M = M * torch.sign(torch.gather(M, -2, anchor))

    return embeddings @ M
    
    
def pca_sklearn(embeddings, n_components, z_score):
    """Reference PCA projection computed with scikit-learn.

    Args:
        embeddings: float tensor with samples along dim -2 and features along
            dim -1.
        n_components: number of principal components to keep.
        z_score: if True, standardize each feature with ``get_z_score``;
            otherwise only subtract the per-feature mean.

    Returns:
        The array produced by ``PCA.fit_transform`` on the pre-processed data.
    """
    if z_score:
        embeddings = get_z_score(embeddings, dim=-2)
    else:
        # scikit-learn's PCA centers its input as well; the explicit centering
        # keeps the pre-processing identical to pca_mine.
        embeddings = embeddings - torch.mean(embeddings, dim=-2, keepdim=True)

    # detach() first: Tensor.numpy() raises on tensors that require grad, which
    # is common for embeddings taken straight from a model.
    features = embeddings.detach().cpu().numpy()
    return PCA(n_components=n_components, random_state=0).fit_transform(features)

In [None]:
n_sample, n_feature = 60, 256
# Draw from a dedicated, seeded generator so that a fresh "Restart & Run All"
# reproduces the same matrix without touching the global torch RNG state.
X = torch.randn((n_sample, n_feature), generator=torch.Generator().manual_seed(0))
# Alternative toy input (same points as the scikit-learn example):
#X = torch.tensor([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]).float()

# Same pre-processing and number of components for both implementations.
y1 = pca_mine(X, n_components=2, z_score=True)
y2 = pca_sklearn(X, n_components=2, z_score=True)

# Compare the first ten projected samples side by side.
print(y1[:10])
print(y2[:10])

In [None]:
# Overlay the first two raw features of X with the two columns of y1.
for points, name in ((X, 'X'), (y1, 'y1')):
    plt.scatter(points[:, 0], points[:, 1], label=name)
plt.legend()

In [None]:
# Projection from pca_mine: first component against second component.
y1_first, y1_second = y1[:, 0], y1[:, 1]
plt.scatter(y1_first, y1_second, label='y1')
plt.legend()

In [None]:
# Projection from pca_sklearn: first component against second component.
y2_first, y2_second = y2[:, 0], y2[:, 1]
plt.scatter(y2_first, y2_second, label='y2')
plt.legend()

In [None]:
# First two raw features of X, with every projected value of y1 and y2 laid
# out along the horizontal lines y = 3 and y = 4 respectively.
plt.scatter(X[:, 0], X[:, 1])
baseline = torch.ones_like(y1)
plt.scatter(y1, 3 * baseline, label='y1')
plt.scatter(y2, 4 * baseline, label='y2')
plt.legend()

In [None]:
# Dump the full projection returned by pca_mine for inspection.
print(y1)

In [None]:
# Dump the full projection returned by pca_sklearn for inspection.
print(y2)

In [None]:
# X_new was fitted with n_components=1 in the toy example, so it has no second
# column and X_new[:, 1] raises IndexError. Plot the second component when it
# exists; otherwise draw the 1-D projection along the horizontal line y = 0.
X_new_second = X_new[:, 1] if X_new.shape[1] > 1 else np.zeros_like(X_new[:, 0])
plt.scatter(X_new[:, 0], X_new_second)