In [79]:
import numpy as np
import pandas as pd
import numpy.linalg as la
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

## SVD

X is an n_user by n_movie ratings matrix (rows are users, columns are movies).

In [80]:
# Toy ratings matrix: one row per user, one column per movie.
# Two disjoint groups of users each rate a disjoint pair of movies.
X = np.array(
    [
        [4, 4, 0, 0],
        [3, 3, 0, 0],
        [5, 5, 0, 0],
        [0, 0, 3, 3],
        [0, 0, 2, 2],
        [0, 0, 5, 5],
    ]
)
m = X.shape[0]  # number of rows (users)
n = X.shape[1]  # number of columns (movies)
X

array([[4, 4, 0, 0],
       [3, 3, 0, 0],
       [5, 5, 0, 0],
       [0, 0, 3, 3],
       [0, 0, 2, 2],
       [0, 0, 5, 5]])

In [81]:
# Reduced (economy-size) SVD: full_matrices=False keeps only min(m, n)
# singular triplets, so U is m x n rather than m x m.
U, s, Vh = np.linalg.svd(X, full_matrices=False)
Sigma = np.diag(s)  # singular values laid out on a square diagonal matrix

tuple(factor.shape for factor in (U, s, Sigma, Vh))

((6, 4), (4,), (4, 4), (4, 4))

In [82]:
# Singular values from the reduced SVD. Only the first two are non-negligible;
# the remaining two are at floating-point noise level (~1e-16).
s

array([1.00000000e+01, 8.71779789e+00, 8.88178420e-16, 7.02166694e-16])

In [83]:
# Left singular vectors from the reduced SVD: one row per row of X (user),
# one column per singular value.
U

array([[-0.56568542,  0.        ,  0.8       , -0.18862383],
       [-0.42426407,  0.        , -0.49032552, -0.71799857],
       [-0.70710678,  0.        , -0.34580469,  0.5816982 ],
       [ 0.        , -0.48666426,  0.        ,  0.01176878],
       [ 0.        , -0.32444284,  0.        ,  0.30598825],
       [ 0.        , -0.81110711,  0.        , -0.12945657]])

In [84]:
# Scale each column of U by its singular value (broadcast over rows),
# then round to one decimal for display.
(U * s).round(1)

array([[-5.7,  0. ,  0. , -0. ],
       [-4.2,  0. , -0. , -0. ],
       [-7.1,  0. , -0. ,  0. ],
       [ 0. , -4.2,  0. ,  0. ],
       [ 0. , -2.8,  0. ,  0. ],
       [ 0. , -7.1,  0. , -0. ]])

In [85]:
# `*` and `@` share precedence and associate left-to-right, so the original
# expression is (U * s) @ Vh; the parentheses just make that explicit.
np.allclose(X, (U * s) @ Vh)

True

In [86]:
# Element-wise version of the reconstruction check: (U * s) @ Vh vs. X.
np.isclose(X, (U * s) @ Vh)

array([[ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True],
       [ True,  True,  True,  True]])

In [87]:
# Number of leading singular components to keep in the truncated factors below.
k = 2

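Compare the two outputs below: U scaled by all singular values versus only the first k columns of U scaled by the first k singular values.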

In [88]:
# All columns of U, each scaled by its singular value.
np.round(U * s, 2)

array([[-5.66,  0.  ,  0.  , -0.  ],
       [-4.24,  0.  , -0.  , -0.  ],
       [-7.07,  0.  , -0.  ,  0.  ],
       [ 0.  , -4.24,  0.  ,  0.  ],
       [ 0.  , -2.83,  0.  ,  0.  ],
       [ 0.  , -7.07,  0.  , -0.  ]])

In [89]:
# Only the first k columns of U, scaled by the first k singular values.
np.round(U[:, :k] * s[:k], 2)

array([[-5.66,  0.  ],
       [-4.24,  0.  ],
       [-7.07,  0.  ],
       [ 0.  , -4.24],
       [ 0.  , -2.83],
       [ 0.  , -7.07]])

Compare the two outputs below: the columns of Vh.T scaled by all singular values versus only the first k of them.

In [90]:
# Right singular vectors as columns (Vh transposed), each scaled by its singular value.
np.round(Vh.T * s, 2)

array([[-7.07, -0.  , -0.  ,  0.  ],
       [-7.07, -0.  ,  0.  ,  0.  ],
       [-0.  , -6.16,  0.  , -0.  ],
       [-0.  , -6.16,  0.  ,  0.  ]])

In [91]:
# First k right singular vectors as columns, scaled by the first k singular values.
np.round(Vh[:k].T * s[:k], 2)

array([[-7.07, -0.  ],
       [-7.07, -0.  ],
       [-0.  , -6.16],
       [-0.  , -6.16]])

embeddings of users

In [92]:
# User embeddings: the first k left singular vectors, each column scaled by
# its singular value (one k-dimensional row per user).
eb_u = np.multiply(U[:, :k], s[:k])
np.round(eb_u, 2)

array([[-5.66,  0.  ],
       [-4.24,  0.  ],
       [-7.07,  0.  ],
       [ 0.  , -4.24],
       [ 0.  , -2.83],
       [ 0.  , -7.07]])

embeddings of movies

In [93]:
# Movie embeddings: the first k right singular vectors, transposed so that
# each row is one movie. No singular-value scaling is applied here.
eb_m = np.transpose(Vh[:k])
np.round(eb_m, 2)

array([[-0.71, -0.  ],
       [-0.71, -0.  ],
       [-0.  , -0.71],
       [-0.  , -0.71]])

Compare the two outputs below: X = user_embedding @ movie_embedding.T

In [94]:
# Rank-k reconstruction of X from the user and movie embeddings.
np.matmul(eb_u, eb_m.T)

array([[4., 4., 0., 0.],
       [3., 3., 0., 0.],
       [5., 5., 0., 0.],
       [0., 0., 3., 3.],
       [0., 0., 2., 2.],
       [0., 0., 5., 5.]])

In [95]:
# Original ratings matrix, displayed for comparison with eb_u @ eb_m.T.
X

array([[4, 4, 0, 0],
       [3, 3, 0, 0],
       [5, 5, 0, 0],
       [0, 0, 3, 3],
       [0, 0, 2, 2],
       [0, 0, 5, 5]])

Compare the two outputs below: user_embedding = X @ movie_embedding

In [96]:
# User embeddings as computed from U and s.
np.round(eb_u, 2)

array([[-5.66,  0.  ],
       [-4.24,  0.  ],
       [-7.07,  0.  ],
       [ 0.  , -4.24],
       [ 0.  , -2.83],
       [ 0.  , -7.07]])

In [97]:
# Projecting X onto the movie embeddings yields the same user embeddings.
np.round(X @ eb_m, 2)

array([[-5.66,  0.  ],
       [-4.24,  0.  ],
       [-7.07,  0.  ],
       [ 0.  , -4.24],
       [ 0.  , -2.83],
       [ 0.  , -7.07]])

## PCA equivalence

In [98]:
# PCA model keeping k components; it is fitted in a later cell.
pca = PCA(n_components=k)
# Centre each column (movie) of X on its mean. Despite the `_nrm` suffix,
# only the mean is subtracted; no rescaling is applied.
X_mean = np.mean(X, axis=0)
X_nrm = X - X_mean
np.round(X_nrm, 2)

array([[ 2.  ,  2.  , -1.67, -1.67],
       [ 1.  ,  1.  , -1.67, -1.67],
       [ 3.  ,  3.  , -1.67, -1.67],
       [-2.  , -2.  ,  1.33,  1.33],
       [-2.  , -2.  ,  0.33,  0.33],
       [-2.  , -2.  ,  3.33,  3.33]])

In [99]:
# Reduced SVD of the column-centred matrix; the trailing underscore
# distinguishes these factors from the SVD of the raw X.
U_, s_, Vh_ = np.linalg.svd(X_nrm, full_matrices=False)

Compare the two outputs below: both are the user embeddings (PCA scores versus the scaled left singular vectors).

In [100]:
# Fit PCA on the centred data and show the projected rows (user scores).
# This call also fits `pca`, which the later `pca.components_` cell relies on.
np.round(pca.fit_transform(X_nrm), 2)

array([[-3.68,  0.12],
       [-2.62, -0.82],
       [-4.74,  1.06],
       [ 3.37, -0.47],
       [ 2.43, -1.53],
       [ 5.25,  1.64]])

In [101]:
# User embeddings from the SVD of the centred matrix: first k left singular
# vectors, each column scaled by its singular value.
eb_u_ = np.multiply(U_[:, :k], s_[:k])
np.round(eb_u_, 2)

array([[-3.68,  0.12],
       [-2.62, -0.82],
       [-4.74,  1.06],
       [ 3.37, -0.47],
       [ 2.43, -1.53],
       [ 5.25,  1.64]])

Compare the two outputs below: both are the movie embeddings (one is the transpose of the other).

In [102]:
# Movie embeddings from the SVD of the centred matrix: first k right singular
# vectors, transposed so that each row is one movie.
eb_m_ = np.transpose(Vh_[:k])
np.round(eb_m_, 2)

array([[-0.53,  0.47],
       [-0.53,  0.47],
       [ 0.47,  0.53],
       [ 0.47,  0.53]])

In [103]:
# Principal axes of the fitted PCA, one per row; compare with eb_m_ (movies as rows).
np.round(pca.components_, 2)

array([[-0.53, -0.53,  0.47,  0.47],
       [ 0.47,  0.47,  0.53,  0.53]])

Compare the two outputs below: X_nrm = user_embedding @ movie_embedding.T

In [104]:
# Rank-k reconstruction of the centred matrix from the two embeddings.
np.round(eb_u_ @ eb_m_.T, 2)

array([[ 2.  ,  2.  , -1.67, -1.67],
       [ 1.  ,  1.  , -1.67, -1.67],
       [ 3.  ,  3.  , -1.67, -1.67],
       [-2.  , -2.  ,  1.33,  1.33],
       [-2.  , -2.  ,  0.33,  0.33],
       [-2.  , -2.  ,  3.33,  3.33]])

In [105]:
# Centred matrix, displayed for comparison with the reconstruction.
np.round(X_nrm, 2)

array([[ 2.  ,  2.  , -1.67, -1.67],
       [ 1.  ,  1.  , -1.67, -1.67],
       [ 3.  ,  3.  , -1.67, -1.67],
       [-2.  , -2.  ,  1.33,  1.33],
       [-2.  , -2.  ,  0.33,  0.33],
       [-2.  , -2.  ,  3.33,  3.33]])

Compare the two outputs below: user_embedding = X_nrm @ movie_embedding

In [106]:
# User embeddings as computed from U_ and s_.
np.round(eb_u_, 2)

array([[-3.68,  0.12],
       [-2.62, -0.82],
       [-4.74,  1.06],
       [ 3.37, -0.47],
       [ 2.43, -1.53],
       [ 5.25,  1.64]])

In [107]:
# Projecting the centred matrix onto the movie embeddings gives the same user embeddings.
np.round(X_nrm @ eb_m_, 2)

array([[-3.68,  0.12],
       [-2.62, -0.82],
       [-4.74,  1.06],
       [ 3.37, -0.47],
       [ 2.43, -1.53],
       [ 5.25,  1.64]])

# big number example

In [108]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import numpy.linalg as la

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [109]:
# Restrict the corpus to four newsgroups and pass `remove` so the loader
# is asked to drop headers, footers and quoted replies from each post.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
# Load both splits with identical filtering options.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, remove=remove)
    for split in ('train', 'test')
)

In [110]:
# len() of the Bunch itself counts its fields (it returned 5), not documents;
# measure the list of documents in `.data` instead.
len(newsgroups_train.data), type(newsgroups_train)

(2034, sklearn.utils.Bunch)

In [78]:
# NOTE(review): len() on this Bunch returns 5 (see output), which is not a
# document count; presumably len(newsgroups_test.data) is what was intended — confirm.
len(newsgroups_test), type(newsgroups_test)

(5, sklearn.utils.Bunch)

In [None]:
# Binary bag-of-words: each entry is 1 if the term occurs in the document,
# else 0, with English stop words dropped.
vectorizer = CountVectorizer(stop_words='english', binary=True)
# .toarray() returns a plain ndarray; .todense() would return np.matrix,
# a class NumPy no longer recommends and whose `*` means matrix product.
vectors = vectorizer.fit_transform(newsgroups_train.data).toarray()  # (documents, vocab)
m, n = vectors.shape
m, n

(2034, 26576)

In [None]:
# Names of the newsgroup categories present in the loaded training set.
newsgroups_train.target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back on older releases so the cell
# runs on either side of that change.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# dtype=str gives a fixed-width unicode array whichever accessor was used
# (get_feature_names_out() returns an object-dtype array).
vocab = np.array(feature_names, dtype=str)

vocab.shape



(26576,)

In [None]:
# Peek at ten consecutive vocabulary entries.
vocab[10000:10010]

array(['factors', 'factory', 'facts', 'factsnet', 'factual', 'factually',
       'faculty', 'fade', 'fades', 'fading'], dtype='<U80')

X is n_doc by n_vocab

In [None]:
# Note: this rebinds X, U, s and Vh, replacing the toy-example values above.
X = np.array(vectors)
# Reduced SVD of the document-term matrix.
U, s, Vh = np.linalg.svd(X, full_matrices=False)

In [None]:
# Shapes of the three SVD factors.
tuple(factor.shape for factor in (U, s, Vh))

((2034, 2034), (2034,), (2034, 26576))

In [None]:
# Sanity check: (U * s) @ Vh reproduces X. Parentheses make the
# left-to-right evaluation of `*` and `@` explicit.
np.allclose(X, (U * s) @ Vh)

True

In [None]:
# Number of leading singular components kept for the document and vocabulary
# embeddings below (this rebinds the k = 2 used in the toy example).
k = 300

embeddings of docs

In [None]:
# Document embeddings: first k left singular vectors, each column scaled by
# its singular value (one k-dimensional row per document).
eb_d = np.multiply(U[:, :k], s[:k])

embeddings of vocabs

In [None]:
# Vocabulary embeddings: first k right singular vectors, kept as ROWS of Vh,
# so each vocabulary term is a column here. This is the transpose of the
# layout used for eb_m in the toy example (eb_m = Vh[:k].T).
eb_v = Vh[:k]

In [None]:
# Shapes of the document and vocabulary embeddings.
tuple(emb.shape for emb in (eb_d, eb_v))

((2034, 300), (300, 26576))

In [None]:
# The rank-k product of the two embeddings has the same shape as X.
tuple(mat.shape for mat in (eb_d @ eb_v, X))

((2034, 26576), (2034, 26576))