## Singular Value Decomposition (SVD) in Python

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
%matplotlib inline

In [2]:
# Toy three-document corpus borrowed from the CS224N lecture example.
corpus = [
    'I like deep learning',
    'I like NLP',
    'I enjoy flying'
]

# Unigram counts: one row per document, one column per vocabulary term.
cnt_vectorizer = CountVectorizer(ngram_range=(1, 1))
word_doc_cnt_matrix = (
    cnt_vectorizer
    .fit_transform(corpus)
    .toarray()
)

# X^T X: entry (i, j) sums count_i * count_j over all documents,
# giving a term-by-term co-occurrence matrix at document level.
word_cooc_matrix = np.dot(word_doc_cnt_matrix.T, word_doc_cnt_matrix)

print(word_doc_cnt_matrix.shape, word_cooc_matrix.shape)

(3, 6) (6, 6)


### Comparison between `sklearn.decomposition.TruncatedSVD` and `np.linalg.svd`

#### Apply `np.linalg.svd` and check reconstruction

In [3]:
# Reduced ("thin") SVD of the document-term count matrix.
# S comes back as a 1-D vector of singular values, not a diagonal matrix.
U, S, Vh = np.linalg.svd(
    word_doc_cnt_matrix,
    full_matrices=False,
)
print(*(factor.shape for factor in (U, S, Vh)))

(3, 3) (3,) (3, 6)


In [4]:
# Show the three SVD factors rounded to 3 decimals.
# NOTE: the last factor printed under the label "V" is `Vh` as returned by
# np.linalg.svd (its rows are the right singular vectors).
print('\nU =', np.round(U, 3), sep='\n')
print('\nS =', np.round(S, 3), sep='\n')
print('\nV =', np.round(Vh, 3), sep='\n')


U =
[[-0.851  0.    -0.526]
 [-0.526  0.     0.851]
 [ 0.     1.     0.   ]]

S =
[1.902 1.414 1.176]

V =
[[-0.447  0.     0.    -0.447 -0.724 -0.276]
 [-0.     0.707  0.707 -0.     0.     0.   ]
 [-0.447  0.     0.    -0.447  0.276  0.724]]


In [5]:
# Rebuild the matrix as U · diag(S) · Vh; S must be expanded to a diagonal
# matrix first because np.linalg.svd returns it as a 1-D vector.
word_doc_cnt_matrix_recon = np.dot(np.dot(U, np.diag(S)), Vh)

# Rounding to integers hides floating-point noise in the reconstruction.
print('\nOriginal matrix =', word_doc_cnt_matrix, sep='\n')
print('\nReconstructed matrix =', np.round(word_doc_cnt_matrix_recon), sep='\n')


Original matrix =
[[1 0 0 1 1 0]
 [0 0 0 0 1 1]
 [0 1 1 0 0 0]]

Reconstructed matrix =
[[ 1.  0.  0.  1.  1. -0.]
 [-0.  0.  0. -0.  1.  1.]
 [-0.  1.  1. -0.  0.  0.]]


In [6]:
# Numerical check (within np.allclose default tolerances) that the product
# of the SVD factors reproduces the original count matrix.
print(
    'Reconstruction close?',
    np.allclose(
        word_doc_cnt_matrix,
        word_doc_cnt_matrix_recon,
    ),
)

Reconstruction close? True


#### Apply `sklearn.decomposition.TruncatedSVD` and check transformation

In [7]:
# Keep 3 components so the result can be compared with the thin SVD above;
# random_state is fixed to make the fit repeatable.
svd = TruncatedSVD(
    n_components=3,
    random_state=2020,
)
# Left as the last expression so the fitted estimator is displayed.
svd.fit(word_doc_cnt_matrix)

TruncatedSVD(algorithm='randomized', n_components=3, n_iter=5,
             random_state=2020, tol=0.0)

### Verification

#### 1. `svd.components_` is equivalent to `Vh` from `np.linalg.svd(X)`, up to the sign of each row

In [8]:
# Fitted components (one row per component), rounded for display.
svd.components_.round(3)

array([[ 0.447, -0.   , -0.   ,  0.447,  0.724,  0.276],
       [ 0.   ,  0.707,  0.707,  0.   , -0.   , -0.   ],
       [-0.447, -0.   , -0.   , -0.447,  0.276,  0.724]])

In [9]:
# Compare `svd.components_` with the first 3 rows of Vh from np.linalg.svd.
# Singular vectors are only defined up to sign, so a raw element-wise
# comparison can fail even when both decompositions agree. Align the sign of
# each row (via the sign of the row-wise inner product) before asserting.
vh_top = Vh[:3]
row_signs = np.sign(np.sum(svd.components_ * vh_top, axis=1, keepdims=True))
np.testing.assert_allclose(svd.components_, row_signs * vh_top, atol=1e-8)

# Displayed for side-by-side visual comparison with the previous cell.
np.round(vh_top, 3)

array([[-0.447,  0.   ,  0.   , -0.447, -0.724, -0.276],
       [-0.   ,  0.707,  0.707, -0.   ,  0.   ,  0.   ],
       [-0.447,  0.   ,  0.   , -0.447,  0.276,  0.724]])

#### 2. `svd.transform(X)` is equivalent to `U x S` from `np.linalg.svd(X)`, up to the sign of each column

In [10]:
# Project the documents onto the fitted components.
word_doc_cnt_matrix_trm = svd.transform(word_doc_cnt_matrix)

# Rounded view for display only; the stored matrix keeps full precision.
word_doc_cnt_matrix_trm.round(3)

array([[ 1.618,  0.   , -0.618],
       [ 1.   , -0.   ,  1.   ],
       [-0.   ,  1.414, -0.   ]])

In [11]:
# U · diag(S) from np.linalg.svd, to be compared with svd.transform(X).
us_product = U.dot(np.diag(S))

# Each column may differ from the TruncatedSVD projection by a factor of -1
# (sign ambiguity of singular vectors), so align column signs via the sign of
# the column-wise inner product before asserting numerical agreement.
col_signs = np.sign(np.sum(word_doc_cnt_matrix_trm * us_product, axis=0))
np.testing.assert_allclose(word_doc_cnt_matrix_trm, us_product * col_signs, atol=1e-8)

# Displayed for side-by-side visual comparison with the previous cell.
np.round(us_product, 3)

array([[-1.618,  0.   , -0.618],
       [-1.   ,  0.   ,  1.   ],
       [ 0.   ,  1.414,  0.   ]])

#### 3. `svd.transform(X)` is equivalent to `X x svd.components_.T`: $US = XV^{T}$, where $V$ denotes `svd.components_`

In [12]:
# Manual projection: multiply X by the transposed component matrix.
np.dot(word_doc_cnt_matrix, svd.components_.T).round(3)

array([[ 1.618,  0.   , -0.618],
       [ 1.   , -0.   ,  1.   ],
       [-0.   ,  1.414, -0.   ]])

In [13]:
# Check numerically, rather than only by eye, that the manual projection
# X · components_^T reproduces the output of svd.transform(X).
np.testing.assert_allclose(
    word_doc_cnt_matrix.dot(svd.components_.T),
    word_doc_cnt_matrix_trm,
    atol=1e-12,
)

# Displayed for side-by-side visual comparison with the previous cell.
np.round(word_doc_cnt_matrix_trm, 3)

array([[ 1.618,  0.   , -0.618],
       [ 1.   , -0.   ,  1.   ],
       [-0.   ,  1.414, -0.   ]])