In [None]:
import itertools
import numpy as np
from nltk.corpus import brown
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sympy import Matrix

## Train LSA model based on very small corpus

In [None]:
CORPUS_SIZE = 1000

# Each Brown paragraph is flattened one level (sentences -> tokens); purely
# alphabetic tokens are lowercased and joined with spaces, giving one string
# per paragraph. Only the first CORPUS_SIZE paragraphs are used.
brown_sample = [
    " ".join(
        token.lower()
        for token in itertools.chain.from_iterable(paragraph)
        if token.isalpha()
    )
    for paragraph in brown.paras()[:CORPUS_SIZE]
]
brown_sample[0]

In [None]:
# Probe words whose vectors and pairwise similarities are inspected below.
WORDS = [
    "football",
    "baseball",
    "ball",
    "players",
    "coach",
]

### Examine word representations prior to SVD

In [None]:
# Bag-of-words counts. Per scikit-learn's CountVectorizer documentation,
# stop_words="english" drops the built-in English stop-word list and an
# integer min_df=10 discards terms that occur in fewer than 10 documents.
vectorizer = CountVectorizer(stop_words="english", min_df=10)
# Sparse count matrix: one row per paragraph in brown_sample, one column per
# retained vocabulary term (columns are indexed via vectorizer.vocabulary_).
term_doc_matrix = vectorizer.fit_transform(brown_sample)

In [None]:
# (number of documents, number of vocabulary terms)
term_doc_matrix.shape

In [None]:
# Peek at the first 10 (term, column index) pairs without copying the whole
# vocabulary into a list first.
list(itertools.islice(vectorizer.vocabulary_.items(), 10))

#### Sparsity of vectors for 5 selected words

In [None]:
# Column of the term-document matrix that corresponds to each selected word.
col_indices = [vectorizer.vocabulary_[w] for w in WORDS]
# Use the real number of rows as the denominator instead of a literal 1000, so
# the percentage stays correct if the corpus size changes.
n_docs = term_doc_matrix.shape[0]
for w, col in zip(WORDS, col_indices):
    # Number of documents in which the word occurs at least once; counted on
    # the sparse column directly rather than densifying it.
    doc_count = term_doc_matrix[:, col].count_nonzero()
    # The original format string ended in a stray '%', printing "(x%)%".
    print(f"Vector for '{w:8s}' has {doc_count} nonzero elements ({100*doc_count/n_docs:0.1f}%)")

#### Last 100 rows (documents) for 5 selected columns (words) of term-document matrix

In [None]:
# Negative slice selects the final 100 documents whatever the corpus size;
# the previous hard-coded start row (900) was only "the last 100" when the
# matrix had exactly 1000 rows.
Matrix(term_doc_matrix[-100:, col_indices].todense())

### Examine reduced-dimensionality representations

#### Apply SVD

In [None]:
# TruncatedSVD's default solver is randomized; pin random_state so the
# components (and the downstream similarity values) are reproducible under
# Restart & Run All.
svd = TruncatedSVD(n_components=30, random_state=0)
# fit_transform returns the documents projected onto the 30 components
# (one row per document); it is not used further in the cells visible here.
approx_term_document_matrix = svd.fit_transform(term_doc_matrix)
# components_ has one row per component and one column per vocabulary term,
# so column j is the reduced-dimensionality (LSA) vector for term j.
svd_w = svd.components_
svd_w.shape

#### Sparsity of LSA vectors for 5 selected words

In [None]:
# Length of each LSA word vector, read from the fitted matrix rather than
# repeating the literal 30 passed to TruncatedSVD.
n_components = svd_w.shape[0]
for w, col in zip(WORDS, col_indices):
    # Count the non-zero entries of the word's column in the component matrix.
    nonzero_count = np.count_nonzero(svd_w[:, col])
    print(f"Vector for '{w:8s}' has {nonzero_count} nonzero elements ({100*nonzero_count/n_components:0.1f}%)")

#### Vectors for 5 selected words from LSA

In [None]:
# One column per word in WORDS, one row per SVD component; evalf(2) evaluates
# the entries to 2 digits of precision for a compact display.
Matrix(svd_w[:,col_indices]).evalf(2)

### Look at word similarities

In [None]:
def term_doc_similarity(w1, w2):
    """Cosine similarity of two words using their raw term-document columns.

    Both words are looked up in ``vectorizer.vocabulary_`` (a missing word
    raises ``KeyError``); their count columns from ``term_doc_matrix`` are
    densified to 1-D arrays and compared.
    """
    vocab = vectorizer.vocabulary_
    first, second = (
        term_doc_matrix[:, vocab[word]].toarray().ravel() for word in (w1, w2)
    )
    # Same arithmetic as dot(a, b) / sqrt(dot(a, a) * dot(b, b)).
    numerator = first @ second
    denominator = np.sqrt((first @ first) * (second @ second))
    return numerator / denominator

In [None]:
# Cosine similarity of "players" and "ball" on raw term-document counts.
term_doc_similarity("players", "ball")

In [None]:
# Cosine similarity of "football" and "baseball" on raw term-document counts.
term_doc_similarity("football", "baseball")

In [None]:
# Cosine similarity of "players" and "coach" on raw term-document counts.
term_doc_similarity("players", "coach")

In [None]:
# Cosine similarity of "football" and "ball" on raw term-document counts.
term_doc_similarity("football", "ball")

In [None]:
def lsa_similarity(w1, w2):
    """Cosine similarity of two words using their columns of ``svd_w``.

    Both words are looked up in ``vectorizer.vocabulary_`` (a missing word
    raises ``KeyError``); the matching columns of the SVD component matrix
    serve as the word vectors.
    """
    vocab = vectorizer.vocabulary_
    first, second = (svd_w[:, vocab[word]] for word in (w1, w2))
    # Same arithmetic as dot(a, b) / sqrt(dot(a, a) * dot(b, b)).
    numerator = first @ second
    denominator = np.sqrt((first @ first) * (second @ second))
    return numerator / denominator

In [None]:
# Cosine similarity of "players" and "ball" in the reduced LSA space.
lsa_similarity("players", "ball")

In [None]:
# Cosine similarity of "football" and "baseball" in the reduced LSA space.
lsa_similarity("football", "baseball")

In [None]:
# Cosine similarity of "players" and "coach" in the reduced LSA space.
lsa_similarity("players", "coach")

In [None]:
# "jury" is not one of WORDS; presumably included as an off-topic contrast
# to the sports terms.
lsa_similarity("players", "jury")

In [None]:
# Cosine similarity of "football" and "ball" in the reduced LSA space.
lsa_similarity("football", "ball")