In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import nltk

<h1>Vector space models</h1>

## Converting text to a vector

To convert a block of text to a vector, we first decide on an ordered list of words (the vocabulary). Then, for each word in that list, we count how many times it appears in the block of text; those counts, taken in vocabulary order, are the components of the vector.

In [None]:
# Three short sample texts; t1 and t2 differ in a single word (men / women).
t1 = "now is the time for all good men to come to the aid of their country"
t2 = "now is the time for all good women to come to the aid of their country"
t3 = "is it time for the women to lead us all"

# Split every text into its list of word tokens.
t1w, t2w, t3w = (nltk.word_tokenize(text) for text in (t1, t2, t3))

In [None]:
# The vocabulary is every distinct token from the three texts, in sorted
# order; a word's position here is its index in each text vector.
vocab = sorted({*t1w, *t2w, *t3w})
print(vocab)

In [None]:
# Longhand count vector for t1: step through the vocabulary in order and
# record how many of t1's tokens equal each word.
mylist = []
for word in vocab:
    times_in_t1 = t1w.count(word)
    mylist.append(times_in_t1)

In [None]:
# The same per-word counts for t1, this time as a NumPy array with one entry
# per vocabulary word.
v1 = np.array(list(map(t1w.count, vocab)))
print(v1)

In [None]:
def norm_vec(v):
    """Scale ``v`` to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array_like
        Vector to normalise, e.g. an array of word counts.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. When the norm is zero (every entry of
        ``v`` is zero) an all-zero float array of the same shape is returned,
        instead of dividing by zero and producing NaNs.
    """
    v = np.asarray(v)
    length = np.linalg.norm(v)
    if length == 0:
        # A text containing none of the vocabulary words has no direction;
        # keep it as the zero vector so later dot products come out as 0.
        return np.zeros(v.shape, dtype=float)
    return v / length
# Replace the raw counts in v1 with their unit-length version.
v1 = norm_vec(v1)

# Print arrays with three decimal places so the float vectors stay readable.
np.set_printoptions(precision=3)
print(v1)

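Once we have a unit-length vector for each block of text, we measure the similarity between two blocks with the dot product of their vectors. Because the vectors are normalised, this dot product is the cosine of the angle between them: it is 1 when the two texts use words in the same proportions and 0 when they have no vocabulary words in common.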

In [None]:
# Unit-length count vectors for the second and third texts, built the same
# way as v1: count each vocabulary word, then normalise.
v2, v3 = (
    norm_vec(np.array([tokens.count(word) for word in vocab]))
    for tokens in (t2w, t3w)
)

In [None]:
# v1, v2 and v3 were normalised with norm_vec, so each dot product is the
# cosine similarity of the two texts.
print("dot product of v1 and v2 is ", np.dot(v1, v2))
print("dot product of v1 and v3 is ", np.dot(v1, v3))
# Fixed: this line previously repeated np.dot(v1, v3) under the v2/v3 label.
print("dot product of v2 and v3 is ", np.dot(v2, v3))

## Squashing the vectors

In [None]:
from sklearn.decomposition import PCA
# PCA model that keeps two components, so each projected vector can be drawn
# as an (x, y) point.
pca = PCA(n_components=2)

In [None]:
# One row per text vector, in the order v1, v2, v3.
X = np.stack((v1, v2, v3))

In [None]:
# Fit the PCA model on X and project X onto it in one step; pca was built
# with n_components=2, so each row of X comes back as a 2-D point.
X_squashed = pca.fit_transform(X)
# Bare last expression so the notebook displays the projected coordinates.
X_squashed

In [None]:
# Coordinates of each text vector along the two principal components.
xs = [v[0] for v in X_squashed]
ys = [v[1] for v in X_squashed]
labels = ["v1", "v2", "v3"]  # same order as the rows of X

fig, ax = plt.subplots()
ax.scatter(xs, ys)
# Tag every point with the name of the vector it came from, nudged 5 points
# up and to the right so the text does not sit on top of the marker.
for label, (x, y) in zip(labels, X_squashed):
    ax.annotate(label, (x, y), textcoords="offset points", xytext=(5, 5))
# Label the figure so it can be read on its own.
ax.set_xlabel("First principal component")
ax.set_ylabel("Second principal component")
ax.set_title("Text vectors projected onto two principal components");