In [1]:
import time
from nltk import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
import itertools
import numpy as np
import scipy as sp
import scipy.sparse
import scipy.sparse.linalg
import sklearn.metrics.pairwise

In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

compute unigram frequencies

In [4]:
s = time.time()
fdist = Counter()
for rec in open ('/data/text8', 'r'):
    rec = rec.strip()
    fdist.update(word_tokenize(rec))
print(time.time()-s)
    

69.63321876525879


Construct vocabulary: remove stopwords and unigrams with frequency less than UNIGRAM_LB. You should aim for around 5000 or fewer words in the vocabulary.

In [5]:
UNIGRAM_LB = 300
sw         = stopwords.words("english")
vocab      = sorted([v for v in fdist.keys() if fdist[v] > UNIGRAM_LB and len(v) >= 3 and v not in sw])
vocab_dict = dict(zip(vocab, range(len(vocab))))
print(len(fdist))
print(len(vocab))

253849
5044


prune text by removed words outside of vocabulary

In [6]:
partial_word = ''
pruned_text  = []
proc = open('pruned-text8.txt', 'w')
with open('/data/text8', 'r') as f:
    for rec in f:
        rec = rec.strip()
        pruned_text_list = [w for w in rec.split() if w in vocab_dict]
        rec = ' '.join(pruned_text_list)
        pruned_text.append(pruned_text_list)
        proc.write(rec + '\n')
pruned_text = list(itertools.chain.from_iterable(pruned_text))

compute cooccurence matrix

In [7]:
CONTEXT_SIZE = 5
#co           = np.zeros((len(vocab), len(vocab)))
s            = time.time()
co_s         = sp.sparse.csr_matrix((len(vocab), len(vocab))) # empty matrix

# for sparse matrix construction
d    = []
rows = []
cols = []

for h_idx in range(len(pruned_text)):
    
    l_idx = max(0, h_idx - CONTEXT_SIZE)
    r_idx = min(len(pruned_text)-1, h_idx + CONTEXT_SIZE)

    for l in range(l_idx, h_idx):
        #co[vocab_dict[pruned_text[h_idx]], vocab_dict[pruned_text[l]]] += 1
        rows.append(vocab_dict[pruned_text[h_idx]])
        cols.append(vocab_dict[pruned_text[l]])
        d.append(1)
        
    for r in range(h_idx, r_idx+1):
        #co[vocab_dict[pruned_text[h_idx]], vocab_dict[pruned_text[r]]] += 1
        rows.append(vocab_dict[pruned_text[h_idx]])
        cols.append(vocab_dict[pruned_text[r]])
        d.append(1)
    
    if h_idx % 1000000 == 0 and h_idx != 0:
        print(h_idx)
        co_s = co_s + sp.sparse.csr_matrix((d, (rows, cols)), shape=(len(vocab), len(vocab)))
        d    = []
        rows = []
        cols = []

if len(d) > 0:
    co_s = co_s + sp.sparse.csr_matrix((d, (rows, cols)), shape=(len(vocab), len(vocab)))

print(time.time()-s)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
88.88820195198059


compute k-SVD of coccurence matrix and pairwise distances between resulting embeddings

In [8]:
U, s, V = scipy.sparse.linalg.svds(co_s, k=50)
# compute pairwise distances between each embedding vector
D = sklearn.metrics.pairwise.pairwise_distances(V.T)

In [8]:
len(V.T[0])
del co_s

find nearest neighbors for some example words

In [9]:
test_words = ['republican', 'physics', 'baseball', 'chicago', 'fish', 'algebra', 'rock', 'food', 'einstein']
for t in test_words:
    print("{}: {}".format(t, [vocab[np.argsort(D[vocab_dict[t],:])[i]] for i in range(1,5)]))

republican: ['senate', 'representatives', 'legislation', 'presidential']
physics: ['chemistry', 'mathematical', 'psychology', 'biology']
baseball: ['sports', 'professional', 'teams', 'jazz']
chicago: ['colleges', 'campus', 'universities', 'texas']
fish: ['plants', 'hot', 'extreme', 'wild']
algebra: ['vector', 'theorem', 'linear', 'binary']
rock: ['musical', 'song', 'playing', 'folk']
food: ['animals', 'effects', 'products', 'variety']
einstein: ['explained', 'creative', 'experiment', 'maxwell']


Now, if you will need to recompute this by transforming the counts to the appropriate PMI values. It will also be necessary to put the resulting vectors into a form that can be read by gensim.