# <span style='color:blue'><center>Word Embeddings</center></span>

Represent text based on the context around which each word appears.

*Example taken from edx: 'DSE220x : Machine Learning Fundamentals'.*

In [2]:
import numpy as np
import pickle
import nltk
nltk.download('brown')
nltk.download('stopwords')
from nltk.corpus import brown, stopwords #The corpus consists of 500 samples of text drawn from a wide range of sources.
from scipy.cluster.vq import kmeans2
from sklearn.decomposition import PCA

[nltk_data] Downloading package brown to /home/cristian/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cristian/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
for i in range(50): #There are over a million words, here, check the first 50 of them
    print (brown.words()[i],)

The
Fulton
County
Grand
Jury
said
Friday
an
investigation
of
Atlanta's
recent
primary
election
produced
``
no
evidence
''
that
any
irregularities
took
place
.
The
jury
further
said
in
term-end
presentments
that
the
City
Executive
Committee
,
which
had
over-all
charge
of
the
election
,
``
deserves
the
praise


In [4]:
my_stopwords = set(stopwords.words('english')) #delete stop-words
word_stream = [str(w).lower() for w in brown.words() if w.lower() not in my_stopwords] # make all letters lowercase
my_word_stream = [w for w in word_stream if (len(w) > 1 and w.isalnum())]

In [9]:
len(list(brown.words()))

1161192

In [10]:
len(my_word_stream)

513240

In [11]:
my_word_stream[:20]

['fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'investigation',
 'recent',
 'primary',
 'election',
 'produced',
 'evidence',
 'irregularities',
 'took',
 'place',
 'jury',
 'said',
 'presentments',
 'city',
 'executive']

In [12]:
N = len(my_word_stream) #Co-occurrence probabilities of each word
words = []
totals = {}
for i in range(1, N-1):
    w = my_word_stream[i]
    if w not in words:
        words.append(w)
        totals[w] = 0
    totals[w] = totals[w] + 1

In [14]:
totals

{'county': 155,
 'grand': 48,
 'jury': 67,
 'said': 1961,
 'friday': 60,
 'investigation': 51,
 'recent': 179,
 'primary': 96,
 'election': 77,
 'produced': 90,
 'evidence': 204,
 'irregularities': 8,
 'took': 426,
 'place': 570,
 'presentments': 1,
 'city': 393,
 'executive': 55,
 'committee': 168,
 'charge': 122,
 'deserves': 16,
 'praise': 17,
 'thanks': 37,
 'atlanta': 35,
 'manner': 124,
 'conducted': 55,
 'term': 79,
 'charged': 57,
 'fulton': 16,
 'superior': 46,
 'court': 230,
 'judge': 77,
 'durwood': 1,
 'pye': 1,
 'investigate': 11,
 'reports': 84,
 'possible': 374,
 'ivan': 4,
 'allen': 20,
 'relative': 46,
 'handful': 13,
 'received': 163,
 'considering': 47,
 'widespread': 30,
 'interest': 330,
 'number': 472,
 'voters': 20,
 'size': 138,
 'find': 400,
 'many': 1030,
 'registration': 23,
 'laws': 88,
 'outmoded': 4,
 'inadequate': 32,
 'often': 369,
 'ambiguous': 22,
 'recommended': 46,
 'legislators': 20,
 'act': 283,
 'studied': 79,
 'revised': 16,
 'end': 409,
 'modern

In [15]:
vocab_words = [w for w in words if totals[w] > 19]
context_words = [w for w in words if totals[w] > 99]

In [16]:
print('Vocabulary words: ', len(vocab_words), ' Context words: ', len(context_words))

Vocabulary words:  4720  Context words:  918


### Count the co-occurrence of words:

Define an analysis window, for example window_size = 2. 

- Let $w_0$ be any word in vocab_words and $w$ any word in context_words.
- Each time $w_0$ occurs in the corpus, look at the window of window_size words before and after it. If $w$ appears in this window, we say it appears in the context of (this particular occurrence of) $w_0$.
- Define counts[w0][w] as the total number of times $w$ occurs in the context of $w_0$.


In [17]:
def get_counts(window_size=2):
    counts = {}
    for w0 in vocab_words:
        counts[w0] = {}
    for i in range(window_size, N-window_size):
        w0 = my_word_stream[i]
        if w0 in vocab_words:
            for j in (list(range(-window_size,0)) + list(range(1,window_size+1))):
                w = my_word_stream[i+j]
                if w in context_words:
                    if w not in counts[w0].keys():
                        counts[w0][w] = 1
                    else:
                        counts[w0][w] = counts[w0][w] + 1
    return counts

In [18]:
def get_co_occurrence_dictionary(counts):
    probs = {}
    for w0 in counts.keys():
        sum = 0
        for w in counts[w0].keys():
            sum = sum + counts[w0][w]
        if sum > 0:
            probs[w0] = {}
            for w in counts[w0].keys():
                probs[w0][w] = float(counts[w0][w])/float(sum)
    return probs

In [19]:
def get_context_word_distribution(counts):
    counts_context = {}
    sum_context = 0
    context_frequency = {}
    for w in context_words:
        counts_context[w] = 0
    for w0 in counts.keys():
        for w in counts[w0].keys():
            counts_context[w] = counts_context[w] + counts[w0][w]
            sum_context = sum_context + counts[w0][w]
    for w in context_words:
        context_frequency[w] = float(counts_context[w])/float(sum_context)
    return context_frequency

Compute the pointwise mutual information matrix:

`PMI[i,j] = MAX(0, log probs[ith vocab word][jth context word] - log context_frequency[jth context word])`

The embedding of any word can then be taken as the corresponding row of this matrix. However, to reduce the dimension, we will apply PCA.

In [20]:
print ("Computing counts and distributions")
counts = get_counts(2)
probs = get_co_occurrence_dictionary(counts)
context_frequency = get_context_word_distribution(counts)
#
print ("Computing pointwise mutual information")
n_vocab = len(vocab_words)
n_context = len(context_words)
pmi = np.zeros((n_vocab, n_context))
for i in range(0, n_vocab):
    w0 = vocab_words[i]
    for w in probs[w0].keys():
        j = context_words.index(w)
        pmi[i,j] = max(0.0, np.log(probs[w0][w]) - np.log(context_frequency[w]))

Computing counts and distributions
Computing pointwise mutual information


In [21]:
pca = PCA(n_components=100)
vecs = pca.fit_transform(pmi)
for i in range(0,n_vocab):
    vecs[i] = vecs[i]/np.linalg.norm(vecs[i])

In [22]:
fd = open("embedding.pickle", "wb")
pickle.dump(vocab_words, fd)
pickle.dump(context_words, fd)
pickle.dump(vecs, fd)
fd.close()

### Now compute the nearest neighbor of any word in the vocabulary:

In [23]:
def word_NN(w):
    if not(w in vocab_words):
        print ("Unknown word")
        return
    v = vecs[vocab_words.index(w)]
    neighbor = 0
    curr_dist = np.linalg.norm(v - vecs[0])
    for i in range(1, n_vocab):
        dist = np.linalg.norm(v - vecs[i])
        if (dist < curr_dist) and (dist > 0.0):
            neighbor = i
            curr_dist = dist
    return vocab_words[neighbor]

In [36]:
word_NN('knowledge')

'shoulders'

In [41]:
word_NN('pulmonary')

'artery'

In [46]:
word_NN('ground')

'water'

In [49]:
word_NN('sun')

'sky'

In [51]:
word_NN('two')

'three'