In [2]:
!pip install wget
!pip install nltk
!pip install numpy
!pip install sklearn



In [1]:
import os
import gzip
import wget
import collections

import numpy as np

from nltk.corpus import gutenberg
from sklearn.decomposition import TruncatedSVD

# Show the file ids exposed by NLTK's gutenberg corpus reader.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [2]:
# Remote location of the gzipped embedding file; the local file name is the
# last path segment of the URL.
url = "http://rgai1.inf.u-szeged.hu/~berend/compsem/en-cbow.vec.gz"
embedding_file_name = url.rsplit('/', 1)[-1]

# Download only when no local copy exists yet.
if not os.path.exists(embedding_file_name):
    filename = wget.download(url)
    print(filename, " got downloaded")

In [3]:
# Load the word embeddings from the gzipped text file.
# The first line is a header whose second field is read as the dimensionality;
# every following line is "<word> <v_1> ... <v_d>".
dimensions = -1
w2i = {}   # word -> row index in `embeddings`
i2w = {}   # row index in `embeddings` -> word
embeddings = []
j = 0      # next free row index; lags behind the line number when vectors are skipped
# The context manager closes the file handle, and the explicit encoding keeps
# decoding independent of the platform's locale default.
with gzip.open(embedding_file_name, 'rt', encoding='utf-8') as embedding_file:
    for i, l in enumerate(embedding_file):
        parts = l.strip().split()
        if i == 0:
            dimensions = int(parts[1])
            continue
        vector = list(map(float, parts[1:]))
        # Keep only words whose vector has non-zero length.
        if np.linalg.norm(vector) > 0:
            i2w[j] = parts[0]
            w2i[parts[0]] = j
            embeddings.append(vector)
            j += 1
        # Progress indicator (the header line never reaches this point).
        if i % 25000 == 0:
            print(i)
embeddings = np.array(embeddings)

25000
50000
75000
100000


In [4]:
def normalize_matrix(M):
    """Scale every row of the 2-D array ``M`` to (approximately) unit Euclidean length.

    A small constant (1e-9) is added to each row norm so that all-zero rows
    do not cause a division by zero; such rows stay all-zero.
    """
    row_norms = np.linalg.norm(M, axis=1, keepdims=True) + 1e-9
    return M / row_norms

# Exercise

Modify the code below to calculate the empirical unigram frequencies from our corpus, and derive from them the appropriate word weights according to the SIF algorithm (using a=1e-4).

In [5]:
#obtain sentences and words of the desired corpus first
corpus_file='austen-emma.txt'
# Tokenized sentences of the selected Gutenberg text (each sentence is a sequence of tokens).
sentences=gutenberg.sents(corpus_file)

# Baseline distribution: every word of the embedding vocabulary gets the same probability.
uniform_unigram_freq = {w: 1/len(w2i) for w in w2i}

# Raw token counts of the corpus; not used further in this cell as it stands.
words = gutenberg.words(corpus_file)
freq = collections.Counter(words)
### MODIFY CODE BELOW ####
# Exercise placeholder: this only aliases the uniform distribution (same dict object),
# so the "empirical" values printed below are identical to the uniform ones.
empirical_unigram_freq = uniform_unigram_freq

a=1e-4
# Exercise placeholder: every word gets weight 1; neither `a` nor `p` is used yet.
word_weights = {w:1 for w,p in empirical_unigram_freq.items()}

# Spot-check the three mappings on two example words.
print(uniform_unigram_freq['walk'], uniform_unigram_freq['the'])
print(empirical_unigram_freq['walk'], empirical_unigram_freq['the'])
print(word_weights['walk'], word_weights['the'])

1.0543296045209654e-05 1.0543296045209654e-05
1.0543296045209654e-05 1.0543296045209654e-05
1 1


In [6]:
def select_sentence_i(sents, i):
    """Return the ``i``-th tokenized sentence of ``sents`` as one space-separated string."""
    tokens = sents[i]
    return ' '.join(tokens)

In [7]:
# Show one example sentence (index 700) as plain text.
print(select_sentence_i(sentences, 700))

" You have made her too tall , Emma ," said Mr . Knightley .


In [8]:
def create_sentence_vector(tokens, unigram_weights, word_index=None, vectors=None):
    """Build a sentence vector as the weighted average of its word embeddings.

    Parameters
    ----------
    tokens : iterable of str
        The tokens of the sentence. Tokens missing from the vocabulary are ignored.
    unigram_weights : mapping str -> float
        Weight of each word. It must contain every in-vocabulary token,
        otherwise a ``KeyError`` is raised.
    word_index : mapping str -> int, optional
        Maps a word to its row in ``vectors``. Defaults to the notebook-level ``w2i``.
    vectors : 2-D array-like, optional
        Embedding matrix with one row per word. Defaults to the notebook-level
        ``embeddings``.

    Returns
    -------
    numpy.ndarray
        The sum of ``weight * embedding`` over the in-vocabulary tokens, divided
        by the number of such tokens. It is all zeros when no token is in the
        vocabulary.
    """
    if word_index is None:
        word_index = w2i
    if vectors is None:
        vectors = embeddings
    vectors = np.asarray(vectors)

    # Take the dimensionality from the embedding matrix instead of hard-coding 64,
    # so the function works with embeddings of any width.
    weighted_sum = np.zeros(vectors.shape[1])
    normalizer = 1e-15  # avoid dividing by 0
    for t in tokens:
        if t in word_index:
            normalizer += 1
            weighted_sum += unigram_weights[t] * vectors[word_index[t]]
    return weighted_sum / normalizer
    

In [9]:
# One weighted-average embedding per corpus sentence, stacked row-wise.
sentence_vectors = np.array([
    create_sentence_vector(sentence_tokens, word_weights)
    for sentence_tokens in sentences
])

In [10]:
def get_most_similar_vector(M, v, k=5):
    """Score every row of ``M`` against ``v`` and return the ``k`` best row indices.

    The rows of ``M`` are length-normalized before the dot product; ``v`` is
    used as given. Returns ``(similarities, top_indices)``, where
    ``similarities`` holds one score per row and ``top_indices`` are the
    indices of the ``k`` highest scores in ascending score order (best last).
    """
    similarities = normalize_matrix(M) @ v
    ranking = np.argsort(similarities)
    return similarities, ranking[-k:]

def get_most_similar_sentence(sentence_embeddings, i, k=5):
    """Find the ``k`` rows of ``sentence_embeddings`` most similar to row ``i``.

    Delegates to ``get_most_similar_vector`` with row ``i`` as the query and
    returns its result unchanged.
    """
    query = sentence_embeddings[i]
    return get_most_similar_vector(sentence_embeddings, query, k)

In [11]:
# For a few selected sentences, print the sentence itself followed by its
# 3 highest-scoring sentences, best match first.
for test_sentence in [55, 627, 880, 2020]:
    query_text = select_sentence_i(sentences, test_sentence)
    print(query_text, "\n", 50 * '-')
    similarities, argsort = get_most_similar_sentence(sentence_vectors, test_sentence, 3)
    # The indices come back in ascending score order, so walk them backwards.
    for rank, top_id in enumerate(argsort[::-1], start=1):
        print("Sentence with top-{} similarity: {}".format(rank, select_sentence_i(sentences, top_id)))
    print("=============*****=============")

I only doubt whether he will ever take us anywhere else . 
 --------------------------------------------------
Sentence with top-1 similarity: I only doubt whether he will ever take us anywhere else .
Sentence with top-2 similarity: " Certainly , if you wish it ;-- but you are not going to walk to Highbury alone ?"
Sentence with top-3 similarity: And when you get there , you must tell him at what time you would have him come for you again ; and you had better name an early hour .
Pray , pray attempt it . 
 --------------------------------------------------
Sentence with top-1 similarity: Pray , pray attempt it .
Sentence with top-2 similarity: I long to make apologies , excuses , to urge something for myself .
Sentence with top-3 similarity: " I was just going to tell you of our agreeable surprize in seeing him arrive this morning .
How cheerful , how animated , how suspicious , how busy their imaginations all are !" 
 --------------------------------------------------
Sentence with to

Sentence vectors share a fair amount of directions purely due to function words and stop words.
We can discard this component of the representations by removing the projection of the sentence vectors onto the principal singular vector $u$.

In [12]:
# Fit a rank-1 truncated SVD on the sentence vectors; the single component is
# the principal singular vector, stored in `u` as a column.
svd = TruncatedSVD(n_components=1, random_state=0).fit(sentence_vectors)
u = np.array(svd.components_.T)
print(u.shape, sentence_vectors.shape)

(64, 1) (7752, 64)


The principal vector $u \in \mathbb{R}^d$ can itself be regarded as a word vector. We can query the words whose vector representations are most similar to it and check whether the principal singular vector indeed encodes mostly grammatical information.

In [13]:
# Score every word embedding against the principal singular vector and list
# the words belonging to the top-scoring rows.
principal_direction = svd.components_[0]
word_similarities, ordering = get_most_similar_vector(embeddings, principal_direction)
closest_words = [i2w[word_id] for word_id in ordering]
print("The words with highest similarity to the principal singular vector are: ", closest_words)

The words with highest similarity to the principal singular vector are:  ['actually', 'so', 'if', 'that', 'but']


# Exercise

Modify the code that looks for the most similar sentences so that it also applies the correction term of the SIF algorithm, i.e. the one involving the projection of the sentence vectors onto the principal singular vector $u$.

In [14]:
# Same nearest-neighbour listing as the earlier similarity cell, but meant to run on
# the corrected sentence vectors once the exercise line below has been changed.
for test_sentence in [55, 627, 880, 2020]:
    print(select_sentence_i(sentences, test_sentence), "\n", ''.join(50*['-']))
    # Exercise placeholder: the uncorrected vectors are reused as-is, so the results
    # match the uncorrected run. The value does not depend on the loop variable.
    corrected_sentence_vectors = sentence_vectors  ### CHANGE THIS LINE ###
    similarities, argsort = get_most_similar_sentence(corrected_sentence_vectors, test_sentence, 3)
    # The indices come back in ascending score order; reverse to print the best match first.
    for k, top_id in enumerate(argsort[::-1]):
        print("Sentence with top-{} similarity: {}".format(k+1, select_sentence_i(sentences, top_id)))
    print("=============*****=============")

I only doubt whether he will ever take us anywhere else . 
 --------------------------------------------------
Sentence with top-1 similarity: I only doubt whether he will ever take us anywhere else .
Sentence with top-2 similarity: " Certainly , if you wish it ;-- but you are not going to walk to Highbury alone ?"
Sentence with top-3 similarity: And when you get there , you must tell him at what time you would have him come for you again ; and you had better name an early hour .
Pray , pray attempt it . 
 --------------------------------------------------
Sentence with top-1 similarity: Pray , pray attempt it .
Sentence with top-2 similarity: I long to make apologies , excuses , to urge something for myself .
Sentence with top-3 similarity: " I was just going to tell you of our agreeable surprize in seeing him arrive this morning .
How cheerful , how animated , how suspicious , how busy their imaginations all are !" 
 --------------------------------------------------
Sentence with to