# Sentence modeling
- One method for representing sentences as vectors (Mu et al 2017)
- Compute the vector representation of each embedded word, then take a weighted average of them using PCA
    - If there are **n** words in a sentence, select the **N** words with the highest explained variance (n>N)
    - Most of the "energy" (around 80%) can be contained using only 4 words (N=4) in the original paper (Mu et al 2017)

In [13]:
import os
import re
from multiprocessing import Pool

import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from scipy import spatial
from sklearn.decomposition import PCA

In [2]:
# Load Hamlet from NLTK's Gutenberg corpus as a list of sentences, each sentence being a list of tokens
sentences = list(gutenberg.sents('shakespeare-hamlet.txt'))   # import the corpus and convert into a list

In [3]:
# Sanity-check the loaded corpus: its container type and the number of sentences
print('Type of corpus: ', type(sentences))
print('Length of corpus: ', len(sentences))

Type of corpus:  <class 'list'>
Length of corpus:  3106


In [5]:
# Peek at a few sentences to see what the tokenized corpus looks like
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


In [4]:
# Normalize the corpus in place: lowercase every token and drop tokens that do
# not start with an ASCII letter (e.g. punctuation and numbers).
# The pattern is compiled once instead of being looked up for every token.
starts_with_letter = re.compile(r'^[a-zA-Z]+')
for i, sentence in enumerate(sentences):
    sentences[i] = [word.lower() for word in sentence if starts_with_letter.match(word)]

In [6]:
# Re-inspect the same sentences after lowercasing and token filtering
print(sentences[0])    # title, author, and year
print(sentences[1])
print(sentences[10])

['the', 'tragedie', 'of', 'hamlet', 'by', 'william', 'shakespeare']
['actus', 'primus']
['fran']


In [9]:
# Minimum number of words a sentence must contain to be kept in the corpus
# (sentences with fewer words are meant to be dropped by the length filter)
threshold = 5

In [10]:
# Mark every sentence with fewer than `threshold` words as None so that it can
# be filtered out afterwards. The comparison uses `threshold` rather than a
# repeated literal 5, so changing the threshold actually changes the filter.
for i, sentence in enumerate(sentences):
    if len(sentence) < threshold:
        sentences[i] = None

In [11]:
# Drop the entries that were marked as None, keeping the remaining sentences in order
sentences = [kept for kept in sentences if kept is not None]

In [12]:
# Number of sentences remaining in the corpus at this point
print('Length of corpus: ', len(sentences))

Length of corpus:  1442


In [7]:
# Train a skip-gram (sg = 1) Word2Vec model with 100-dimensional vectors on the corpus.
# min_count = 1 keeps every word in the vocabulary so each token can be looked up later.
# The worker count is taken from os.cpu_count(); this avoids creating a multiprocessing
# Pool (worker processes that were never closed) just to read its private `_processes`.
# NOTE(review): `size` and `iter` are the pre-4.0 gensim parameter names (renamed to
# `vector_size` and `epochs` in gensim 4) - confirm against the installed gensim version.
model = Word2Vec(sentences = sentences, size = 100, sg = 1, window = 3, min_count = 1, iter = 10, workers = os.cpu_count() or 1)

In [8]:
# L2-normalize the word vectors; with replace = True only the normalized vectors are
# kept (per gensim's init_sims documentation), so the model cannot be trained further
model.init_sims(replace = True)

In [19]:
# Convert each word into its vector representation: every token list in `sentences`
# is replaced in place by the list of its word vectors.
# Vectors are read through `model.wv` (the model's KeyedVectors) because indexing the
# Word2Vec model directly is deprecated in gensim.
for i, sentence in enumerate(sentences):
    sentences[i] = [model.wv[word] for word in sentence]

In [None]:
# Inspect the first sentence after conversion: a list with one word vector per token
print(sentences[0])    # vector representation of first sentence

In [78]:
def sent_PCA(sentence, n = 2):
    """Compute a PCA-weighted vector representation of a sentence.

    PCA is fitted on the transposed word-vector matrix (one row per embedding
    dimension, one column per word). The explained-variance ratios of the
    fitted components are used as weights, and the weighted word vectors are
    summed into a single vector.

    NOTE(review): the explained-variance ratios are indexed by principal
    component, not by word, and come sorted in decreasing order. The i-th
    ratio is therefore applied to the i-th word of the sentence, so the
    result always combines the FIRST `n` words rather than the `n` most
    informative ones. That behavior is preserved here; confirm whether it
    is what was intended.

    Parameters
    ----------
    sentence : sequence of 1-D arrays
        Word vectors of one sentence, all of the same dimensionality.
    n : int, optional
        Number of words (and principal components) used for the weighted
        sum. Clamped to what PCA can extract, i.e. to the number of words
        and to the embedding dimensionality.

    Returns
    -------
    numpy.ndarray
        Weighted sum of the first `n` word vectors.

    Raises
    ------
    ValueError
        If `sentence` is empty or is not a sequence of equal-length vectors.
    """
    word_vectors = np.asarray(sentence)
    if word_vectors.ndim != 2 or word_vectors.shape[0] == 0:
        raise ValueError('sentence must be a non-empty sequence of word vectors')

    # PCA can extract at most min(n_samples, n_features) components, so a
    # sentence shorter than `n` uses as many components as it has words.
    n_components = min(n, *word_vectors.shape)
    pca = PCA(n_components = n_components)
    pca.fit(word_vectors.transpose())
    weights = np.asarray(pca.explained_variance_ratio_)

    # The ratios are already ordered from largest to smallest, so repeatedly
    # taking the argmax and zeroing it visits indices 0, 1, ..., n - 1 in order;
    # the weighted sum can be formed directly without mutating the ratios.
    return np.sum(weights[:, np.newaxis] * word_vectors[:n_components], axis = 0)

In [80]:
# Accumulator for the vector representation of every sentence
sent_vectorized = []

In [81]:
# Build the vector representation of every sentence, in corpus order
sent_vectorized.extend(sent_PCA(sentence) for sentence in sentences)

In [88]:
# Sanity check: recomputing the first sentence's vector gives the same values,
# element by element, as the one stored in sent_vectorized
list(sent_PCA(sentences[0])) == list(sent_vectorized[0])

True

In [83]:
# cosine similarity between two vectors (used below on sentence vectors)
def cosine_similarity(v1, v2):
    """Return the cosine similarity of v1 and v2, i.e. 1 minus their cosine distance."""
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [86]:
# similarity between the 11th and 101st sentences in the corpus
print(cosine_similarity(sent_vectorized[10], sent_vectorized[100]))

0.980275605104
