# Word2Vec

For calculating the distance between sentences and between keywords, I need a vector representation of the words. I will train a Word2Vec model. SkipGram predicts the context from the word (context a word can occur in depends on its meaning), while CBOW predicts the word from the context (where a word can show up depends on the meaning of the context). CBOW similarity should give you synonyms: if two words occur in the same context (they are synonyms for instance), they have very similar vectors. 

Preprocessing: mix of the feature engineering and topic modeling tokenizer.
* lowercase
* glue words together (replace '-' by nothing)
* take out digits
* print in file: one sentence per line.

Parameters:
* avoid sparse matrix: ignore very infrequent words
* window size = 4 has been shown to be optimal for English
* Dimension between 300-600. Not that relevant.

In [1]:
from gensim.models import Word2Vec
from gensim.models import phrases
from pymongo import MongoClient
import pandas as pd
import en_core_web_sm
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from sklearn.externals import joblib
import spacy
import re
import string
from multiprocessing import Pool
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
standard_stopwords = set(list(ENGLISH_STOP_WORDS)+list(stopwords.words('english')))

In [3]:
nlp = spacy.load('en_core_web_sm')
authors = joblib.load('authors')

In [4]:
def build_bigram_model(corpus):
    corpus = is_english(nlp(corpus))
    bigram = phrases.Phrases(corpus, min_count=20, threshold=17)
    bigram_phraser = phrases.Phraser(bigram)
    return bigram_phraser

def is_english(doc):
    """determines whether a word is English/author"""
    english_words = []
    for sent in doc.sents:
        sentence = []
        for w in str(sent).split():
            w = str(w).lower()
            if w in authors:
                sentence.append(w)
            else: 
                try:
                    w.encode(encoding='utf-8').decode('ascii')
                        # if re.sub('-', '', word).isalpha():
                            # english_words.append(re.sub('[%s]' % re.escape(string.punctuation), '', word))
                    word = re.sub('[%s]' % re.escape(string.punctuation), '', w)
                    if word.isalpha():
                        sentence.append(word)
                except UnicodeDecodeError:
                    pass
        english_words.append(sentence)
    return english_words
        
def prepare_word2vec(corpus):
    transf = nlp(corpus)
    sentences = is_english(transf)
    return sentences

In [2]:
client = MongoClient()
db = client.lingbuzz
papers = db.get_collection('papers')

In [7]:
corpus = str()
for doc in papers.find({'paper':{'$exists': True}}):
    corpus+=doc['paper']

In [None]:
bigrams = build_bigram_model(corpus)

In [None]:
len(bigrams.phrasegrams)

In [None]:
bigrams.save('bigrams_model')

In [8]:
bigrams = joblib.load('bigrams_model')

In [9]:
# sentences is list of list of words. 
model = Word2Vec(bigrams[is_english(nlp(corpus))], size=300, window=4, workers=20)
# good size is square root of length of vocabulary. Playing around with parameters does not change a thing.

In [None]:
model.wv.most_similar('focus')

In [11]:
model.save('word2vec')

In [None]:
text_file = open("vocabulary.txt", "wb")
for word in model.wv.vocab.keys():
    text_file.write(word.encode('utf-8') + '\n'.encode('utf-8'))
text_file.close()

The results of my word2vec model are bad. Probably because I do not have much data. Let's try the pretrained fastText model. It is trained based on character n-grams instead of on words, and can deal with OOV words and typos and other noisy things. With other pretrained models, OOV words have to be ignored, or assigned a random vector. Neither is a good option.

In [None]:
from gensim.models.wrappers import FastText

In [None]:
fasttext_model = FastText.load_fasttext_format('../fastText/wiki.en.bin')

In [None]:
fasttext_model.wv.similarity('fronting', 'movement')

WAAAW! This is amazing. It can even deal with words in other languages! And it does so much way more better than my stupid word2vec model. 'Fronting' is an instance of 'movement', and acording to word2vec, the meanings are kind of orthogonal.

In [None]:
model.wv.similarity('fronting', 'movement')

In [None]:
fasttext_model.wv.most_similar('it')

In [None]:
model.wv.most_similar('it')

## Sentence vectors

To calculate sentence vectors: I made a file with all the vectors for my vocabulary, calculated by fastText. I will read it into a dictionary.  
Vector of sentence = weighted mean of its word vectors (https://openreview.net/pdf?id=SyK00v5xx).  
To calculate word or sentence similarity: relative cosine distance (https://ufal.mff.cuni.cz/pbml/105/art-leeuwenberg-et-al.pdf).  
Add sentence id to the keyword database, to know which words are in which sentence.

In [3]:
voc_vectors = {}
with open('voc_vectors.txt', 'rb') as f:
    content = f.readlines()

In [4]:
for line in content:
    line = line.decode("utf-8").split(" ", 1)
    voc_vectors[line[0]] = {'vector': np.fromstring(line[1].strip(), sep=' '), 'sentenceIDs' : []}

In [16]:
#debugged
def is_english_sentence(sent):
    """determines whether a word is English/author"""
    sentence = []
    for w in str(sent).split():
        w = str(w).lower()
        if w in authors:
            sentence.append(w)
        else: 
            try:
                w.encode(encoding='utf-8').decode('ascii')
                    # if re.sub('-', '', word).isalpha():
                        # english_words.append(re.sub('[%s]' % re.escape(string.punctuation), '', word))
                word = re.sub('[%s]' % re.escape(string.punctuation), '', w)
                if word.isalpha():
                    sentence.append(word)
            except UnicodeDecodeError:
                pass
    return sentence

In [5]:
#debugged
def avg_feature_vector(words, word_to_vec, id_, num_features = 300):
    """words is list of words, num_features in dimension of vector, word_to_vec is dict with word:vector
    appends sentence ids to word_to_vec so we can quickly recover which words are in which sentences
    returns average feature vector for the sentence"""
    #function to average all words vectors in a given paragraph
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in word_to_vec:
            nwords = nwords+1
            featureVec = np.add(featureVec, word_to_vec[word]['vector'])
            word_to_vec[word]['sentenceIDs'].append(id_)
    if(nwords>0):
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [4]:
sentences = db.get_collection('sentences')

In [7]:
sentences.count()

631676

In order to save time, I will have to manually parallelize. First, put everything in df. Then partition the df and run each sentence through the bigrammizer, this is that part that takes the longest. 
Then, calculate the average sentence vectors and the sentence similarities.

In [11]:
# the formula used here is wrong. According to the paper it should be the sum of the similarities 
# between the top n most similar words...

def create_df_rel_cs(vectors, ids):
    """calculates relative cosine distance between two sentences and returns df with sentenceids and their distance"""
    cos_sim = cosine_similarity(np.asarray(vectors))
    # sum_cs = np.sum(cos_sim, 1)[0]
    # rel_cs = cos_sim / sum_cs
    df = pd.DataFrame(cos_sim, index = ids, columns = ids)
    return df

In [3]:
keywords = db.get_collection('keywords')

In [33]:
# udpate word vectors dict with sentence IDs and calculate sentence vectors
vectors = []
id_s = []
for sent in sentences.find():
    id_s.append(sent['_id'])
    sentence = bigrams[is_english_sentence(sent['sentence'].split())]
    vectors.append(avg_feature_vector(sentence, voc_vectors, sent['_id']))

df = create_df_rel_cs(vectors, id_s)

# store sentence ids in keyword db
for entry in voc_vectors:
    keywords.insert_one({'word': entry, 'sentenceIDs': list(set(voc_vectors[entry]['sentenceIDs']))})

# save distances as csv to figure out which the similarity threshold is we are going to use
df.to_csv('sentence_distances.csv')
joblib.dump(voc_vectors, 'voc_vectors_dict')

MemoryError: 

In [35]:
joblib.dump(vectors, 'sentence_vectors')
joblib.dump(id_s, 'sentence_ids')

['sentence_ids']

In [36]:
joblib.dump(voc_vectors, 'voc_vectors_dict')

['voc_vectors_dict']

Store sentence IDs in word database:

In [43]:
for entry in voc_vectors:
    keywords.insert_one({'word': entry, 'sentenceIDs': list(set(voc_vectors[entry]['sentenceIDs']))})

In [9]:
vectors = joblib.load('sentence_vectors')
id_s = joblib.load('sentence_ids')

In [14]:
np.asarray(vectors).shape

(631709, 300)

In [23]:
from bson import ObjectId
keywords = db.get_collection('keywords')
for id_ in keywords.find()[0]['similar_words']:
    print(repr(ObjectId(id_)))
    for doc in keywords.find({'_id': repr(ObjectId(id_))}):
        print (doc['word'])

ObjectId('59aea9b7b18b14a4ed17413f')
ObjectId('59aea9b9b18b14a4ed1742a3')
ObjectId('59aea9bab18b14a4ed174519')
ObjectId('59aea9bbb18b14a4ed17477e')
ObjectId('59aea9bdb18b14a4ed174ee7')
ObjectId('59aea9beb18b14a4ed175747')
ObjectId('59aea9beb18b14a4ed175bba')
ObjectId('59aea9c5b18b14a4ed17a35c')
ObjectId('59aea9c5b18b14a4ed17abf5')
ObjectId('59aea9c6b18b14a4ed17b1f3')
ObjectId('59aea9c6b18b14a4ed17b5bd')
ObjectId('59aea9c6b18b14a4ed17b7d6')


In [6]:
keywords.find({'informative_sents':{'$exists': True}}).count()

2

In [7]:
keywords.find()[1]

{'_id': ObjectId('59aea9b2b18b14a4ed17403a'),
 'frequency': 453,
 'informative_sents': [ObjectId('59a86b03b18b1408616c6852'),
  ObjectId('59aa933bb18b14085d6d0e39'),
  ObjectId('59aaa9e8b18b1408626d42a9'),
  ObjectId('59a867d6b18b14085e6c6aa7'),
  ObjectId('59a85edfb18b14085b6c5cfa'),
  ObjectId('59a89165b18b14085f6c91d1'),
  ObjectId('59a86b39b18b1408616c68d1'),
  ObjectId('59aa9135b18b14085d6d0dd8'),
  ObjectId('59a9ed44b18b14085b6d2f1a'),
  ObjectId('59a89acfb18b14085b6c6e91'),
  ObjectId('59ab32e2b18b14085c6cef93'),
  ObjectId('59abb684b18b1408636cf3e9'),
  ObjectId('59aa333bb18b14085d6cf681'),
  ObjectId('59a8a31bb18b14085f6ca6c9'),
  ObjectId('59aa93fcb18b14085d6d0e5d'),
  ObjectId('59aa8e26b18b14085d6d0d46'),
  ObjectId('59a9b21ab18b14085b6d0b0a'),
  ObjectId('59aa7c17b18b1408626d3a6c'),
  ObjectId('59a86100b18b14085b6c5e03'),
  ObjectId('59ab84c8b18b1408616d5b1d'),
  ObjectId('59a864eab18b14085f6c648e'),
  ObjectId('59a8711ab18b14085d6c7029'),
  ObjectId('59a87438b18b1408616c75