In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import Word2Vec

In [2]:
# `catalog` is not defined anywhere in this notebook; presumably it is the Kedro
# DataCatalog injected by the Kedro Jupyter session (the log output below comes
# from kedro.io.data_catalog) — verify before running on a plain kernel.
dataset = catalog.load('raw_dataset')  # indexed with ['text'] in the pre-processing cell
# NOTE: `vocab` is rebound to a freshly built gensim Dictionary in the
# pre-processing cell, so this loaded object is only used if that cell is skipped.
vocab = catalog.load('dictionary')

2020-04-03 10:58:06,028 - kedro.io.data_catalog - INFO - Loading data from `raw_dataset` (CSVLocalDataSet)...


## Pre-process docs for word embeddings

In [3]:
def split_by_sentence(docs):
    """Flatten documents into a single list of sentence-like fragments.

    Every document is first cut wherever a period is immediately followed by
    a line break, and each resulting piece is then cut wherever a period is
    followed by a space. The separators are dropped; fragments are returned
    in their original order across all documents.

    Args:
        docs: iterable of strings.

    Returns:
        A flat list of string fragments.
    """
    return [
        sentence
        for doc in docs
        for paragraph in doc.split('.\n')
        for sentence in paragraph.split('. ')
    ]

def lowerize(docs):
    """Return a new list with every document converted to a lowercase string.

    Each entry is passed through ``str()`` before lowercasing, so non-string
    entries (numbers, ``None``, NaN, ...) are turned into their lowercase
    textual form instead of raising.

    The input sequence is left untouched: a fresh list is built, matching the
    other pre-processing helpers in this notebook that return new lists
    rather than overwriting the caller's data.

    Args:
        docs: iterable of documents (normally strings).

    Returns:
        list[str]: lowercase version of each document, in the same order.
    """
    return [str(doc).lower() for doc in docs]

def tokenize(docs):
    """Split every document into word tokens.

    Tokens are the maximal runs matched by the regular expression ``\\w+``
    (letters, digits and underscore); everything else acts as a separator
    and is discarded.

    A new list is returned and the input is not modified, so the caller's
    original strings are not silently replaced by token lists.

    Args:
        docs: iterable of strings.

    Returns:
        list[list[str]]: one token list per input document, in order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(doc) for doc in docs]

def remove_stop_words(docs):
    """Drop English stop words from every tokenised document.

    The stop-word list comes from ``nltk.corpus.stopwords`` and is turned
    into a set once so that each membership test is O(1). Matching is an
    exact string comparison, so tokens are expected to be lowercased
    beforehand (the pipeline cell does this).

    A new list of new token lists is returned; the input is not modified.

    Args:
        docs: iterable of token lists.

    Returns:
        list[list[str]]: the documents with stop words removed.
    """
    stop_words = set(stopwords.words('english'))
    return [[w for w in doc if w not in stop_words] for doc in docs]

def remove_numbers(docs):
    """Discard tokens that consist solely of numeric characters.

    A token is dropped when ``str.isnumeric()`` is true for it; tokens that
    merely contain digits alongside other characters are kept.

    Args:
        docs: iterable of token lists.

    Returns:
        A new list of new token lists.
    """
    cleaned = []
    for doc in docs:
        kept = []
        for token in doc:
            if token.isnumeric():
                continue  # purely numeric -> skip
            kept.append(token)
        cleaned.append(kept)
    return cleaned

def remove_word_with_length(docs, length=1):
    """Drop short tokens, keeping only those strictly longer than ``length``.

    With the default ``length=1`` this removes single-character tokens (and
    empty strings).

    Args:
        docs: iterable of token lists.
        length: maximum token length to discard.

    Returns:
        A new list of new token lists.
    """
    def is_long_enough(token):
        return len(token) > length

    return [list(filter(is_long_enough, doc)) for doc in docs]

def lemmatize(docs):
    """Replace every token by its WordNet lemma.

    Uses ``WordNetLemmatizer.lemmatize`` with its default arguments on each
    token and returns a new list of new token lists.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    lemmatized = []
    for doc in docs:
        lemmatized.append(list(map(to_lemma, doc)))
    return lemmatized

def add_bigram(docs, min_bigram_count=20):
    """Append detected bigrams to each tokenised document.

    A gensim ``Phrases`` model is fitted on ``docs``; for every document the
    phrase-transformed token stream is scanned and each token containing an
    underscore (the joiner used for detected phrases) is appended after the
    document's original tokens. A single ``Phrases`` pass is applied, so only
    bigrams are produced, not trigrams.

    New token lists are returned and the caller's lists are left untouched,
    so running this step twice on the same input does not append the bigrams
    a second time.

    NOTE(review): tokens produced by the ``\\w+`` tokenizer may themselves
    contain ``'_'``; such tokens also pass the underscore check and are
    therefore re-appended as if they were bigrams.

    Args:
        docs: sequence of token lists.
        min_bigram_count: passed to ``Phrases`` as ``min_count``.

    Returns:
        list[list[str]]: each document's tokens followed by its bigrams.
    """
    bigram = Phrases(docs, min_count=min_bigram_count)
    return [
        list(doc) + [token for token in bigram[doc] if '_' in token]
        for doc in docs
    ]

def remove_vocab(docs, vocab):
    """Keep only the tokens of each document that belong to ``vocab``.

    Args:
        docs: iterable of token lists.
        vocab: iterable of allowed tokens (list, set, dict keys, ...).

    Returns:
        numpy.ndarray: a 1-D array of dtype ``object`` with one filtered
        token list per input document, in order.
    """
    # Build the lookup set once: testing membership against a list costs
    # O(len(vocab)) per token, which dominates on a multi-million-token corpus.
    allowed = frozenset(vocab)
    filtered = [[w for w in doc if w in allowed] for doc in docs]

    # np.array(filtered) is unreliable here: ragged input triggers a
    # deprecation warning / ValueError on recent NumPy, while equal-length
    # documents silently become a 2-D string array. Filling a pre-allocated
    # object array element by element always yields a 1-D array of lists.
    out = np.empty(len(filtered), dtype=object)
    for idx, doc in enumerate(filtered):
        out[idx] = doc
    return out

In [4]:
# Pre-processing parameters.
min_bigram_count = 20  # only used by the (currently disabled) bigram step below
length = 1             # tokens with len(token) <= length are dropped
no_below = 100         # passed to Dictionary.filter_extremes (minimum document count)
no_above = 0.70        # passed to Dictionary.filter_extremes (maximum document fraction)

docs = dataset['text'].values

print('\nSplitting by sentence...')
docs = split_by_sentence(docs)

print('\nLowerizing...')
docs = lowerize(docs)

print('\nTokenizing...')
docs = tokenize(docs)

#print('\nAdding bigrams...')
#docs = add_bigram(docs, min_bigram_count=min_bigram_count)

print('\nRemoving stop words...')
docs = remove_stop_words(docs)

print('\nRemoving unique numbers (not words that contain numbers)...')
docs = remove_numbers(docs)

print('\nRemoving words that contain only one character...')
docs = remove_word_with_length(docs, length=length)

print('\nLemmatizing...')
docs = lemmatize(docs)

# Build the vocabulary from the cleaned sentences and prune rare / very
# frequent tokens. This rebinds `vocab`, replacing the object loaded from the
# catalog earlier in the notebook.
vocab = Dictionary(docs)
vocab.filter_extremes(no_below=no_below, no_above=no_above)

# Pass a set rather than a list: remove_vocab tests every token for
# membership, and a list would force a linear scan of the vocabulary each time.
docs = remove_vocab(docs, set(vocab.token2id))

print('Number of sentences:', len(docs))
print('Number of unique words:', len(vocab))


Splitting by sentence...

Lowerizing...

Tokenizing...

Removing stop words...

Removing unique numbers (not words that contain numbers)...

Removing words that contain only one character...

Lemmatizing...


## Train word embeddings (gensim word2vec)

In [9]:
class MyDocuments:
    """Re-iterable wrapper around a collection of tokenised sentences.

    Every call to ``iter()`` starts a fresh pass over ``docs``, so the same
    instance can be scanned more than once.
    """

    def __init__(self, docs):
        # Underlying collection; it is iterated lazily and never copied.
        self.docs = docs

    def __iter__(self):
        yield from self.docs

In [10]:
# Wrap the pre-processed sentences in the re-iterable corpus object that is
# handed to Word2Vec in the training cell.
sentences = MyDocuments(docs)

In [11]:
# Word2Vec hyper-parameters. The keyword names (`size`, `iter`) follow the
# gensim 3.x API used when this notebook was run; NOTE(review): gensim >= 4
# renamed them to `vector_size` / `epochs` — adjust if the environment is upgraded.
size = 300       # dimensionality of the embedding vectors
window = 5       # context window size
min_count = 1    # keep every token; the vocabulary was already pruned upstream
workers = 8      # number of training threads
sg = 1           # 1 selects the skip-gram architecture
n_epochs = 50    # deliberately not named `iter`, which would shadow the builtin iter()

model = Word2Vec(sentences, size=size, window=window, min_count=min_count, workers=workers, sg=sg, iter=n_epochs)

2020-03-31 18:38:23,541 - gensim.models.word2vec - INFO - collecting all words and their counts
2020-03-31 18:38:23,544 - gensim.models.word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-31 18:38:23,598 - gensim.models.word2vec - INFO - PROGRESS: at sentence #10000, processed 152352 words, keeping 7500 word types
2020-03-31 18:38:23,622 - gensim.models.word2vec - INFO - PROGRESS: at sentence #20000, processed 301964 words, keeping 8172 word types
2020-03-31 18:38:23,646 - gensim.models.word2vec - INFO - PROGRESS: at sentence #30000, processed 459245 words, keeping 8488 word types
2020-03-31 18:38:23,668 - gensim.models.word2vec - INFO - PROGRESS: at sentence #40000, processed 595763 words, keeping 8656 word types
2020-03-31 18:38:23,693 - gensim.models.word2vec - INFO - PROGRESS: at sentence #50000, processed 744128 words, keeping 8708 word types
2020-03-31 18:38:23,720 - gensim.models.word2vec - INFO - PROGRESS: at sentence #60000, processed 90

2020-03-31 18:38:25,069 - gensim.models.word2vec - INFO - PROGRESS: at sentence #590000, processed 8861712 words, keeping 8867 word types
2020-03-31 18:38:25,094 - gensim.models.word2vec - INFO - PROGRESS: at sentence #600000, processed 9012425 words, keeping 8867 word types
2020-03-31 18:38:25,117 - gensim.models.word2vec - INFO - PROGRESS: at sentence #610000, processed 9160548 words, keeping 8867 word types
2020-03-31 18:38:25,144 - gensim.models.word2vec - INFO - PROGRESS: at sentence #620000, processed 9321236 words, keeping 8867 word types
2020-03-31 18:38:25,168 - gensim.models.word2vec - INFO - PROGRESS: at sentence #630000, processed 9479604 words, keeping 8867 word types
2020-03-31 18:38:25,193 - gensim.models.word2vec - INFO - PROGRESS: at sentence #640000, processed 9641385 words, keeping 8867 word types
2020-03-31 18:38:25,218 - gensim.models.word2vec - INFO - PROGRESS: at sentence #650000, processed 9793223 words, keeping 8867 word types
2020-03-31 18:38:25,241 - gensim.m

2020-03-31 18:38:51,360 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:38:51,361 - gensim.models.base_any2vec - INFO - EPOCH - 1 : training on 11991485 raw words (11278610 effective words) took 24.3s, 463825 effective words/s
2020-03-31 18:38:52,392 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 3.44% examples, 386262 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:38:53,451 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 7.82% examples, 424739 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:38:54,476 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 12.25% examples, 438922 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:38:55,556 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 16.70% examples, 444503 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:38:56,565 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 20.81% examples, 450156 words/s, in_qsize 15, out_qsize 

2020-03-31 18:39:41,576 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:39:41,620 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:39:41,626 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:39:41,629 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:39:41,643 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:39:41,645 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:39:41,657 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:39:41,675 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:39:41,676 - gensim.models.

2020-03-31 18:40:25,752 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 66.35% examples, 433408 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:40:26,762 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 69.94% examples, 430056 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:40:27,764 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 73.12% examples, 426268 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:40:28,781 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 77.28% examples, 428012 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:40:29,800 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 81.24% examples, 429617 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:40:30,832 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 85.57% examples, 431156 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:40:31,864 - gensim.models.base_any2vec - INFO - EPOCH 5 - PROGRESS: at 89.56% examples, 431847 words/s, in_qsize 15, out_qsize 0

2020-03-31 18:41:10,135 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 39.58% examples, 435556 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:11,146 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 42.59% examples, 429004 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:12,147 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 45.99% examples, 427029 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:13,153 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 49.76% examples, 427956 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:14,156 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 53.83% examples, 430174 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:41:15,243 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 57.97% examples, 430993 words/s, in_qsize 16, out_qsize 2
2020-03-31 18:41:16,264 - gensim.models.base_any2vec - INFO - EPOCH 7 - PROGRESS: at 62.63% examples, 433904 words/s, in_qsize 15, out_qsize 0

2020-03-31 18:41:53,905 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 14.43% examples, 392722 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:54,918 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 18.54% examples, 406521 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:55,963 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 23.09% examples, 416700 words/s, in_qsize 15, out_qsize 2
2020-03-31 18:41:57,024 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 27.44% examples, 425717 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:58,026 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 31.65% examples, 432213 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:41:59,031 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 35.53% examples, 434922 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:42:00,033 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 39.90% examples, 439076 words/s, in_qsize 15, out_qsize 0

2020-03-31 18:42:39,693 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:42:39,728 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:42:39,729 - gensim.models.base_any2vec - INFO - EPOCH - 10 : training on 11991485 raw words (11279150 effective words) took 24.3s, 463904 effective words/s
2020-03-31 18:42:40,772 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 3.68% examples, 408929 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:42:41,806 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 8.04% examples, 440714 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:42:42,811 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 12.77% examples, 461781 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:42:43,817 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 16.78% examples, 458324 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:42

2020-03-31 18:43:28,697 - gensim.models.base_any2vec - INFO - EPOCH 12 - PROGRESS: at 96.57% examples, 446429 words/s, in_qsize 15, out_qsize 1
2020-03-31 18:43:29,302 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:43:29,337 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:43:29,346 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:43:29,348 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:43:29,349 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:43:29,361 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:43:29,406 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:43

2020-03-31 18:44:13,084 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 71.83% examples, 438249 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:44:14,101 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 76.35% examples, 440393 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:44:15,113 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 80.18% examples, 441146 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:44:16,122 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 84.37% examples, 442273 words/s, in_qsize 15, out_qsize 1
2020-03-31 18:44:17,134 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 88.52% examples, 443227 words/s, in_qsize 16, out_qsize 0
2020-03-31 18:44:18,154 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 92.64% examples, 443964 words/s, in_qsize 13, out_qsize 2
2020-03-31 18:44:19,186 - gensim.models.base_any2vec - INFO - EPOCH 14 - PROGRESS: at 97.11% examples, 445210 words/s, in_qsize 15, out_

2020-03-31 18:44:57,727 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 52.41% examples, 445985 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:44:58,729 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 55.52% examples, 440488 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:44:59,777 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 58.71% examples, 433779 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:00,819 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 63.20% examples, 435353 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:45:01,848 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 67.55% examples, 437209 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:45:02,864 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 71.83% examples, 439135 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:03,908 - gensim.models.base_any2vec - INFO - EPOCH 16 - PROGRESS: at 76.21% examples, 439659 words/s, in_qsize 15, out_

2020-03-31 18:45:42,003 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 29.57% examples, 462763 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:43,005 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 33.73% examples, 463587 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:44,009 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 37.99% examples, 464896 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:45,032 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 41.89% examples, 463405 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:45:46,064 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 45.92% examples, 463586 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:45:47,071 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 50.01% examples, 464631 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:45:48,072 - gensim.models.base_any2vec - INFO - EPOCH 18 - PROGRESS: at 54.24% examples, 465741 words/s, in_qsize 15, out_

2020-03-31 18:46:26,220 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 7.90% examples, 444291 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:46:27,266 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 12.61% examples, 458302 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:46:28,272 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 16.78% examples, 460378 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:46:29,336 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 20.90% examples, 458062 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:46:30,385 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 25.43% examples, 459299 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:46:31,405 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 29.35% examples, 459684 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:46:32,458 - gensim.models.base_any2vec - INFO - EPOCH 20 - PROGRESS: at 33.57% examples, 459189 words/s, in_qsize 15, out_q

2020-03-31 18:47:14,065 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:47:14,080 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:47:14,091 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:47:14,117 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:47:14,122 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:47:14,123 - gensim.models.base_any2vec - INFO - EPOCH - 21 : training on 11991485 raw words (11280851 effective words) took 24.8s, 455731 effective words/s
2020-03-31 18:47:15,154 - gensim.models.base_any2vec - INFO - EPOCH 22 - PROGRESS: at 2.80% examples, 313595 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:47:16,181 - gensim.models.base_any2vec - INFO - EPOCH 22 - PROGRESS: at 5

2020-03-31 18:48:00,189 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 83.46% examples, 441161 words/s, in_qsize 13, out_qsize 2
2020-03-31 18:48:01,215 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 87.83% examples, 442726 words/s, in_qsize 13, out_qsize 2
2020-03-31 18:48:02,251 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 92.16% examples, 444382 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:48:03,307 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 96.56% examples, 445166 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:03,887 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:48:03,912 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:48:03,920 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:48:03,927 - gensim.models.base_any2vec - INFO - wo

2020-03-31 18:48:44,611 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 58.72% examples, 433097 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:45,660 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 63.20% examples, 434536 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:46,663 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 67.54% examples, 437105 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:47,671 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 71.74% examples, 438683 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:48,704 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 76.28% examples, 440451 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:49,704 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 80.35% examples, 442850 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:48:50,719 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 84.62% examples, 444178 words/s, in_qsize 15, out_

2020-03-31 18:49:28,846 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 38.00% examples, 461531 words/s, in_qsize 15, out_qsize 3
2020-03-31 18:49:29,866 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 42.12% examples, 463213 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:49:30,890 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 46.29% examples, 465471 words/s, in_qsize 16, out_qsize 0
2020-03-31 18:49:31,892 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 50.28% examples, 464995 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:49:32,948 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 54.31% examples, 462732 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:49:33,961 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 58.42% examples, 462900 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:49:34,964 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 62.97% examples, 463822 words/s, in_qsize 15, out_

2020-03-31 18:50:13,013 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 16.70% examples, 454267 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:50:14,017 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 20.81% examples, 458632 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:50:15,028 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 25.21% examples, 459546 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:50:16,033 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 29.01% examples, 459528 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:50:17,034 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 33.26% examples, 462062 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:50:18,053 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 37.40% examples, 462717 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:50:19,086 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 41.52% examples, 461935 words/s, in_qsize 16, out_

2020-03-31 18:51:00,275 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:51:00,278 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:51:00,283 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:51:00,298 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:51:00,299 - gensim.models.base_any2vec - INFO - EPOCH - 30 : training on 11991485 raw words (11280330 effective words) took 26.2s, 431194 effective words/s
2020-03-31 18:51:01,337 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 3.67% examples, 410818 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:51:02,366 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 7.88% examples, 433867 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:51:03,377 - gensim.models.base_any2vec - INFO - EP

2020-03-31 18:51:47,866 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 91.60% examples, 442190 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:51:48,887 - gensim.models.base_any2vec - INFO - EPOCH 32 - PROGRESS: at 95.82% examples, 443288 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:51:49,654 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:51:49,689 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:51:49,699 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:51:49,715 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:51:49,722 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:51:49,734 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more 

2020-03-31 18:52:32,407 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 70.90% examples, 460770 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:52:33,472 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 73.84% examples, 451494 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:52:34,523 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 76.75% examples, 443983 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:52:35,550 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 79.51% examples, 437820 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:52:36,566 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 82.61% examples, 433746 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:52:37,584 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 85.89% examples, 430378 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:52:38,614 - gensim.models.base_any2vec - INFO - EPOCH 34 - PROGRESS: at 89.40% examples, 428734 words/s, in_qsize 15, out_

2020-03-31 18:53:16,241 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 39.34% examples, 436264 words/s, in_qsize 16, out_qsize 0
2020-03-31 18:53:17,250 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 43.23% examples, 438962 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:53:18,255 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 47.10% examples, 441394 words/s, in_qsize 16, out_qsize 0
2020-03-31 18:53:19,269 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 51.25% examples, 442361 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:53:20,280 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 55.20% examples, 443376 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:53:21,293 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 59.59% examples, 446021 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:53:22,297 - gensim.models.base_any2vec - INFO - EPOCH 36 - PROGRESS: at 63.91% examples, 447337 words/s, in_qsize 16, out_

2020-03-31 18:54:00,190 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 16.62% examples, 450845 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:01,197 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 19.79% examples, 435471 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:02,199 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 23.27% examples, 423978 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:03,205 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 26.42% examples, 415511 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:04,218 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 29.25% examples, 405484 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:05,287 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 33.42% examples, 409537 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:06,299 - gensim.models.base_any2vec - INFO - EPOCH 38 - PROGRESS: at 37.61% examples, 415667 words/s, in_qsize 13, out_

2020-03-31 18:54:47,438 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-03-31 18:54:47,442 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:54:47,447 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:54:47,477 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:54:47,478 - gensim.models.base_any2vec - INFO - EPOCH - 39 : training on 11991485 raw words (11279679 effective words) took 26.2s, 431083 effective words/s
2020-03-31 18:54:48,494 - gensim.models.base_any2vec - INFO - EPOCH 40 - PROGRESS: at 3.51% examples, 400977 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:54:49,516 - gensim.models.base_any2vec - INFO - EPOCH 40 - PROGRESS: at 7.90% examples, 439737 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:54:50,548 - gensim.models.base_any2vec - INFO - EP

2020-03-31 18:55:34,995 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 88.43% examples, 445965 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:55:36,049 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 92.71% examples, 446737 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:55:37,079 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 96.75% examples, 445985 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:55:37,866 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:55:37,902 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:55:37,909 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:55:37,916 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:55:37,920 - gensim.models.base_any2vec - INFO - worker thread finished; aw

2020-03-31 18:56:19,087 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 64.54% examples, 447277 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:56:20,106 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 67.93% examples, 441669 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:56:21,133 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 71.50% examples, 439005 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:56:22,160 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 76.05% examples, 440880 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:56:23,180 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 79.88% examples, 441447 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:56:24,211 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 84.04% examples, 442120 words/s, in_qsize 15, out_qsize 1
2020-03-31 18:56:25,228 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 88.07% examples, 442148 words/s, in_qsize 15, out_

2020-03-31 18:57:02,939 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 34.27% examples, 423426 words/s, in_qsize 16, out_qsize 2
2020-03-31 18:57:03,943 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 38.72% examples, 429479 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:04,966 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 42.51% examples, 431385 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:05,966 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 46.42% examples, 434650 words/s, in_qsize 14, out_qsize 0
2020-03-31 18:57:06,971 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 50.37% examples, 435711 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:07,984 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 54.39% examples, 437115 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:09,005 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 58.80% examples, 440614 words/s, in_qsize 15, out_

2020-03-31 18:57:46,845 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 12.35% examples, 447243 words/s, in_qsize 16, out_qsize 2
2020-03-31 18:57:47,877 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 16.62% examples, 451422 words/s, in_qsize 16, out_qsize 2
2020-03-31 18:57:48,959 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 20.99% examples, 454833 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:49,990 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 25.51% examples, 457921 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:51,010 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 29.50% examples, 459820 words/s, in_qsize 16, out_qsize 0
2020-03-31 18:57:52,027 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 33.81% examples, 462484 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:57:53,048 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 38.09% examples, 463065 words/s, in_qsize 16, out_

2020-03-31 18:58:33,192 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-03-31 18:58:33,197 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-03-31 18:58:33,199 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-03-31 18:58:33,199 - gensim.models.base_any2vec - INFO - EPOCH - 48 : training on 11991485 raw words (11280318 effective words) took 24.3s, 464735 effective words/s
2020-03-31 18:58:34,261 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 3.67% examples, 401602 words/s, in_qsize 14, out_qsize 1
2020-03-31 18:58:35,296 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 7.97% examples, 432212 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:58:36,298 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 12.35% examples, 444135 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:58:37,312 - gensim.models.

2020-03-31 18:59:22,090 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 87.33% examples, 439545 words/s, in_qsize 15, out_qsize 0
2020-03-31 18:59:23,126 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 91.51% examples, 440560 words/s, in_qsize 16, out_qsize 1
2020-03-31 18:59:24,152 - gensim.models.base_any2vec - INFO - EPOCH 50 - PROGRESS: at 95.65% examples, 441248 words/s, in_qsize 13, out_qsize 2
2020-03-31 18:59:24,955 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-03-31 18:59:24,971 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-03-31 18:59:24,973 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-03-31 18:59:24,988 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-03-31 18:59:24,993 - gensim.models.base_any2vec - INFO - worker thread finished; aw

In [12]:
# Saving of the trained model is disabled (commented out) in this cell.
# NOTE(review): this path ends in `embeddings.bin`, while the load cell in the
# "Get embeddings" section reads `word2vec_model.bin` -- confirm which file is
# the intended artefact before re-enabling.
#model.save('./data/04_features/embeddings.bin')

2020-03-31 18:59:25,064 - gensim.utils - INFO - saving Word2Vec object under ./data/04_features/embeddings.bin, separately None
2020-03-31 18:59:25,065 - gensim.utils - INFO - not storing attribute vectors_norm
2020-03-31 18:59:25,066 - gensim.utils - INFO - not storing attribute cum_table
2020-03-31 18:59:25,307 - gensim.utils - INFO - saved ./data/04_features/embeddings.bin


## Get embeddings

In [10]:
# Load a saved Word2Vec model from disk and bind it to `model`, the name the
# cells below use.
# NOTE(review): presumably this is the model trained above, but the commented-out
# save cell writes to a different file name (`embeddings.bin`) -- confirm.
model = Word2Vec.load('./data/04_features/word2vec_model.bin')

2020-04-03 11:01:14,552 - gensim.utils - INFO - loading Word2Vec object from ./data/04_features/word2vec_model.bin
2020-04-03 11:01:15,005 - gensim.utils - INFO - loading wv recursively from ./data/04_features/word2vec_model.bin.wv.* with mmap=None
2020-04-03 11:01:15,006 - gensim.utils - INFO - setting ignored attribute vectors_norm to None
2020-04-03 11:01:15,006 - gensim.utils - INFO - loading vocabulary recursively from ./data/04_features/word2vec_model.bin.vocabulary.* with mmap=None
2020-04-03 11:01:15,007 - gensim.utils - INFO - loading trainables recursively from ./data/04_features/word2vec_model.bin.trainables.* with mmap=None
2020-04-03 11:01:15,008 - gensim.utils - INFO - setting ignored attribute cum_table to None
2020-04-03 11:01:15,008 - gensim.utils - INFO - loaded ./data/04_features/word2vec_model.bin


In [11]:
def get_embeddings(model, vocab):
    """Return raw and L2-normalised embeddings for the dictionary vocabulary.

    Prints a few sanity checks about the trained model, then looks up the
    vector of every token in ``vocab`` that the model also has a vector for.

    Parameters
    ----------
    model : gensim.models.Word2Vec
        Trained model. Vectors are read through ``model.wv`` because calling
        ``most_similar`` / ``similarity`` / ``[]`` on the model itself is
        deprecated in gensim.
    vocab : object with a ``token2id`` mapping
        Only the keys (tokens) of ``token2id`` are used.

    Returns
    -------
    dict_embeddings : dict
        token -> embedding vector as stored in the model.
    dict_embeddings_norm : dict
        token -> the same vector after ``preprocessing.normalize``
        (unit L2 norm with the default settings).
        Both dicts are empty when no token of ``vocab`` is known to the model.
    """
    word_vectors = model.wv
    known = word_vectors.vocab

    # Check embeddings
    print('Vocab length:', len(known))
    print('Embedding size:', model.vector_size)

    # The probe words are corpus specific; skip the checks instead of raising
    # KeyError when the model was trained on text that does not contain them.
    if 'war' in known:
        print('Most similar to "war" in embedding space:', word_vectors.most_similar('war'))
    if 'war' in known and 'cold' in known:
        war = word_vectors['war'].reshape((1, -1))
        cold = word_vectors['cold'].reshape((1, -1))
        # Both values below are similarities (1.0 == same direction), not distances.
        print('Cosine similarity between "cold" and "war" in embedding space (gensim metric):',
              word_vectors.similarity('cold', 'war'))
        print('Cosine similarity between "cold" and "war" in embedding space (sklearn metric):',
              cosine_similarity(cold, war))

    # Keep only the dictionary tokens the model has a vector for.
    words = [w for w in vocab.token2id if w in known]
    if not words:
        # An empty list would become a 1-D array, which normalize() rejects.
        return {}, {}

    embeddings = np.array([word_vectors[w] for w in words])
    embeddings_norm = preprocessing.normalize(embeddings)

    dict_embeddings = dict(zip(words, embeddings))
    dict_embeddings_norm = dict(zip(words, embeddings_norm))

    return dict_embeddings, dict_embeddings_norm

In [13]:
# Build the two token -> vector lookups (raw and normalised) for the tokens of
# `vocab` that the loaded model knows.
dict_embeddings, dict_embeddings_norm = get_embeddings(model, vocab)

Vocab length: 4895
Embedding size: 300
2020-04-03 11:01:19,985 - gensim.models.keyedvectors - INFO - precomputing L2-norms of word weight vectors
Most similar to "war" in embedding space: [('world_war', 0.4436487555503845), ('gulf', 0.4100189208984375), ('massacre', 0.4027578830718994), ('serb', 0.36117807030677795), ('britain', 0.34083133935928345), ('bosnian', 0.33297497034072876), ('ottoman', 0.3318420648574829), ('jew', 0.32354503870010376), ('arab', 0.3178195655345917), ('muslim', 0.31602174043655396)]
Cosine distance between "cold" and "war" in embedding space (gensim metric): 0.12926339
Cosine distance between "cold" and "war" in embedding space (sklearn metric): [[0.12926337]]


  """
  
  import sys
  del sys.path[0]


## Eval / Visualize word embeddings

In [None]:
# Dictionary tokens the Word2Vec model has a vector for. The model is bound to
# `model` in the load cell; the previous `w2v` name was never defined.
words = [w for w in vocab.token2id if w in model.wv.vocab]
# Stack the vectors into a (n_words, vector_size) matrix, reading through
# `model.wv` because indexing the model object directly is deprecated in gensim.
embeddings = np.array([model.wv[w] for w in words])
# Row-wise normalised copy (unit L2 norm with sklearn's default settings).
embeddings_norm = preprocessing.normalize(embeddings)

In [None]:
from sklearn.manifold import TSNE

# t-SNE is stochastic: pin the seed so the 2-D layout is reproducible across
# kernel restarts.
# The projection runs on the L2-normalised vectors with the default (Euclidean)
# metric; the commented line is the alternative of using the cosine metric on
# the raw vectors.
#X_embedded = TSNE(n_components=2, metric='cosine').fit_transform(embeddings)
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(embeddings_norm)
#Y_embedded = model.predict(embeddings_norm)

In [None]:
# Scatter plot of the 2-D t-SNE projection, one point per row of X_embedded.
fig, ax = plt.subplots(figsize=(20, 20))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], s=20)
# Title and axis labels so the figure can be read on its own.
ax.set_title('t-SNE projection of word embeddings')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
plt.show()