In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import Word2Vec

In [2]:
# Load the pipeline inputs through the Kedro data catalog. `catalog` is not
# defined in this notebook — presumably injected by the Kedro Jupyter session
# (TODO confirm).
dataset = catalog.load('raw_dataset')
# NOTE(review): this `vocab` is rebound by `vocab = Dictionary(docs)` in the
# pre-processing cell further down, so the dictionary loaded here is not the
# one later cells see after a top-to-bottom run.
vocab = catalog.load('dictionary')

2020-04-10 13:53:45,023 - kedro.io.data_catalog - INFO - Loading data from `raw_dataset` (CSVLocalDataSet)...
2020-04-10 13:53:45,943 - kedro.io.data_catalog - INFO - Loading data from `dictionary` (DictionaryDataSet)...
2020-04-10 13:53:45,944 - gensim.utils - INFO - loading Dictionary object from data/05_model_input/dictionary.dict
2020-04-10 13:53:45,948 - gensim.utils - INFO - loaded data/05_model_input/dictionary.dict


## Pre-process docs for word embeddings

In [4]:
def split_by_sentence(docs):
    """Flatten an iterable of document strings into one list of sentences.

    Every document is first cut wherever a period is immediately followed by
    a newline, and each resulting piece is then cut on a period followed by a
    space. Fragments are returned exactly as ``str.split`` produces them:
    nothing is stripped and empty strings are kept.

    Args:
        docs: iterable of ``str`` documents.

    Returns:
        A new flat ``list`` of sentence strings, in document order.
    """
    return [
        sentence
        for doc in docs
        for paragraph in doc.split('.\n')
        for sentence in paragraph.split('. ')
    ]

def lowerize(docs):
    """Lower-case every entry of ``docs`` in place.

    Each entry is passed through ``str()`` first, so non-string values are
    replaced by their lower-cased string representation.

    Args:
        docs: mutable, index-assignable sequence of documents.

    Returns:
        The same ``docs`` object, after modification.
    """
    for position, doc in enumerate(docs):
        docs[position] = str(doc).lower()
    return docs

def tokenize(docs):
    """Replace each document string in ``docs`` with its list of word tokens.

    Tokens are the matches of the regular expression ``\\w+`` as found by
    NLTK's ``RegexpTokenizer``; everything between matches is discarded.

    Args:
        docs: mutable sequence of document strings.

    Returns:
        The same ``docs`` object, now holding one token list per document.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    for position, text in enumerate(docs):
        docs[position] = word_tokenizer.tokenize(text)
    return docs

def remove_stop_words(docs):
    """Drop NLTK English stop words from every tokenised document, in place.

    Membership is an exact string match against ``stopwords.words('english')``.

    Args:
        docs: mutable sequence of token lists.

    Returns:
        The same ``docs`` object, each entry replaced by a filtered list.
    """
    english_stop_words = set(stopwords.words('english'))
    for position, tokens in enumerate(docs):
        docs[position] = [w for w in tokens if w not in english_stop_words]
    return docs

def remove_numbers(docs):
    """Return new token lists without purely numeric tokens.

    A token is dropped when ``str.isnumeric()`` is true for it; tokens that
    merely contain digits (e.g. ``'b2'``) are kept. The input is not modified.

    Args:
        docs: iterable of token lists.

    Returns:
        A new ``list`` of filtered token lists.
    """
    without_numbers = []
    for doc in docs:
        without_numbers.append([token for token in doc if not token.isnumeric()])
    return without_numbers

def remove_word_with_length(docs, length=1):
    """Return new token lists keeping only tokens longer than ``length``.

    With the default ``length=1`` this removes single-character tokens; in
    general every token with ``len(token) <= length`` is dropped. The input
    is not modified.

    Args:
        docs: iterable of token lists.
        length: largest token length that is still removed.

    Returns:
        A new ``list`` of filtered token lists.
    """
    def long_enough(token):
        return len(token) > length

    return [list(filter(long_enough, doc)) for doc in docs]

def lemmatize(docs):
    """Return new token lists with every token run through WordNet lemmatisation.

    Uses ``WordNetLemmatizer.lemmatize`` with its default arguments. The input
    is not modified.

    Args:
        docs: iterable of token lists.

    Returns:
        A new ``list`` of lemmatised token lists.
    """
    lemmatize_token = WordNetLemmatizer().lemmatize
    lemmatized = []
    for doc in docs:
        lemmatized.append([lemmatize_token(token) for token in doc])
    return lemmatized

def add_bigram(docs, min_bigram_count=20):
    """Append detected phrase tokens to each document, in place.

    A gensim ``Phrases`` model is fitted on ``docs`` with
    ``min_count=min_bigram_count``. Each document is then passed through the
    model and every resulting token that contains an underscore is appended
    to that document, so the original unigrams are kept alongside the phrases.

    Note: the underscore test is also true for any token that already
    contained ``'_'`` before phrase detection; such tokens get appended again.

    Args:
        docs: sequence of mutable token lists.
        min_bigram_count: ``min_count`` handed to ``Phrases``.

    Returns:
        The same ``docs`` object, with phrase tokens appended to its documents.
    """
    phrase_model = Phrases(docs, min_count=min_bigram_count)
    for doc in docs:
        # Transform first, then extend, so the appended tokens never feed back
        # into the phrase detection of the same document.
        phrase_tokens = [token for token in phrase_model[doc] if '_' in token]
        doc.extend(phrase_tokens)
    return docs

def remove_vocab(docs, vocab):
    """Keep only the tokens of each document that belong to ``vocab``.

    Args:
        docs: iterable of token lists.
        vocab: collection of allowed tokens. Sets, frozensets and dicts
            (membership on keys) are used as-is; any other iterable, such as
            a list of tokens, is converted to a set once up front.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``object`` with one filtered token
        ``list`` per input document, in the original order.
    """
    # A list vocabulary would be scanned linearly for every single token;
    # hashing it once makes each membership test O(1).
    if isinstance(vocab, (set, frozenset, dict)):
        allowed = vocab
    else:
        allowed = set(vocab)

    filtered = [[w for w in doc if w in allowed] for doc in docs]

    # Fill an object array slot by slot instead of calling np.array() on the
    # nested lists: documents have different lengths, and NumPy >= 1.24
    # refuses to build an array from ragged sequences without dtype=object.
    # Slot-wise assignment also keeps the result 1-D when all documents
    # happen to have the same length.
    result = np.empty(len(filtered), dtype=object)
    for position, doc in enumerate(filtered):
        result[position] = doc
    return result

In [5]:
# Pre-processing configuration.
min_bigram_count = 20   # only relevant to the (currently disabled) add_bigram step
length = 1              # tokens with len(token) <= length are dropped
no_below = 100          # filter_extremes: keep tokens found in at least this many sentences
no_above = 0.70         # filter_extremes: ... and in at most this fraction of sentences

docs = dataset['text'].values

print('\nSplitting by sentence...')
docs = split_by_sentence(docs)

print('\nLowerizing...')
docs = lowerize(docs)

print('\nTokenizing...')
docs = tokenize(docs)

# Bigram augmentation is currently disabled. To re-enable it, run
# `docs = add_bigram(docs, min_bigram_count=min_bigram_count)` at this point.

print('\nRemoving stop words...')
docs = remove_stop_words(docs)

print('\nRemoving unique numbers (not words that contain numbers)...')
docs = remove_numbers(docs)

print('\nRemoving words that contain only one character...')
docs = remove_word_with_length(docs, length=length)

print('\nLemmatizing...')
docs = lemmatize(docs)

# Build the vocabulary from the cleaned sentences and prune rare / overly
# common tokens. NOTE: this rebinds `vocab`, replacing the dictionary loaded
# from the catalog at the top of the notebook.
vocab = Dictionary(docs)
vocab.filter_extremes(no_below=no_below, no_above=no_above)

# Hand over the token->id mapping itself rather than a list of its keys, so
# every membership test is a hash lookup instead of a linear scan over the
# vocabulary (this runs once per token in the corpus).
docs = remove_vocab(docs, vocab.token2id)

print('Number of sentences:', len(docs))
print('Number of unique words:', len(vocab))


Splitting by sentence...

Lowerizing...

Tokenizing...

Removing stop words...

Removing unique numbers (not words that contain numbers)...

Removing words that contain only one character...

Lemmatizing...
2020-04-10 13:54:50,021 - gensim.corpora.dictionary - INFO - adding document #0 to Dictionary(0 unique tokens: [])
2020-04-10 13:54:50,207 - gensim.corpora.dictionary - INFO - adding document #10000 to Dictionary(9529 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:50,403 - gensim.corpora.dictionary - INFO - adding document #20000 to Dictionary(12395 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:50,601 - gensim.corpora.dictionary - INFO - adding document #30000 to Dictionary(15222 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:50,775 - gensim.corpora.dictionary - INFO - adding document #40000 to Dictionar

2020-04-10 13:54:58,718 - gensim.corpora.dictionary - INFO - adding document #420000 to Dictionary(36535 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:58,913 - gensim.corpora.dictionary - INFO - adding document #430000 to Dictionary(36785 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:59,119 - gensim.corpora.dictionary - INFO - adding document #440000 to Dictionary(37465 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:59,357 - gensim.corpora.dictionary - INFO - adding document #450000 to Dictionary(38154 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:59,575 - gensim.corpora.dictionary - INFO - adding document #460000 to Dictionary(38527 unique tokens: ['ambassador', 'assembly', 'congratulation', 'delegation', 'election']...)
2020-04-10 13:54:59,778 -

Number of sentences: 798974
Number of unique words: 6935


## Train word embeddings (gensim word2vec)

In [6]:
class MyDocuments:
    """Re-iterable wrapper around a collection of tokenised sentences.

    Every call to ``iter()`` on an instance starts a fresh pass over the
    wrapped collection, so the object can be consumed several times.
    """

    def __init__(self, docs):
        # Stored by reference; the wrapped collection is not copied.
        self.docs = docs

    def __iter__(self):
        yield from self.docs

In [7]:
# Wrap the pre-processed sentences in a re-iterable object; this is what gets
# handed to Word2Vec as its training corpus.
sentences = MyDocuments(docs)

In [8]:
# Word2Vec hyper-parameters (gensim 3.x keyword names).
size = 300        # dimensionality of the word vectors
window = 5        # context window size
min_count = 1     # keep every token that reaches the model
workers = 8       # number of training threads
sg = 1            # 1 = skip-gram, 0 = CBOW
n_epochs = 50     # passes over the corpus; deliberately not named `iter`,
                  # which would shadow the builtin for the rest of the session

model = Word2Vec(sentences, size=size, window=window, min_count=min_count, workers=workers, sg=sg, iter=n_epochs)

2020-04-10 13:59:34,018 - gensim.models.word2vec - INFO - collecting all words and their counts
2020-04-10 13:59:34,019 - gensim.models.word2vec - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-10 13:59:34,042 - gensim.models.word2vec - INFO - PROGRESS: at sentence #10000, processed 137391 words, keeping 6041 word types
2020-04-10 13:59:34,068 - gensim.models.word2vec - INFO - PROGRESS: at sentence #20000, processed 272256 words, keeping 6527 word types
2020-04-10 13:59:34,097 - gensim.models.word2vec - INFO - PROGRESS: at sentence #30000, processed 416218 words, keeping 6719 word types
2020-04-10 13:59:34,125 - gensim.models.word2vec - INFO - PROGRESS: at sentence #40000, processed 540054 words, keeping 6814 word types
2020-04-10 13:59:34,154 - gensim.models.word2vec - INFO - PROGRESS: at sentence #50000, processed 674162 words, keeping 6846 word types
2020-04-10 13:59:34,182 - gensim.models.word2vec - INFO - PROGRESS: at sentence #60000, processed 81

2020-04-10 13:59:35,538 - gensim.models.word2vec - INFO - PROGRESS: at sentence #590000, processed 8014610 words, keeping 6934 word types
2020-04-10 13:59:35,564 - gensim.models.word2vec - INFO - PROGRESS: at sentence #600000, processed 8150694 words, keeping 6934 word types
2020-04-10 13:59:35,588 - gensim.models.word2vec - INFO - PROGRESS: at sentence #610000, processed 8284559 words, keeping 6934 word types
2020-04-10 13:59:35,612 - gensim.models.word2vec - INFO - PROGRESS: at sentence #620000, processed 8429663 words, keeping 6934 word types
2020-04-10 13:59:35,636 - gensim.models.word2vec - INFO - PROGRESS: at sentence #630000, processed 8573121 words, keeping 6934 word types
2020-04-10 13:59:35,659 - gensim.models.word2vec - INFO - PROGRESS: at sentence #640000, processed 8720660 words, keeping 6934 word types
2020-04-10 13:59:35,684 - gensim.models.word2vec - INFO - PROGRESS: at sentence #650000, processed 8858603 words, keeping 6934 word types
2020-04-10 13:59:35,706 - gensim.m

2020-04-10 14:00:00,270 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 3.90% examples, 396826 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:01,325 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 8.44% examples, 415023 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:00:02,341 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 13.17% examples, 426294 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:03,353 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 17.45% examples, 427681 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:04,357 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 21.90% examples, 431058 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:05,386 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 26.43% examples, 433203 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:06,397 - gensim.models.base_any2vec - INFO - EPOCH 2 - PROGRESS: at 30.62% examples, 433278 words/s, in_qsize 15, out_qsize 0
2

2020-04-10 14:00:47,290 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-04-10 14:00:47,291 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-04-10 14:00:47,309 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-04-10 14:00:47,310 - gensim.models.base_any2vec - INFO - EPOCH - 3 : training on 10849021 raw words (10117310 effective words) took 23.3s, 434829 effective words/s
2020-04-10 14:00:48,334 - gensim.models.base_any2vec - INFO - EPOCH 4 - PROGRESS: at 3.80% examples, 385484 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:49,349 - gensim.models.base_any2vec - INFO - EPOCH 4 - PROGRESS: at 8.25% examples, 412966 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:50,375 - gensim.models.base_any2vec - INFO - EPOCH 4 - PROGRESS: at 13.07% examples, 426849 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:00:51,381 - gensim.models.base

2020-04-10 14:01:33,399 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-04-10 14:01:33,405 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-04-10 14:01:33,414 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-04-10 14:01:33,419 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-04-10 14:01:33,425 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-04-10 14:01:33,426 - gensim.models.base_any2vec - INFO - EPOCH - 5 : training on 10849021 raw words (10116850 effective words) took 23.0s, 438919 effective words/s
2020-04-10 14:01:34,471 - gensim.models.base_any2vec - INFO - EPOCH 6 - PROGRESS: at 4.01% examples, 395915 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:01:35,485 - gensim.models.base_any2vec - INFO - EPOCH 6 - PROGRESS: at 8.45

2020-04-10 14:02:19,426 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-04-10 14:02:19,466 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-04-10 14:02:19,478 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-04-10 14:02:19,489 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-04-10 14:02:19,508 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-04-10 14:02:19,529 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-04-10 14:02:19,530 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 0 more threads
2020-04-10 14:02:19,531 - gensim.models.base_any2vec - INFO - EPOCH - 7 : training on 10849021 raw words (10116236 effective words) took 23.0s, 440044 effective

2020-04-10 14:03:04,900 - gensim.models.base_any2vec - INFO - EPOCH 9 - PROGRESS: at 96.74% examples, 438935 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:03:05,430 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-04-10 14:03:05,438 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-04-10 14:03:05,498 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-04-10 14:03:05,510 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-04-10 14:03:05,521 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 3 more threads
2020-04-10 14:03:05,524 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 2 more threads
2020-04-10 14:03:05,526 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 1 more threads
2020-04-10 14:03:

2020-04-10 14:03:48,845 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 87.47% examples, 436785 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:03:49,846 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 91.80% examples, 437253 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:03:50,859 - gensim.models.base_any2vec - INFO - EPOCH 11 - PROGRESS: at 96.23% examples, 437428 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:03:51,494 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-04-10 14:03:51,511 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-04-10 14:03:51,549 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 5 more threads
2020-04-10 14:03:51,560 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 4 more threads
2020-04-10 14:03:51,570 - gensim.models.base_any2vec - INFO - worker thread finished; aw

2020-04-10 14:04:33,013 - gensim.models.base_any2vec - INFO - EPOCH 13 - PROGRESS: at 78.84% examples, 437080 words/s, in_qsize 15, out_qsize 3
2020-04-10 14:04:34,017 - gensim.models.base_any2vec - INFO - EPOCH 13 - PROGRESS: at 83.27% examples, 438031 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:04:35,027 - gensim.models.base_any2vec - INFO - EPOCH 13 - PROGRESS: at 87.74% examples, 438226 words/s, in_qsize 16, out_qsize 1
2020-04-10 14:04:36,054 - gensim.models.base_any2vec - INFO - EPOCH 13 - PROGRESS: at 92.05% examples, 438105 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:04:37,059 - gensim.models.base_any2vec - INFO - EPOCH 13 - PROGRESS: at 96.53% examples, 438408 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:04:37,625 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 7 more threads
2020-04-10 14:04:37,667 - gensim.models.base_any2vec - INFO - worker thread finished; awaiting finish of 6 more threads
2020-04-10 14:04:37,692 - gensim.models.

2020-04-10 14:05:16,937 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 70.15% examples, 440101 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:17,947 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 74.76% examples, 440797 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:18,994 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 79.29% examples, 441549 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:20,015 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 83.83% examples, 442330 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:21,023 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 88.40% examples, 443291 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:22,030 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 93.02% examples, 444648 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:05:23,068 - gensim.models.base_any2vec - INFO - EPOCH 15 - PROGRESS: at 97.86% examples, 444848 words/s, in_qsize 15, out_

2020-04-10 14:06:00,874 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 68.78% examples, 456300 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:01,877 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 73.23% examples, 455705 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:02,893 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 77.89% examples, 456353 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:03,924 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 82.33% examples, 456130 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:04,942 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 87.10% examples, 456669 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:05,947 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 91.42% examples, 456096 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:06:06,995 - gensim.models.base_any2vec - INFO - EPOCH 17 - PROGRESS: at 96.23% examples, 456382 words/s, in_qsize 14, out_

2020-04-10 14:06:45,000 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 68.58% examples, 455353 words/s, in_qsize 15, out_qsize 2
2020-04-10 14:06:46,002 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 73.23% examples, 455974 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:06:47,007 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 77.80% examples, 456340 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:48,013 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 82.24% examples, 456732 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:06:49,051 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 86.90% examples, 456283 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:06:50,074 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 91.52% examples, 456688 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:06:51,111 - gensim.models.base_any2vec - INFO - EPOCH 19 - PROGRESS: at 96.22% examples, 456737 words/s, in_qsize 14, out_

2020-04-10 14:07:29,160 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 68.27% examples, 457233 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:07:30,197 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 72.71% examples, 455623 words/s, in_qsize 13, out_qsize 2
2020-04-10 14:07:31,207 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 77.53% examples, 456985 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:07:32,212 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 81.86% examples, 456871 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:07:33,219 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 86.53% examples, 457141 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:07:34,237 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 91.07% examples, 457175 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:07:35,239 - gensim.models.base_any2vec - INFO - EPOCH 21 - PROGRESS: at 95.52% examples, 457083 words/s, in_qsize 15, out_

2020-04-10 14:08:13,532 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 68.68% examples, 455457 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:14,556 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 73.43% examples, 456010 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:15,582 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 78.05% examples, 456398 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:16,584 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 82.33% examples, 455855 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:08:17,608 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 87.19% examples, 456721 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:18,620 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 91.62% examples, 456448 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:19,628 - gensim.models.base_any2vec - INFO - EPOCH 23 - PROGRESS: at 96.23% examples, 456692 words/s, in_qsize 15, out_

2020-04-10 14:08:57,642 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 68.18% examples, 455981 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:58,666 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 72.83% examples, 455937 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:08:59,676 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 77.44% examples, 456183 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:09:00,724 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 81.85% examples, 455539 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:09:01,746 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 86.62% examples, 456017 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:02,765 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 91.15% examples, 456082 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:03,770 - gensim.models.base_any2vec - INFO - EPOCH 25 - PROGRESS: at 95.81% examples, 456850 words/s, in_qsize 16, out_

2020-04-10 14:09:41,736 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 68.68% examples, 455995 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:42,747 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 73.23% examples, 455759 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:43,767 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 77.79% examples, 455750 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:44,798 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 82.24% examples, 455559 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:09:45,821 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 86.81% examples, 455075 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:09:46,866 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 91.42% examples, 455032 words/s, in_qsize 16, out_qsize 2
2020-04-10 14:09:47,886 - gensim.models.base_any2vec - INFO - EPOCH 27 - PROGRESS: at 96.03% examples, 455115 words/s, in_qsize 16, out_

2020-04-10 14:10:25,868 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 68.37% examples, 456670 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:10:26,892 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 73.04% examples, 456578 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:10:27,907 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 77.70% examples, 457213 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:10:28,928 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 82.05% examples, 456683 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:10:29,972 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 86.89% examples, 457047 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:10:31,002 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 91.51% examples, 457265 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:10:32,005 - gensim.models.base_any2vec - INFO - EPOCH 29 - PROGRESS: at 96.11% examples, 457579 words/s, in_qsize 16, out_

2020-04-10 14:11:09,749 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 69.55% examples, 464639 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:10,755 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 74.21% examples, 464566 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:11,756 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 78.85% examples, 465682 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:12,764 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 83.09% examples, 463970 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:13,794 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 88.00% examples, 464747 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:14,802 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 92.50% examples, 464613 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:15,836 - gensim.models.base_any2vec - INFO - EPOCH 31 - PROGRESS: at 97.33% examples, 464383 words/s, in_qsize 15, out_

2020-04-10 14:11:53,387 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 69.90% examples, 460665 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:54,414 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 74.68% examples, 460765 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:55,423 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 79.03% examples, 460287 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:11:56,438 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 83.74% examples, 461229 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:57,476 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 88.34% examples, 460559 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:11:58,478 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 92.93% examples, 461209 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:11:59,483 - gensim.models.base_any2vec - INFO - EPOCH 33 - PROGRESS: at 97.76% examples, 461307 words/s, in_qsize 15, out_

2020-04-10 14:12:36,692 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 69.37% examples, 462701 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:12:37,715 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 74.12% examples, 462840 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:12:38,727 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 78.66% examples, 463201 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:12:39,733 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 83.27% examples, 463710 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:12:40,768 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 87.92% examples, 462954 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:12:41,769 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 92.68% examples, 464463 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:12:42,771 - gensim.models.base_any2vec - INFO - EPOCH 35 - PROGRESS: at 97.55% examples, 464925 words/s, in_qsize 15, out_

2020-04-10 14:13:20,201 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 69.82% examples, 461815 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:13:21,233 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 74.68% examples, 462287 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:13:22,259 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 79.21% examples, 462303 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:13:23,277 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 83.83% examples, 462561 words/s, in_qsize 16, out_qsize 1
2020-04-10 14:13:24,328 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 88.77% examples, 463396 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:13:25,354 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 93.46% examples, 463843 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:13:26,372 - gensim.models.base_any2vec - INFO - EPOCH 37 - PROGRESS: at 98.66% examples, 464830 words/s, in_qsize 14, out_

2020-04-10 14:14:03,660 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 69.73% examples, 463967 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:04,690 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 74.48% examples, 463788 words/s, in_qsize 16, out_qsize 2
2020-04-10 14:14:05,691 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 79.21% examples, 465472 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:06,700 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 83.83% examples, 465790 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:14:07,702 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 88.25% examples, 464751 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:14:08,711 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 92.77% examples, 464577 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:09,714 - gensim.models.base_any2vec - INFO - EPOCH 39 - PROGRESS: at 97.76% examples, 465470 words/s, in_qsize 15, out_

2020-04-10 14:14:47,011 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 70.23% examples, 468169 words/s, in_qsize 13, out_qsize 2
2020-04-10 14:14:48,044 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 75.23% examples, 468747 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:49,045 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 79.63% examples, 468563 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:50,048 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 84.11% examples, 467848 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:14:51,051 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 88.86% examples, 468623 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:52,105 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 93.46% examples, 467678 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:14:53,121 - gensim.models.base_any2vec - INFO - EPOCH 41 - PROGRESS: at 98.66% examples, 468543 words/s, in_qsize 14, out_

2020-04-10 14:15:30,571 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 70.15% examples, 464720 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:15:31,590 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 74.94% examples, 464785 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:15:32,605 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 79.63% examples, 466052 words/s, in_qsize 16, out_qsize 0
2020-04-10 14:15:33,616 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 84.19% examples, 465770 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:15:34,625 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 88.77% examples, 465540 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:15:35,640 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 93.30% examples, 465210 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:15:36,645 - gensim.models.base_any2vec - INFO - EPOCH 43 - PROGRESS: at 98.25% examples, 465589 words/s, in_qsize 15, out_

2020-04-10 14:16:13,950 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 69.90% examples, 463233 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:16:14,956 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 74.58% examples, 463217 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:16:15,963 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 79.21% examples, 464236 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:16:17,001 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 83.92% examples, 464390 words/s, in_qsize 16, out_qsize 1
2020-04-10 14:16:18,001 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 88.77% examples, 465866 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:16:19,007 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 93.30% examples, 465714 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:16:20,018 - gensim.models.base_any2vec - INFO - EPOCH 45 - PROGRESS: at 98.25% examples, 465936 words/s, in_qsize 16, out_

2020-04-10 14:16:57,439 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 69.74% examples, 462544 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:16:58,464 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 74.22% examples, 460908 words/s, in_qsize 16, out_qsize 2
2020-04-10 14:16:59,494 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 78.67% examples, 460355 words/s, in_qsize 16, out_qsize 1
2020-04-10 14:17:00,508 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 83.47% examples, 461823 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:17:01,515 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 88.16% examples, 462318 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:17:02,529 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 92.66% examples, 462154 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:17:03,541 - gensim.models.base_any2vec - INFO - EPOCH 47 - PROGRESS: at 97.66% examples, 462959 words/s, in_qsize 16, out_

2020-04-10 14:17:41,038 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 69.73% examples, 462998 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:17:42,060 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 74.58% examples, 463699 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:17:43,110 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 79.03% examples, 462458 words/s, in_qsize 14, out_qsize 1
2020-04-10 14:17:44,159 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 83.74% examples, 462438 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:17:45,190 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 88.40% examples, 462328 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:17:46,199 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 93.02% examples, 462752 words/s, in_qsize 15, out_qsize 0
2020-04-10 14:17:47,228 - gensim.models.base_any2vec - INFO - EPOCH 49 - PROGRESS: at 97.96% examples, 462696 words/s, in_qsize 14, out_

In [9]:
# Persist the trained Word2Vec model with gensim's own save format. The path
# is relative to the notebook's working directory.
model.save('./data/04_features/word2vec_model.bin')

2020-04-10 14:18:09,472 - gensim.utils - INFO - saving Word2Vec object under ./data/04_features/word2vec_model.bin, separately None
2020-04-10 14:18:09,473 - gensim.utils - INFO - not storing attribute vectors_norm
2020-04-10 14:18:09,608 - gensim.utils - INFO - not storing attribute cum_table
2020-04-10 14:18:10,673 - gensim.utils - INFO - saved ./data/04_features/word2vec_model.bin


## Get embeddings

In [5]:
def get_embeddings(model, vocab):
    """Sanity-check a trained Word2Vec model and collect embeddings for the dictionary tokens.

    Prints a few diagnostics (vocabulary size, vector size, nearest
    neighbours of "war", similarity between "cold" and "war") and then builds
    two token -> vector mappings, restricted to tokens present in both
    ``vocab`` and the model's vocabulary.

    Args:
        model: trained gensim ``Word2Vec`` model; only ``model.wv`` and
            ``model.vector_size`` are used.
        vocab: object exposing a ``token2id`` mapping (the gensim
            ``Dictionary`` loaded at the top of the notebook).

    Returns:
        tuple ``(dict_embeddings, dict_embeddings_norm)``:
            ``dict_embeddings`` maps each shared token to its raw vector,
            ``dict_embeddings_norm`` maps it to the same vector after
            ``preprocessing.normalize`` (default settings, i.e. L2 per row).
            Both are empty dicts when no token is shared.
    """
    # Go through the KeyedVectors object: the equivalent accessors on the
    # Word2Vec model itself (model[...], model.most_similar, model.similarity)
    # are deprecated and emit warnings.
    keyed_vectors = model.wv
    known_tokens = keyed_vectors.vocab

    # Check embeddings
    print('Vocab length:', len(known_tokens))
    print('Embedding size:', model.vector_size)

    # The probe words are corpus-specific; skip the diagnostics instead of
    # raising KeyError when the model was trained without them.
    if 'war' in known_tokens:
        print('Most similar to "war" in embedding space:', keyed_vectors.most_similar('war'))
    if 'war' in known_tokens and 'cold' in known_tokens:
        war = keyed_vectors['war'].reshape((1, -1))
        cold = keyed_vectors['cold'].reshape((1, -1))
        # Both calls return a cosine *similarity* (1.0 == same direction), not a distance.
        print('Cosine similarity between "cold" and "war" in embedding space (gensim metric):',
              keyed_vectors.similarity('cold', 'war'))
        print('Cosine similarity between "cold" and "war" in embedding space (sklearn metric):',
              cosine_similarity(cold, war))

    # Keep only dictionary tokens the model actually has a vector for.
    words = [w for w in vocab.token2id if w in known_tokens]
    if not words:
        # preprocessing.normalize rejects an empty 1-D array; nothing to return anyway.
        return {}, {}

    embeddings = np.array([keyed_vectors[w] for w in words])
    embeddings_norm = preprocessing.normalize(embeddings)

    dict_embeddings = dict(zip(words, embeddings))
    dict_embeddings_norm = dict(zip(words, embeddings_norm))

    return dict_embeddings, dict_embeddings_norm

In [6]:
# Print the model sanity checks and collect raw / normalised embeddings per dictionary token.
dict_embeddings, dict_embeddings_norm = get_embeddings(model, vocab)

Vocab length: 4895
Embedding size: 300
2020-04-10 12:37:14,398 - gensim.models.keyedvectors - INFO - precomputing L2-norms of word weight vectors
Most similar to "war" in embedding space: [('world_war', 0.4436487555503845), ('gulf', 0.4100189208984375), ('massacre', 0.4027578830718994), ('serb', 0.36117807030677795), ('britain', 0.34083133935928345), ('bosnian', 0.33297497034072876), ('ottoman', 0.3318420648574829), ('jew', 0.32354503870010376), ('arab', 0.3178195655345917), ('muslim', 0.31602174043655396)]
Cosine distance between "cold" and "war" in embedding space (gensim metric): 0.12926339
Cosine distance between "cold" and "war" in embedding space (sklearn metric): [[0.12926337]]


  """
  
  import sys
  del sys.path[0]


In [9]:
import pickle

# Serialise both embedding dictionaries next to the saved model,
# one pickle file per dictionary.
_pickle_targets = (
    ('./data/04_features/dict_embeddings.pkl', dict_embeddings),
    ('./data/04_features/dict_embeddings_norm.pkl', dict_embeddings_norm),
)
for _target_path, _payload in _pickle_targets:
    with open(_target_path, 'wb') as handle:
        pickle.dump(_payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Eval / Visualize word embeddings

In [1]:
# Rebuild the embedding matrix (one row per token) for every dictionary token
# the model has a vector for. Needs `model` and `vocab` from the cells above,
# so this cell cannot be run first on a fresh kernel.
words = [w for w in vocab.token2id if w in model.wv.vocab]  # was `w2v`, a name never defined in this notebook
embeddings = np.array([model.wv[w] for w in words])  # model.wv[...] replaces the deprecated model[...]
embeddings_norm = preprocessing.normalize(embeddings)

NameError: name 'vocab' is not defined

In [None]:
from sklearn.manifold import TSNE

# Project the normalised embeddings to 2-D for plotting.
# (Alternative that was tried: TSNE(n_components=2, metric='cosine') on the raw `embeddings`.)
# t-SNE is stochastic; fixing random_state makes the layout reproducible across re-runs.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(embeddings_norm)

In [None]:
# Scatter plot of the 2-D t-SNE projection, one point per token.
fig, ax = plt.subplots(figsize=(20, 20))
ax.scatter(X_embedded[:, 0], X_embedded[:, 1], s=20)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    title='t-SNE projection of word embeddings',
    xlabel='t-SNE dimension 1',
    ylabel='t-SNE dimension 2',
)
plt.show()