# Word2Vec

In [57]:
import pickle
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np
import pandas as pd
import time
import gzip
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [60]:
import operator

class MyDocuments(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        with gzip.open(self.dirname, 'rb') as f:
            for line in f:
                yield line.decode().split('\t')[1].split()
                
                
x = MyDocuments('data/eos/ngram/bigram_transformed_docs_%s.gz' % 'all')
for sentence_no, sentence in enumerate(x):
    print(sentence_no)
    print(sentence)
    break

0
['economics', 'turkey', 'akp', 'aim', 'create', 'rich', 'class', 'close', '2007', 'prominent', 'scholar', 'say', 'ezgi_başaran', 'istanbul', 'print_page', 'send', 'friend_tweetle', 'share', 'facebook_şevket', 'pamuk_previously', 'professor', 'economic', 'history', 'london', 'school', 'economics', 'currently', 'teach', 'boğaziçi_university', 'istanbul', 'şevket_pamuk', 'previously', 'professor', 'economic', 'history', 'london', 'school', 'economics', 'currently', 'teach', 'boğaziçi_university', 'istanbul', 'ruling_justice', 'development', 'party_akp', 'aim', 'create', 'rich', 'class', 'close', 'government', '2007', 'expense', 'rational', 'long_term', 'economic', 'decision_making', 'renowned', 'economic', 'historian', 'professor_şevket', 'pamuk', 'say', '2007', 'economic', 'priority', 'give', 'race', 'authority', 'need', 'create', 'rich', 'group', 'support', 'government', 'akp', 'thus', 'provide', 'vote', 'always', 'come', 'first', 'become', 'rational', 'think', 'election', 'two', 'yea

In [61]:
%%time


def generate_w2v(corpus_path, word2vec_model_file):
    
    print("starting epoche " + time.strftime("%H:%M:%S"))
    # initiate the model and perform the first epoch of training
    word2vec_model = Word2Vec(MyDocuments(corpus_path), size=100, 
                              window=5, iter=5,
                              min_count=10, sg=1, workers=7)
    word2vec_model.save(word2vec_model_file)
    print("Finished epoche " + time.strftime("%H:%M:%S"))
        
    print ("{} training epochs so far".format(word2vec_model.train_count))
    print ("{:,} terms in the word2vec EOS vocabulary.".format(len(word2vec_model.wv.vocab)))

def load_w2v(word2vec_model_file):
    # load the finished model from disk
    word2vec_model = Word2Vec.load(word2vec_model_file)
    word2vec_model.init_sims(replace=True)
    return word2vec_model

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


In [62]:
%%time

# Load EOS processed corpus
corpus_path = 'data/eos/ngram/bigram_transformed_docs_%s.gz' % 'all'
print(corpus_path)

word2vec_model_file = 'data/eos/word2vec_model_all.model'

generate_w2v(corpus_path, word2vec_model_file)

2017-06-29 18:35:41,732 : INFO : collecting all words and their counts
2017-06-29 18:35:41,734 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


data/eos/ngram/bigram_transformed_docs_all.gz
starting epoche 18:35:41


2017-06-29 18:35:42,578 : INFO : PROGRESS: at sentence #10000, processed 3121030 words, keeping 181409 word types
2017-06-29 18:35:43,399 : INFO : PROGRESS: at sentence #20000, processed 6246338 words, keeping 268855 word types
2017-06-29 18:35:44,260 : INFO : PROGRESS: at sentence #30000, processed 9444411 words, keeping 394971 word types
2017-06-29 18:35:45,118 : INFO : PROGRESS: at sentence #40000, processed 12668431 words, keeping 427671 word types
2017-06-29 18:35:45,935 : INFO : PROGRESS: at sentence #50000, processed 15833280 words, keeping 429247 word types
2017-06-29 18:35:46,730 : INFO : PROGRESS: at sentence #60000, processed 18929702 words, keeping 429877 word types
2017-06-29 18:35:47,606 : INFO : PROGRESS: at sentence #70000, processed 22160773 words, keeping 484261 word types
2017-06-29 18:35:48,510 : INFO : PROGRESS: at sentence #80000, processed 25390138 words, keeping 544776 word types
2017-06-29 18:35:49,335 : INFO : PROGRESS: at sentence #90000, processed 28427432 w

2017-06-29 18:36:44,815 : INFO : PROGRESS: at 1.84% examples, 683407 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:45,818 : INFO : PROGRESS: at 1.94% examples, 684225 words/s, in_qsize 10, out_qsize 0
2017-06-29 18:36:46,821 : INFO : PROGRESS: at 2.04% examples, 684083 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:47,838 : INFO : PROGRESS: at 2.14% examples, 683880 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:48,845 : INFO : PROGRESS: at 2.24% examples, 684979 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:49,850 : INFO : PROGRESS: at 2.34% examples, 685436 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:36:50,863 : INFO : PROGRESS: at 2.44% examples, 685365 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:51,873 : INFO : PROGRESS: at 2.54% examples, 686098 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:36:52,895 : INFO : PROGRESS: at 2.64% examples, 685454 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:36:53,903 : INFO : PROGRESS: at 2.74% examples, 686145 word

2017-06-29 18:38:05,873 : INFO : PROGRESS: at 9.87% examples, 687767 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:38:06,883 : INFO : PROGRESS: at 9.97% examples, 687810 words/s, in_qsize 10, out_qsize 0
2017-06-29 18:38:07,901 : INFO : PROGRESS: at 10.06% examples, 687591 words/s, in_qsize 14, out_qsize 0
2017-06-29 18:38:08,902 : INFO : PROGRESS: at 10.16% examples, 687752 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:38:09,904 : INFO : PROGRESS: at 10.25% examples, 687900 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:38:10,917 : INFO : PROGRESS: at 10.35% examples, 688031 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:38:11,924 : INFO : PROGRESS: at 10.45% examples, 687954 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:38:12,939 : INFO : PROGRESS: at 10.55% examples, 688022 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:38:13,939 : INFO : PROGRESS: at 10.65% examples, 688188 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:38:14,944 : INFO : PROGRESS: at 10.75% examples, 688

2017-06-29 18:39:25,827 : INFO : PROGRESS: at 18.86% examples, 690777 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:39:26,838 : INFO : PROGRESS: at 18.96% examples, 690773 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:39:27,851 : INFO : PROGRESS: at 19.06% examples, 690815 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:39:28,851 : INFO : PROGRESS: at 19.16% examples, 690957 words/s, in_qsize 10, out_qsize 0
2017-06-29 18:39:29,858 : INFO : PROGRESS: at 19.25% examples, 690868 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:39:30,873 : INFO : PROGRESS: at 19.35% examples, 690964 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:39:31,888 : INFO : PROGRESS: at 19.45% examples, 690926 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:39:32,890 : INFO : PROGRESS: at 19.54% examples, 691061 words/s, in_qsize 12, out_qsize 1
2017-06-29 18:39:33,905 : INFO : PROGRESS: at 19.64% examples, 691143 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:39:34,908 : INFO : PROGRESS: at 19.74% examples, 6

2017-06-29 18:40:45,785 : INFO : PROGRESS: at 26.87% examples, 694008 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:40:46,802 : INFO : PROGRESS: at 26.96% examples, 694005 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:40:47,822 : INFO : PROGRESS: at 27.06% examples, 693957 words/s, in_qsize 12, out_qsize 1
2017-06-29 18:40:48,832 : INFO : PROGRESS: at 27.16% examples, 694011 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:40:49,843 : INFO : PROGRESS: at 27.27% examples, 693989 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:40:50,855 : INFO : PROGRESS: at 27.36% examples, 693993 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:40:51,863 : INFO : PROGRESS: at 27.46% examples, 694030 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:40:52,881 : INFO : PROGRESS: at 27.56% examples, 694035 words/s, in_qsize 12, out_qsize 1
2017-06-29 18:40:53,886 : INFO : PROGRESS: at 27.66% examples, 694059 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:40:54,893 : INFO : PROGRESS: at 27.76% examples, 6

2017-06-29 18:42:05,722 : INFO : PROGRESS: at 35.78% examples, 694926 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:42:06,737 : INFO : PROGRESS: at 35.88% examples, 694918 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:42:07,749 : INFO : PROGRESS: at 36.00% examples, 694942 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:42:08,754 : INFO : PROGRESS: at 36.11% examples, 695019 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:42:09,756 : INFO : PROGRESS: at 36.21% examples, 694979 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:42:10,763 : INFO : PROGRESS: at 36.32% examples, 694991 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:42:11,766 : INFO : PROGRESS: at 36.43% examples, 695039 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:42:12,774 : INFO : PROGRESS: at 36.54% examples, 695036 words/s, in_qsize 11, out_qsize 1
2017-06-29 18:42:13,779 : INFO : PROGRESS: at 36.64% examples, 695119 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:42:14,781 : INFO : PROGRESS: at 36.75% examples, 6

2017-06-29 18:43:25,425 : INFO : PROGRESS: at 43.83% examples, 696931 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:26,443 : INFO : PROGRESS: at 43.94% examples, 696915 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:27,474 : INFO : PROGRESS: at 44.05% examples, 696931 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:28,480 : INFO : PROGRESS: at 44.16% examples, 696959 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:29,492 : INFO : PROGRESS: at 44.26% examples, 696965 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:43:30,504 : INFO : PROGRESS: at 44.37% examples, 696885 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:31,509 : INFO : PROGRESS: at 44.47% examples, 696878 words/s, in_qsize 11, out_qsize 2
2017-06-29 18:43:32,526 : INFO : PROGRESS: at 44.59% examples, 696877 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:43:33,542 : INFO : PROGRESS: at 44.69% examples, 696851 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:43:34,549 : INFO : PROGRESS: at 44.80% examples, 6

2017-06-29 18:44:45,322 : INFO : PROGRESS: at 52.15% examples, 697815 words/s, in_qsize 12, out_qsize 1
2017-06-29 18:44:46,335 : INFO : PROGRESS: at 52.28% examples, 697829 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:44:47,350 : INFO : PROGRESS: at 52.41% examples, 697805 words/s, in_qsize 12, out_qsize 1
2017-06-29 18:44:48,357 : INFO : PROGRESS: at 52.54% examples, 697821 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:44:49,362 : INFO : PROGRESS: at 52.67% examples, 697824 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:44:50,365 : INFO : PROGRESS: at 52.80% examples, 697859 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:44:51,372 : INFO : PROGRESS: at 52.92% examples, 697840 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:44:52,375 : INFO : PROGRESS: at 53.05% examples, 697854 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:44:53,389 : INFO : PROGRESS: at 53.18% examples, 697849 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:44:54,393 : INFO : PROGRESS: at 53.31% examples, 6

2017-06-29 18:46:05,195 : INFO : PROGRESS: at 60.97% examples, 698766 words/s, in_qsize 14, out_qsize 0
2017-06-29 18:46:06,200 : INFO : PROGRESS: at 61.07% examples, 698828 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:46:07,218 : INFO : PROGRESS: at 61.17% examples, 698857 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:46:08,230 : INFO : PROGRESS: at 61.27% examples, 698890 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:46:09,235 : INFO : PROGRESS: at 61.36% examples, 698928 words/s, in_qsize 10, out_qsize 0
2017-06-29 18:46:10,266 : INFO : PROGRESS: at 61.46% examples, 698942 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:46:11,274 : INFO : PROGRESS: at 61.56% examples, 698975 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:46:12,301 : INFO : PROGRESS: at 61.67% examples, 699011 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:46:13,307 : INFO : PROGRESS: at 61.77% examples, 699071 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:46:14,312 : INFO : PROGRESS: at 61.87% examples, 6

2017-06-29 18:47:24,954 : INFO : PROGRESS: at 69.06% examples, 699950 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:47:25,978 : INFO : PROGRESS: at 69.15% examples, 699935 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:47:26,985 : INFO : PROGRESS: at 69.25% examples, 699949 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:47:27,987 : INFO : PROGRESS: at 69.36% examples, 699953 words/s, in_qsize 11, out_qsize 1
2017-06-29 18:47:28,996 : INFO : PROGRESS: at 69.46% examples, 699975 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:47:30,002 : INFO : PROGRESS: at 69.55% examples, 699903 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:47:31,007 : INFO : PROGRESS: at 69.66% examples, 699937 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:47:32,029 : INFO : PROGRESS: at 69.76% examples, 699908 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:47:33,036 : INFO : PROGRESS: at 69.86% examples, 699927 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:47:34,037 : INFO : PROGRESS: at 69.96% examples, 6

2017-06-29 18:48:44,767 : INFO : PROGRESS: at 78.14% examples, 700331 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:48:45,776 : INFO : PROGRESS: at 78.24% examples, 700334 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:48:46,798 : INFO : PROGRESS: at 78.34% examples, 700344 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:48:47,801 : INFO : PROGRESS: at 78.45% examples, 700370 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:48:48,809 : INFO : PROGRESS: at 78.55% examples, 700367 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:48:49,822 : INFO : PROGRESS: at 78.65% examples, 700379 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:48:50,836 : INFO : PROGRESS: at 78.75% examples, 700383 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:48:51,848 : INFO : PROGRESS: at 78.85% examples, 700418 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:48:52,851 : INFO : PROGRESS: at 78.95% examples, 700413 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:48:53,863 : INFO : PROGRESS: at 79.05% examples, 7

2017-06-29 18:50:04,499 : INFO : PROGRESS: at 86.24% examples, 701471 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:50:05,503 : INFO : PROGRESS: at 86.34% examples, 701485 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:06,503 : INFO : PROGRESS: at 86.44% examples, 701493 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:07,516 : INFO : PROGRESS: at 86.54% examples, 701484 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:08,521 : INFO : PROGRESS: at 86.64% examples, 701483 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:50:09,529 : INFO : PROGRESS: at 86.74% examples, 701476 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:50:10,533 : INFO : PROGRESS: at 86.84% examples, 701473 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:11,557 : INFO : PROGRESS: at 86.95% examples, 701492 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:12,568 : INFO : PROGRESS: at 87.04% examples, 701487 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:50:13,581 : INFO : PROGRESS: at 87.15% examples, 7

2017-06-29 18:51:24,306 : INFO : PROGRESS: at 95.15% examples, 701694 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:51:25,319 : INFO : PROGRESS: at 95.27% examples, 701683 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:26,324 : INFO : PROGRESS: at 95.38% examples, 701696 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:27,359 : INFO : PROGRESS: at 95.49% examples, 701692 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:28,368 : INFO : PROGRESS: at 95.60% examples, 701688 words/s, in_qsize 11, out_qsize 0
2017-06-29 18:51:29,372 : INFO : PROGRESS: at 95.71% examples, 701691 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:30,382 : INFO : PROGRESS: at 95.81% examples, 701667 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:31,399 : INFO : PROGRESS: at 95.93% examples, 701676 words/s, in_qsize 13, out_qsize 0
2017-06-29 18:51:32,412 : INFO : PROGRESS: at 96.04% examples, 701678 words/s, in_qsize 12, out_qsize 0
2017-06-29 18:51:33,418 : INFO : PROGRESS: at 96.14% examples, 7

Finished epoche 18:52:12
1 training epochs so far
440,124 terms in the word2vec EOS vocabulary.
CPU times: user 1h 48min 19s, sys: 7.92 s, total: 1h 48min 27s
Wall time: 16min 31s


In [63]:
%%time

word2vec_model = load_w2v(word2vec_model_file)

2017-06-29 22:01:43,923 : INFO : loading Word2Vec object from data/eos/word2vec_model_all.model
2017-06-29 22:01:44,782 : INFO : loading wv recursively from data/eos/word2vec_model_all.model.wv.* with mmap=None
2017-06-29 22:01:44,784 : INFO : loading syn0 from data/eos/word2vec_model_all.model.wv.syn0.npy with mmap=None
2017-06-29 22:01:44,835 : INFO : setting ignored attribute syn0norm to None
2017-06-29 22:01:44,838 : INFO : loading syn1neg from data/eos/word2vec_model_all.model.syn1neg.npy with mmap=None
2017-06-29 22:01:44,879 : INFO : setting ignored attribute cum_table to None
2017-06-29 22:01:44,880 : INFO : loaded data/eos/word2vec_model_all.model
2017-06-29 22:01:45,658 : INFO : precomputing L2-norms of word weight vectors


CPU times: user 3.38 s, sys: 120 ms, total: 3.5 s
Wall time: 3.49 s


In [64]:

def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in word2vec_model.most_similar(positive=[token], topn=topn):

        print ("{:20} {}".format(word, round(similarity, 3)))

In [75]:
get_related_terms(u'economic')

bilateral_interaction 0.734
b315bffa80685f5c     0.725
economy              0.719
abu_dhabidepartment  0.716
topilin              0.705
socio_economic       0.698
problem_interrelate  0.696
dawei_special        0.694
massively_deepen     0.694
latvian_presidency   0.692


In [66]:
get_related_terms(u'terror')

terrorist            0.802
via_facebook,[186    0.743
moi_karzakan         0.74
joe_pozell           0.733
terrorism’           0.73
ibda                 0.708
terrorist_organization 0.703
behave_unjust        0.7
thus_neutralize      0.699
milita               0.698


In [67]:
get_related_terms(u'baghdad')

iraqi                0.788
erbil                0.75
tikrit               0.748
anbar_province       0.731
arbil                0.712
anbar                0.711
kurds                0.71
kurdish              0.71
irbil                0.707
shi'ite_militia      0.706


In [68]:
get_related_terms(u'government')

authority            0.707
spokesman_bülent     0.661
twitter_@gregatthetrib 0.646
sharon_chepchirchir  0.646
unremitting          0.643
kant_celebration     0.64
president_abdrabbo   0.638
kompass              0.634
kurtulmuş_justice    0.633
governments’         0.627


In [69]:
get_related_terms(u'maliki')

nouri_al             0.772
nakedly_sectarian    0.7
dispassionate_observer 0.699
favor_shiites        0.697
yam_matir            0.684
secessionist_aspiration 0.675
baathists            0.663
pregent              0.663
nuri_al              0.66
sectarian_tone       0.659


In [70]:
get_related_terms(u'russia')

russian              0.822
moscow               0.803
james_slavyanski     0.788
ukraine              0.777
moldova_georgia      0.764
lukashenko           0.757
belarussian          0.754
crimea               0.743
putin                0.742
rossiiskaya_gazeta   0.738


In [71]:
get_related_terms(u'health')

mohs                 0.758
marleen_temmerman    0.751
central_labcentral   0.744
care_corporationprimary 0.738
dustin_duncan        0.736
saleha_complementary 0.726
joins_allegheny      0.726
adolescent_reproductive 0.725
survey_2008/09       0.72
dosimetrist          0.72


In [72]:
def word_algebra(add=[], subtract=[], topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = word2vec_model.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print (term)

In [73]:
word_algebra(add=[u'sanction', u'syria'], topn=5)

curbing
sanction_impose
vladimir_chizhov
young_tajiks
russia


In [74]:
word_algebra(add=[u'economy', u'syria'], subtract=[u'asad'], topn=5)

economic
reckless_borrowing
recession
discontent_domestically
economic_growth


# Visualization TSNE

In [45]:
from sklearn.manifold import TSNE

In [46]:

# build a list of the terms, integer indices,
# and term counts from the word2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in word2vec_model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda tup: -tup[2])


# # unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# # create a DataFrame with the food2vec vectors as data,
# # and the terms as row labels
word_vectors = pd.DataFrame(word2vec_model.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
say,-0.007175,0.080072,0.187092,0.055488,0.072249,0.112369,-0.107836,0.025000,0.024303,-0.082460,...,0.062865,-0.056757,0.230199,-0.031412,0.044573,0.007744,-0.136562,0.040573,0.050124,0.042578
istanbul,-0.036135,-0.015310,0.012111,0.015122,-0.180252,0.021894,-0.136873,-0.024661,0.058580,0.017153,...,0.182467,0.024780,-0.060922,-0.111213,-0.213065,0.114228,-0.037462,0.097068,0.024201,-0.113160
turkey,-0.127895,-0.050788,0.027706,0.013913,-0.095069,0.160910,-0.187430,0.106380,0.148729,0.015136,...,0.110936,-0.003229,0.051678,0.027611,-0.023469,0.156915,-0.261858,-0.079094,-0.203174,0.066948
will,0.026403,0.027060,0.144830,-0.107578,0.230813,-0.003298,-0.129703,0.029644,0.103363,-0.141433,...,0.006640,0.081921,0.105053,-0.017269,-0.124522,0.011551,0.012229,-0.087602,-0.247321,0.042051
year,0.035458,0.050516,0.003135,-0.016043,0.009039,0.092902,-0.217881,0.175122,0.065804,0.135953,...,-0.003752,-0.058563,0.076364,-0.081817,-0.034305,0.090204,-0.178993,-0.109481,-0.125844,-0.151508
turkish,0.049734,-0.085024,0.077718,0.060094,-0.141939,-0.011339,-0.177154,0.130250,0.001267,-0.054835,...,-0.013175,0.109169,0.064726,0.029496,0.082840,0.146370,-0.238414,0.068515,-0.092492,-0.134425
country,-0.172656,0.012565,0.096140,0.053815,0.065204,0.061690,-0.055476,-0.091359,0.053572,0.032707,...,0.035081,0.212101,0.076298,-0.029617,-0.074905,0.235771,-0.187836,-0.044306,0.002797,0.021783
also,0.047957,0.077703,0.071980,0.029916,0.096666,0.091557,-0.205554,-0.027032,0.047752,-0.031062,...,0.120670,-0.098722,0.163418,0.063945,0.104717,0.193466,-0.004166,0.165809,0.002308,-0.031382
report,-0.104857,-0.016275,-0.003608,-0.017703,-0.012306,0.118620,-0.117260,0.130901,0.242188,0.102539,...,0.060643,0.036640,0.058216,-0.055098,0.177434,-0.162351,-0.064648,0.004424,0.016056,0.135426
one,0.151683,0.058006,-0.066712,-0.003021,-0.066153,0.086485,-0.095669,0.087823,-0.015020,-0.040060,...,-0.011489,0.059597,0.096439,0.071556,-0.166881,-0.037777,-0.226817,-0.043788,0.003687,-0.103469


In [47]:
# tsne_input = word_vectors.drop(spacy.en.English.Defaults.stop_words, errors=u'ignore')
tsne_input = word_vectors.head(5000)


In [48]:
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
say,-0.007175,0.080072,0.187092,0.055488,0.072249,0.112369,-0.107836,0.025,0.024303,-0.08246,...,0.062865,-0.056757,0.230199,-0.031412,0.044573,0.007744,-0.136562,0.040573,0.050124,0.042578
istanbul,-0.036135,-0.01531,0.012111,0.015122,-0.180252,0.021894,-0.136873,-0.024661,0.05858,0.017153,...,0.182467,0.02478,-0.060922,-0.111213,-0.213065,0.114228,-0.037462,0.097068,0.024201,-0.11316
turkey,-0.127895,-0.050788,0.027706,0.013913,-0.095069,0.16091,-0.18743,0.10638,0.148729,0.015136,...,0.110936,-0.003229,0.051678,0.027611,-0.023469,0.156915,-0.261858,-0.079094,-0.203174,0.066948
will,0.026403,0.02706,0.14483,-0.107578,0.230813,-0.003298,-0.129703,0.029644,0.103363,-0.141433,...,0.00664,0.081921,0.105053,-0.017269,-0.124522,0.011551,0.012229,-0.087602,-0.247321,0.042051
year,0.035458,0.050516,0.003135,-0.016043,0.009039,0.092902,-0.217881,0.175122,0.065804,0.135953,...,-0.003752,-0.058563,0.076364,-0.081817,-0.034305,0.090204,-0.178993,-0.109481,-0.125844,-0.151508


In [49]:
tsne_filepath = 'data/eos/tsne/tsne_model'
tsne_vectors_filepath = 'data/eos/tsne/tsne_vectors.npy'

In [50]:
%%time


if 1 == 1:
    
    tsne = TSNE()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne, f)
        
    print('done...')

    pd.np.save(tsne_vectors_filepath, tsne_vectors)
    

# with open(tsne_filepath) as f:
#     tsne = pickle.load(f)
    
# tsne_vectors = pd.np.load(tsne_vectors_filepath)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

done...
CPU times: user 51.2 s, sys: 3.04 s, total: 54.2 s
Wall time: 53.6 s


In [51]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
say,0.790907,2.216428
istanbul,1.65724,9.019597
turkey,-5.158473,7.759974
will,-5.248469,-3.632023
year,-5.759462,6.261718


In [52]:
tsne_vectors[u'word'] = tsne_vectors.index

In [53]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

output_notebook()

In [54]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, resize, reset'),
                   active_scroll=u'wheel_zoom')

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);

In [50]:
# Another view
# X = word2vec_model[word2vec_model.wv.vocab]

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
# plt.show()