# Word2Vec

In [14]:
import pickle
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import time
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load the pre-tokenized EOS corpus that an earlier step pickled to disk.
doc_filepath = 'data/eos/eos_tokenize_all.p'

# SECURITY: unpickling can execute arbitrary code, so only load files that
# were produced by a trusted pipeline.
# The context manager closes the file handle as soon as loading finishes.
with open(doc_filepath, "rb") as corpus_file:
    eos_corpus = pickle.load(corpus_file)

In [18]:
%%time

word2vec_model_file = 'data/eos/word2vec_model'

# Set to True to retrain the model from eos_corpus (slow) and overwrite the
# saved file; leave False to reuse the model already stored on disk.
retrain_word2vec = False

if retrain_word2vec:

    print("starting epoch 1 " + time.strftime("%c"))
    # Passing the corpus to the constructor builds the vocabulary and runs
    # the initial training straight away.
    # NOTE(review): gensim's default `iter` makes several passes over the
    # corpus here, so "epoch 1" is a label for this call — confirm.
    word2vec_model = Word2Vec(eos_corpus, size=100, window=5,
                              min_count=10, sg=1, workers=7)
    word2vec_model.save(word2vec_model_file)
    print("stop epoch 1 " + time.strftime("%c"))

    # Additional training rounds are currently disabled. The loop below is
    # kept for reference only: it refers to `trigram_sentences`, which is
    # not defined in this notebook.
#     for i in range(1, 2):
#         print("starting epoche " + time.strftime("%c"))
#         word2vec_model.train(trigram_sentences)
#         word2vec_model.save(word2vec_model_file)
#         print("finished epoche " + time.strftime("%c"))

else:
    # Load the finished model from disk.
    word2vec_model = Word2Vec.load(word2vec_model_file)
    # Precompute unit-length vectors. replace=True keeps only the normalized
    # vectors to save memory, so this loaded model cannot be trained further.
    # NOTE(review): the retrain branch does not call init_sims; cells that
    # read wv.syn0norm then depend on an earlier most_similar call — confirm.
    word2vec_model.init_sims(replace=True)

2017-06-05 00:19:40,038 : INFO : loading Word2Vec object from data/eos/word2vec_model
2017-06-05 00:19:42,031 : INFO : loading wv recursively from data/eos/word2vec_model.wv.* with mmap=None
2017-06-05 00:19:42,033 : INFO : loading syn0 from data/eos/word2vec_model.wv.syn0.npy with mmap=None
2017-06-05 00:19:42,069 : INFO : setting ignored attribute syn0norm to None
2017-06-05 00:19:42,070 : INFO : loading syn1neg from data/eos/word2vec_model.syn1neg.npy with mmap=None
2017-06-05 00:19:42,113 : INFO : setting ignored attribute cum_table to None
2017-06-05 00:19:42,113 : INFO : loaded data/eos/word2vec_model
2017-06-05 00:19:42,539 : INFO : precomputing L2-norms of word weight vectors


CPU times: user 3.18 s, sys: 104 ms, total: 3.28 s
Wall time: 3.27 s


In [19]:
# Report the model's recorded training counter.
# NOTE(review): train_count is presented as an epoch count here — verify
# that it counts epochs rather than calls to train().
trained_rounds = word2vec_model.train_count
print("{} training epochs so far".format(trained_rounds))

# Report the number of distinct terms in the model vocabulary.
vocabulary_size = len(word2vec_model.wv.vocab)
print("{:,} terms in the word2vec EOS vocabulary.".format(vocabulary_size))


1 training epochs so far
198,139 terms in the word2vec EOS vocabulary.


In [20]:
def get_related_terms(token, topn=10, model=None):
    """
    Look up the topn most similar terms to token
    and print them as a formatted list.

    Parameters
    ----------
    token : str
        Term to look up; it is passed as the single positive example.
    topn : int, optional
        How many neighbours to print (default 10).
    model : optional
        Any object exposing ``wv.most_similar(positive=..., topn=...)``.
        Defaults to the notebook-level ``word2vec_model``.

    Returns
    -------
    None
        One line per neighbour is printed: the term left-aligned in a
        20-character column, followed by the similarity rounded to 3 places.
    """
    if model is None:
        model = word2vec_model

    # Query the keyed vectors (`.wv`) directly; the same-named method on the
    # Word2Vec object is only a deprecated pass-through to it.
    neighbours = model.wv.most_similar(positive=[token], topn=topn)

    for word, similarity in neighbours:
        print("{:20} {}".format(word, round(similarity, 3)))

In [21]:
# Print the 10 nearest neighbours (the default topn) of "economic".
get_related_terms(u'economic')

economy              0.779
socio                0.72
macroeconomic        0.716
growth               0.705
dhabidepartment      0.7
escwa                0.697
newsflow             0.689
valovaya             0.687
councildubai         0.681
researchabu          0.674


In [22]:
# Print the 10 nearest neighbours (the default topn) of "terror".
get_related_terms(u'terror')

terrorist            0.832
terrorism            0.718
suspect              0.681
zayadah              0.68
extremist            0.675
qaeda                0.655
suspected            0.647
hildi                0.642
operative            0.63
counterintelligence  0.628


In [23]:
# Print the 10 nearest neighbours (the default topn) of "baghdad".
get_related_terms(u'baghdad')

kirkuk               0.76
erbil                0.752
tikrit               0.722
ramadi               0.705
irbil                0.705
shiite               0.703
anbar                0.699
iraqi                0.699
baiji                0.672
kurdistan            0.666


In [24]:
# Print the 10 nearest neighbours (the default topn) of "government".
get_related_terms(u'government')

anbari               0.701
authority            0.692
abdrabbu             0.666
overco               0.661
houhti               0.659
demobilize           0.656
antagonising         0.654
yemini               0.654
abdrabbo             0.654
bahgdadi             0.654


In [34]:
# Print the 10 nearest neighbours (the default topn) of "maliki".
get_related_terms(u'maliki')

nouri                0.643
matir                0.632
abadi                0.615
khedery              0.579
sahwat               0.565
barzani              0.563
mirghani             0.56
mosisili             0.557
saddam               0.551
baghdad              0.548


In [30]:
# Print the 10 nearest neighbours (the default topn) of "russia".
get_related_terms(u'russia')

moscow               0.749
ukraine              0.74
russian              0.717
kremlin              0.693
chizhov              0.689
crimea               0.677
belyavskiy           0.669
sanction             0.668
putin                0.659
bulbuloglu           0.659


In [26]:
# Print the 10 nearest neighbours (the default topn) of "health".
get_related_terms(u'health')

mohs                 0.759
organizationworld    0.74
healthcare           0.737
nams                 0.729
temmerman            0.727
corporationprimary   0.72
organisationworld    0.715
marleen              0.713
vch                  0.713
seha                 0.708


In [27]:
def word_algebra(add=None, subtract=None, topn=1, model=None):
    """
    Combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s).

    Parameters
    ----------
    add : list of str, optional
        Terms passed as positive examples. Omitted means an empty list.
    subtract : list of str, optional
        Terms passed as negative examples. Omitted means an empty list.
    topn : int, optional
        How many result terms to print (default 1).
    model : optional
        Any object exposing ``wv.most_similar(positive=..., negative=...,
        topn=...)``. Defaults to the notebook-level ``word2vec_model``.

    Returns
    -------
    None
        Each resulting term is printed on its own line; the similarity
        scores are not shown.
    """
    if model is None:
        model = word2vec_model

    # None sentinels replace the former mutable list defaults; arguments the
    # caller supplied are forwarded untouched.
    positive = [] if add is None else add
    negative = [] if subtract is None else subtract

    # Query the keyed vectors (`.wv`) directly; the same-named method on the
    # Word2Vec object is only a deprecated pass-through to it.
    answers = model.wv.most_similar(positive=positive, negative=negative, topn=topn)

    for term, _similarity in answers:
        print(term)

In [38]:
# Print the 5 terms closest to the combination of "sanction" and "syria".
word_algebra(add=[u'sanction', u'syria'], topn=5)

assad
nebenzya
yemini
bashar
russia


In [35]:
# Print the 5 terms closest to "economy" + "syria" with "asad" subtracted.
# NOTE(review): the model's own results elsewhere in this notebook spell the
# name "assad" — confirm that "asad" is the intended vocabulary token.
word_algebra(add=[u'economy', u'syria'], subtract=[u'asad'], topn=5)

recession
crisis
economic
neighbor
gdp


# Visualization with t-SNE

In [39]:
from sklearn.manifold import TSNE

In [40]:

# build a list of the terms, integer indices,
# and term counts from the word2vec model vocabulary
# Gather a (term, vector row index, corpus count) triple for every entry in
# the word2vec vocabulary, ordered so the most frequent terms come first.
ordered_vocab = sorted(
    ((term, entry.index, entry.count)
     for term, entry in word2vec_model.wv.vocab.items()),
    key=lambda record: record[2],
    reverse=True,
)

# Split the sorted triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Pull the normalized word2vec vectors in frequency order and wrap them in a
# DataFrame with one row per term, labelled by the term itself.
normalised_rows = word2vec_model.wv.syn0norm[term_indices, :]
word_vectors = pd.DataFrame(normalised_rows, index=ordered_terms)

word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
turkish,-0.078827,0.070714,0.024026,0.138423,-0.094456,-0.011151,-0.102013,-0.083826,0.071213,-0.116671,...,0.076054,-0.125058,0.157605,-0.032018,-0.125997,0.074511,0.074794,-0.157245,-0.017780,0.070153
year,-0.219870,-0.013753,0.019641,0.100880,-0.033665,0.105813,-0.074939,-0.006838,-0.171050,0.012083,...,0.141240,0.007454,0.149158,-0.013661,-0.135341,-0.082259,0.060020,-0.098443,-0.053506,0.057571
turkey,-0.006263,0.161830,-0.107294,0.060069,-0.087660,0.055797,-0.128926,0.041999,0.058590,0.063139,...,0.163418,0.070634,0.041025,-0.087431,-0.168170,0.144127,0.022171,-0.132714,-0.063018,0.086689
istanbul,-0.118134,0.134814,-0.093969,-0.099889,0.018552,0.111902,-0.078743,-0.157863,-0.047532,0.118658,...,0.073645,-0.058763,-0.027572,0.010259,-0.066042,-0.045763,0.256666,-0.025899,-0.029339,-0.091674
new,-0.099500,0.097055,0.078816,0.276175,0.048940,0.056429,0.007677,-0.052780,-0.047077,0.140533,...,-0.024441,0.035425,0.166962,0.119158,-0.117700,0.039618,0.045841,-0.105296,-0.086348,0.062850
people,-0.046724,0.067042,0.161898,0.208668,-0.205233,0.099430,-0.103537,0.085013,-0.110090,0.013468,...,0.023412,0.021725,-0.028448,0.034801,-0.206308,0.095699,0.148217,0.086840,-0.132639,0.033386
police,-0.037579,-0.142559,0.071765,0.147535,0.081322,0.192691,-0.102641,0.004516,0.019231,-0.189309,...,-0.053425,0.019515,-0.194051,-0.023045,-0.151280,0.115801,0.175470,-0.118989,-0.103323,-0.076425
also,-0.091963,0.125881,0.140217,0.194088,0.058186,-0.052110,0.137664,-0.024809,-0.004807,0.056104,...,0.141820,-0.070433,-0.076042,0.120715,-0.213564,0.015787,0.166490,-0.197439,-0.043185,0.002717
world,0.037457,0.103177,0.051663,0.097437,-0.037864,0.084888,0.090194,0.054884,0.019264,-0.086697,...,0.085249,-0.028642,0.007258,-0.035312,0.023884,-0.118658,0.192239,0.002899,-0.038640,-0.051965
first,0.153587,-0.034167,0.080341,0.080000,-0.110278,-0.003846,-0.005652,0.046344,-0.017842,0.095501,...,0.054686,-0.036023,0.236058,0.021184,-0.089929,0.078649,0.059962,-0.040731,-0.016288,-0.011572


In [42]:
# Disabled alternative that would drop stop words first:
# tsne_input = word_vectors.drop(spacy.en.English.Defaults.stop_words, errors=u'ignore')

# Keep only the first 5000 rows of word_vectors as the t-SNE input.
tsne_input = word_vectors.iloc[:5000]


In [43]:
# Preview the first rows of the t-SNE input frame.
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
turkish,-0.078827,0.070714,0.024026,0.138423,-0.094456,-0.011151,-0.102013,-0.083826,0.071213,-0.116671,...,0.076054,-0.125058,0.157605,-0.032018,-0.125997,0.074511,0.074794,-0.157245,-0.01778,0.070153
year,-0.21987,-0.013753,0.019641,0.10088,-0.033665,0.105813,-0.074939,-0.006838,-0.17105,0.012083,...,0.14124,0.007454,0.149158,-0.013661,-0.135341,-0.082259,0.06002,-0.098443,-0.053506,0.057571
turkey,-0.006263,0.16183,-0.107294,0.060069,-0.08766,0.055797,-0.128926,0.041999,0.05859,0.063139,...,0.163418,0.070634,0.041025,-0.087431,-0.16817,0.144127,0.022171,-0.132714,-0.063018,0.086689
istanbul,-0.118134,0.134814,-0.093969,-0.099889,0.018552,0.111902,-0.078743,-0.157863,-0.047532,0.118658,...,0.073645,-0.058763,-0.027572,0.010259,-0.066042,-0.045763,0.256666,-0.025899,-0.029339,-0.091674
new,-0.0995,0.097055,0.078816,0.276175,0.04894,0.056429,0.007677,-0.05278,-0.047077,0.140533,...,-0.024441,0.035425,0.166962,0.119158,-0.1177,0.039618,0.045841,-0.105296,-0.086348,0.06285


In [44]:
# Destination for the pickled, fitted TSNE object.
tsne_filepath = 'data/eos/tsne/tsne_model'
# Destination for the projected coordinates, stored as a .npy array.
tsne_vectors_filepath = 'data/eos/tsne/tsne_vectors.npy'

In [45]:
%%time


# Set to False to reuse the fitted t-SNE object and coordinates cached on
# disk instead of refitting.
recompute_tsne = True

if recompute_tsne:

    # Fixed seed so the 2-D layout is reproducible from run to run.
    tsne = TSNE(random_state=42)
    tsne_coords = tsne.fit_transform(tsne_input.values)

    # Persist the fitted estimator.
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne, f)

    print('done...')

    # Persist the projected coordinates. numpy is used directly because the
    # `pd.np` alias is deprecated in pandas.
    np.save(tsne_vectors_filepath, tsne_coords)

else:

    # Restore the cached estimator; pickles must be opened in binary mode.
    # SECURITY: only unpickle files written by this notebook.
    with open(tsne_filepath, 'rb') as f:
        tsne = pickle.load(f)

    tsne_coords = np.load(tsne_vectors_filepath)

# Label each coordinate pair with its term so downstream cells can look
# words up by index.
tsne_vectors = pd.DataFrame(tsne_coords,
                            index=pd.Index(tsne_input.index),
                            columns=[u'x_coord', u'y_coord'])

done...
CPU times: user 48.5 s, sys: 2.88 s, total: 51.3 s
Wall time: 50.7 s


In [46]:
# Preview the first rows of the t-SNE coordinate frame.
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
turkish,6.088056,-2.642254
year,-2.592765,0.92171
turkey,2.754402,-4.865135
istanbul,2.266259,-5.363695
new,7.043488,-6.207029


In [47]:
# Copy the index (the terms) into a regular "word" column; the plot's hover
# tooltip refers to it as @word.
tsne_vectors[u'word'] = tsne_vectors.index

In [48]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [49]:
# Expose the t-SNE DataFrame (coordinates plus the "word" column) to Bokeh.
plot_data = ColumnDataSource(tsne_vectors)

# Figure-level settings: title, size in pixels, toolbar, and the tool that
# is bound to the scroll wheel.
figure_options = dict(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=(u'pan, wheel_zoom, box_zoom,'
           u'box_select, resize, reset'),
    active_scroll=u'wheel_zoom',
)
tsne_plot = figure(**figure_options)

# Show the term under the cursor on roll-over.
word_hover = HoverTool(tooltips=u'@word')
tsne_plot.add_tools(word_hover)

# Each word becomes a faint blue circle that gains a black outline on hover.
circle_style = dict(
    color=u'blue',
    line_alpha=0.2,
    fill_alpha=0.1,
    size=10,
    hover_line_color=u'black',
)
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data, **circle_style)

# Strip the chart furniture: larger title, no axes, no grid, no outline.
tsne_plot.title.text_font_size = value(u'16pt')
for axis in (tsne_plot.xaxis, tsne_plot.yaxis):
    axis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# Render the figure; the trailing semicolon suppresses the cell's repr output.
show(tsne_plot);

In [50]:
# Another view
# X = word2vec_model[word2vec_model.wv.vocab]

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
# plt.show()