# Word2Vec

In [77]:
import pickle
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import spacy

# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook
# (the stop-word list used later is read from spacy.en directly) -- confirm
# whether this load is still needed.
nlp = spacy.load('en')

In [2]:
# Load the pre-processed (tokenized) EOS corpus from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by a trusted pipeline.
# Presumably a sequence of token lists, since it is fed straight to Word2Vec
# as the training sentences -- confirm against the tokenization step.
doc_filepath = 'data/eos/eos_tokenize_all.p'

# the context manager closes the file handle even if unpickling fails
with open(doc_filepath, "rb") as corpus_file:
    eos_corpus = pickle.load(corpus_file)

In [11]:
%%time

word2vec_model_file = 'data/eos/word2vec_model'

# Flip to True to retrain from scratch; by default the model previously
# saved at word2vec_model_file is reused.
RETRAIN_WORD2VEC = False

if RETRAIN_WORD2VEC:

    # initiate the model (builds the vocabulary) and perform the initial
    # round of training
    word2vec_model = Word2Vec(eos_corpus, size=100, window=5,
                              min_count=10, sg=1, workers=7)

    word2vec_model.save(word2vec_model_file)

    # perform another 11 rounds of training on the same corpus the model was
    # built from, checkpointing to disk after each round
    # NOTE(review): gensim >= 2.0 requires explicit total_examples= and
    # epochs= arguments to train() -- confirm against the installed version.
    for _ in range(1, 12):
        word2vec_model.train(eos_corpus)
        word2vec_model.save(word2vec_model_file)

# load the finished model from disk
word2vec_model = Word2Vec.load(word2vec_model_file)

# presumably precomputes the normalized vectors that are later read from
# wv.syn0norm -- confirm against the gensim documentation
word2vec_model.init_sims()

CPU times: user 3.07 s, sys: 56 ms, total: 3.13 s
Wall time: 3.13 s


In [15]:
# Report the training counter stored on the loaded model and the size of
# its vocabulary.
# NOTE(review): the recorded output shows a count of 1 even though the
# training cell intends 12 rounds -- confirm the saved model was fully trained.
print ("{} training epochs so far".format(word2vec_model.train_count))
print ("{:,} terms in the word2vec EOS vocabulary.".format(len(word2vec_model.wv.vocab)))


1 training epochs so far
274,448 terms in the word2vec EOS vocabulary.


In [22]:
def get_related_terms(token, topn=10):
    """
    Look up the topn terms most similar to token and print them as a
    formatted list.

    token: the term to query the word2vec model with.
    topn:  how many neighbours to print (default 10).

    Each printed line holds the neighbouring term, left-aligned in a
    20-character column, followed by its similarity score rounded to three
    decimals. Nothing is returned.

    NOTE(review): presumably raises KeyError when token is not in the
    vocabulary -- confirm against gensim.
    """

    # query through the model's .wv attribute, like the vocabulary lookups
    # elsewhere in this notebook; the model-level most_similar() is a
    # deprecated alias in gensim
    neighbours = word2vec_model.wv.most_similar(positive=[token], topn=topn)

    for word, similarity in neighbours:
        print ("{:20} {}".format(word, round(similarity, 3)))

In [24]:
# ten nearest neighbours of 'economic' in the embedding space
get_related_terms(u'economic')

economy              0.756
macroeconomic        0.706
growth               0.659
development          0.625
fiscal               0.614
political            0.611
financial            0.603
integration          0.598
geopolitical         0.595
stagnation           0.588


In [25]:
# ten nearest neighbours of 'terror' in the embedding space
get_related_terms(u'terror')

terrorist            0.826
terrorism            0.726
extremist            0.687
militant             0.575
islamist             0.559
taliban              0.541
deviant              0.54
jihadist             0.528
criminal             0.518
isi                  0.511


In [26]:
# ten nearest neighbours of 'baghdad' in the embedding space
get_related_terms(u'baghdad')

erbil                0.715
tikrit               0.709
iraqi                0.682
ramadi               0.664
kirkuk               0.628
baiji                0.624
irbil                0.624
anbar                0.616
kurdistan            0.606
iraq                 0.577


In [27]:
# ten nearest neighbours of 'government' in the embedding space
# NOTE(review): the recorded output contains function words ('however',
# 'also') and tokens such as 'adbokasiyang' and 'umgebaut', which suggests
# stop words and non-English text remain in the corpus -- worth checking the
# preprocessing.
get_related_terms(u'government')

authority            0.743
administration       0.685
however              0.661
adbokasiyang         0.621
regime               0.615
also                 0.615
policy               0.594
party                0.591
umgebaut             0.589
leadership           0.589


In [28]:
# ten nearest neighbours of 'islam' in the embedding space
get_related_terms(u'islam')

religion             0.759
ideology             0.733
christianity         0.716
religious            0.688
muslim               0.685
atheist              0.626
heretical            0.626
islamism             0.625
wahhabi              0.618
atheism              0.618


In [29]:
# ten nearest neighbours of 'health' in the embedding space
get_related_terms(u'health')

healthcare           0.693
medical              0.609
education            0.605
dental               0.592
safety               0.579
medicine             0.574
welfare              0.546
hygiene              0.544
patient              0.54
reproductive         0.54


In [31]:
def word_algebra(add=None, subtract=None, topn=1):
    """
    Combine the vectors associated with the words provided in add= and
    subtract=, look up the topn most similar terms to the combined vector,
    and print the result(s).

    add:      terms contributing positively (default: none).
    subtract: terms contributing negatively (default: none).
    topn:     how many of the closest terms to print (default 1).

    Prints one term per line, without similarity scores. Nothing is returned.
    """
    # None stands in for "no terms" so that no mutable list object is shared
    # between calls as a default argument
    positive_terms = [] if add is None else add
    negative_terms = [] if subtract is None else subtract

    # query through the model's .wv attribute, like the vocabulary lookups
    # elsewhere in this notebook; the model-level most_similar() is a
    # deprecated alias in gensim
    answers = word2vec_model.wv.most_similar(positive=positive_terms,
                                             negative=negative_terms,
                                             topn=topn)

    for term, _similarity in answers:
        print (term)

In [39]:
# combine the 'asad' and 'syria' vectors and list the five closest terms
word_algebra(add=[u'asad', u'syria'], topn=5)

isi
iraq
daesh
anbar
jihadist


In [35]:
# 'iraq' + 'syria' - 'asad': list the five terms closest to the combined vector
word_algebra(add=[u'iraq', u'syria'], subtract=[u'asad'], topn=5)

jihadist
territory
ukraine
isi
libya


# Visualization with t-SNE

In [83]:
from sklearn.manifold import TSNE

In [73]:

# Gather one (term, vector row index, corpus count) tuple per entry of the
# word2vec vocabulary and order the tuples from the most to the least
# frequent term.
ordered_vocab = sorted(
    ((term, voc.index, voc.count)
     for term, voc in word2vec_model.wv.vocab.items()),
    key=lambda entry: -entry[2])

# Split the sorted tuples into three parallel sequences:
# terms, integer indices, and counts.
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# Build a DataFrame with the word2vec vectors (read from wv.syn0norm) as
# data and the terms as row labels; rows are picked by index so that they
# follow the frequency ordering above.
word_vectors = pd.DataFrame(
    data=word2vec_model.wv.syn0norm[term_indices, :],
    index=ordered_terms)

word_vectors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
said,0.073273,-0.152771,0.112993,-0.145611,0.017726,-0.100282,-0.026614,0.043866,0.035405,-0.014093,...,0.071364,0.023075,-0.059849,0.026324,-0.096880,0.106746,0.015405,-0.015209,0.197077,0.029931
turkish,0.126684,0.078956,-0.077397,-0.050055,0.057956,-0.022201,0.188893,0.018750,0.063528,-0.088484,...,-0.053151,-0.107932,0.237991,-0.190947,0.194340,-0.065291,0.139929,-0.041589,0.022966,0.054521
will,0.202406,-0.035474,0.058627,-0.060378,0.147301,-0.138287,0.167897,0.001485,-0.045261,0.050237,...,-0.016491,0.020192,0.077705,0.131084,0.089308,-0.131150,0.008965,0.016316,0.093102,0.187674
year,0.001923,-0.134550,0.181071,-0.070295,-0.050678,0.019678,0.101979,0.221585,0.083154,-0.097437,...,0.021588,-0.001824,-0.093832,-0.010992,0.058595,-0.012500,0.171019,0.119047,0.061335,-0.145381
turkey,0.150996,0.070856,-0.034535,0.139338,0.150169,0.001822,0.179157,0.030174,0.113781,-0.122270,...,-0.132001,-0.089025,0.090802,-0.176142,0.075574,-0.025733,0.034253,0.034351,-0.035753,0.054726
istanbul,0.051401,-0.046313,-0.178318,0.148213,-0.010960,-0.089671,0.156021,-0.014605,-0.012607,0.045229,...,0.010588,-0.139720,0.025558,-0.115174,0.080670,-0.006435,0.075126,-0.020158,-0.039855,0.117749
new,0.187924,-0.091069,-0.008594,0.059864,-0.038952,0.038652,0.132265,-0.022117,-0.012576,0.080607,...,-0.041761,-0.042087,-0.090014,0.204173,0.090621,-0.165065,-0.066924,-0.077638,0.200017,0.062893
people,0.031357,-0.073807,0.110105,-0.011281,-0.070246,0.130107,0.190856,0.074936,0.058899,0.127288,...,0.047607,0.058610,-0.066910,0.101554,-0.002602,-0.016229,-0.017498,-0.017100,-0.082945,0.013153
police,0.153791,-0.066367,0.111115,-0.126523,-0.139653,0.033022,-0.065920,0.004863,0.060302,0.089704,...,-0.007164,0.132520,-0.014438,-0.078930,0.186755,0.054057,0.194822,0.061666,0.051771,-0.038177
also,0.068626,-0.114278,0.205317,-0.035774,0.021560,0.055329,0.073186,0.059801,-0.046261,-0.014221,...,-0.009778,-0.030128,0.055015,0.017723,0.023089,0.128793,-0.081448,-0.009987,0.154708,0.007938


In [79]:
# Remove spaCy's English stop words from the term vectors (stop words that
# are not in the index are skipped silently), then keep the first 5000
# remaining rows -- the most frequent terms, given how word_vectors is
# ordered -- as the input for t-SNE.
tsne_input = (word_vectors
              .drop(spacy.en.English.Defaults.stop_words, errors=u'ignore')
              .head(5000))


In [80]:
# preview the first rows of the t-SNE input; compared with the full
# word_vectors output, stop words such as 'will' no longer appear
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
said,0.073273,-0.152771,0.112993,-0.145611,0.017726,-0.100282,-0.026614,0.043866,0.035405,-0.014093,...,0.071364,0.023075,-0.059849,0.026324,-0.09688,0.106746,0.015405,-0.015209,0.197077,0.029931
turkish,0.126684,0.078956,-0.077397,-0.050055,0.057956,-0.022201,0.188893,0.01875,0.063528,-0.088484,...,-0.053151,-0.107932,0.237991,-0.190947,0.19434,-0.065291,0.139929,-0.041589,0.022966,0.054521
year,0.001923,-0.13455,0.181071,-0.070295,-0.050678,0.019678,0.101979,0.221585,0.083154,-0.097437,...,0.021588,-0.001824,-0.093832,-0.010992,0.058595,-0.0125,0.171019,0.119047,0.061335,-0.145381
turkey,0.150996,0.070856,-0.034535,0.139338,0.150169,0.001822,0.179157,0.030174,0.113781,-0.12227,...,-0.132001,-0.089025,0.090802,-0.176142,0.075574,-0.025733,0.034253,0.034351,-0.035753,0.054726
istanbul,0.051401,-0.046313,-0.178318,0.148213,-0.01096,-0.089671,0.156021,-0.014605,-0.012607,0.045229,...,0.010588,-0.13972,0.025558,-0.115174,0.08067,-0.006435,0.075126,-0.020158,-0.039855,0.117749


In [81]:
# destination of the pickled, fitted TSNE object
tsne_filepath = 'data/eos/tsne/tsne_model'
# destination of the t-SNE output coordinates, stored as a NumPy .npy file
tsne_vectors_filepath = 'data/eos/tsne/tsne_vectors.npy'

In [90]:
%%time


# Set to False to reload the previously saved t-SNE model and coordinates
# from disk instead of refitting.
RECOMPUTE_TSNE = True

if RECOMPUTE_TSNE:

    # t-SNE is stochastic; a fixed seed keeps the layout reproducible
    # between runs
    tsne = TSNE(random_state=0)
    tsne_coords = tsne.fit_transform(tsne_input.values)

    # persist the fitted estimator ...
    with open(tsne_filepath, 'wb') as f:
        pickle.dump(tsne, f)

    # ... and the coordinates; np.save is used directly because the pd.np
    # alias is deprecated and absent from newer pandas releases
    np.save(tsne_vectors_filepath, tsne_coords)

    print('done...')

else:

    # pickles must be opened in binary mode
    with open(tsne_filepath, 'rb') as f:
        tsne = pickle.load(f)

    tsne_coords = np.load(tsne_vectors_filepath)

# Label the coordinates with the terms they belong to; the raw array keeps
# its own name so that tsne_vectors always refers to the DataFrame.
tsne_vectors = pd.DataFrame(tsne_coords,
                            index=tsne_input.index,
                            columns=[u'x_coord', u'y_coord'])

done...
CPU times: user 49 s, sys: 3.68 s, total: 52.7 s
Wall time: 52 s


In [91]:
# preview the x/y coordinates assigned to the first terms
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
said,9.861187,1.544665
turkish,-7.126476,4.149703
year,-1.138467,9.993373
turkey,-7.104721,4.122176
istanbul,-5.193318,-5.395527


In [92]:
# copy the terms from the index into a regular column; the plot's hover
# tooltip refers to it as '@word'
# NOTE: this modifies tsne_vectors in place, so the preview shown earlier
# does not include the new column
tsne_vectors[u'word'] = tsne_vectors.index

In [94]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# direct Bokeh output to the notebook so that show() renders plots inline
output_notebook()

In [95]:
# wrap the coordinates DataFrame in a Bokeh data source
plot_data = ColumnDataSource(tsne_vectors)

# set up the figure: title, an 800x800 canvas, the toolbar tools,
# and scroll-wheel zooming active by default
tsne_plot = figure(
    title=u't-SNE Word Embeddings',
    plot_width=800,
    plot_height=800,
    tools=u'pan, wheel_zoom, box_zoom,box_select, resize, reset',
    active_scroll=u'wheel_zoom',
)

# strip the chart decoration: bigger title, no axes, no grid, no outline
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# show the word under the cursor when hovering over a point
tsne_plot.add_tools(HoverTool(tooltips=u'@word'))

# one translucent blue circle per word, outlined in black while hovered
tsne_plot.circle(
    u'x_coord', u'y_coord',
    source=plot_data,
    size=10,
    color=u'blue',
    fill_alpha=0.1,
    line_alpha=0.2,
    hover_line_color=u'black',
)

# render the plot
show(tsne_plot);

In [98]:
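# Another view (disabled): t-SNE over the entire vocabulary, drawn as a
# plain scatter plot.
# NOTE: the code below uses `plt`, which this notebook never imports;
# add `import matplotlib.pyplot as plt` before re-enabling it.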
# X = word2vec_model[word2vec_model.wv.vocab]

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X)

# plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
# plt.show()