# Visualisation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import collections

from sklearn.manifold import TSNE 

from gensim.models import Word2Vec

In [None]:
# Load the three previously trained Word2Vec models from the working
# directory; the file names double as the variable names used below.
model_rise, model_stable, model_peak = (
    Word2Vec.load(path) for path in ('model_rise', 'model_stable', 'model_peak')
)

In [None]:
# function to align the models
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Procrustes-align two gensim word2vec models so the same word can be compared across them.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
    Updated for newer gensim versions following a patch by Richard So (https://twitter.com/richardjeanso).

    Steps:
      1. Intersect the vocabularies of both models (see `intersection_align_gensim`);
         this modifies BOTH models in place so that their rows line up.
      2. Compute, from the unit-normalised vectors, the orthogonal matrix that maps
         `other_embed` onto `base_embed` (SVD of other^T . base).
      3. Replace `other_embed.wv.vectors` with the rotated vectors.

    Parameters:
      base_embed  -- model defining the target space; only its vocabulary/row order is changed.
      other_embed -- model that gets rotated into the space of `base_embed`.
      words       -- optional list/set; if set, the shared vocabulary is further
                     restricted to these words.

    Returns `other_embed` (the same object that was passed in, modified in place).
    """
    # make sure vocabulary and indices are aligned
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # The intersection step may have replaced `wv.vectors`. Norms cached before that
    # (e.g. by an earlier similarity query) would be stale, so force a recomputation
    # before asking for the normalised matrices.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # get the (normalized) embedding matrices
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # cross-covariance between the two (row-aligned) embedding matrices
    m = other_vecs.T.dot(base_vecs)
    # np.linalg.svd returns (u, s, vh); the singular values are not needed
    u, _, v = np.linalg.svd(m)
    # orthogonal map from the space of other_embed to the space of base_embed
    ortho = u.dot(v)
    # replace the original array with the rotated one
    other_embed.wv.vectors = other_embed.wv.vectors.dot(ortho)

    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect two gensim word2vec models, m1 and m2, in place.

    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    After the call, for both models:
        -- `wv.vectors` holds only the rows of the shared vocabulary, so that
           row i of m1.wv.vectors is for the same word as row i of m2.wv.vectors
        -- `wv.index_to_key` and `wv.key_to_index` describe that new row order

    The models are left untouched only when nothing has to be dropped AND both
    already list their keys in the same order; identical vocabularies stored in a
    different order are re-ordered like any other pair.

    NOTE(review): only `vectors`, `key_to_index` and `index_to_key` are rewritten here.
    Other per-key data (e.g. the "count" attribute read via `get_vecattr`, cached norms)
    is not re-indexed -- presumably stale afterwards; verify before relying on it.

    Returns the tuple (m1, m2) -- the same objects that were passed in.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # No alignment is necessary only if no word has to be removed from either model
    # and the rows are already in the same order. Equal vocabularies alone are not
    # enough: row i must refer to the same word in both models.
    nothing_to_drop = not (vocab_m1 - common_vocab) and not (vocab_m2 - common_vocab)
    same_row_order = list(m1.wv.index_to_key) == list(m2.wv.index_to_key)
    if nothing_to_drop and same_row_order:
        return (m1, m2)

    # Otherwise sort by frequency (summed for both)
    common_vocab = sorted(
        common_vocab,
        key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"),
        reverse=True,
    )

    # Then for each model...
    for m in (m1, m2):
        # Keep only the rows of the common vocabulary, in the new order.
        # (key_to_index still holds the OLD positions at this point.)
        indices = [m.wv.key_to_index[w] for w in common_vocab]
        m.wv.vectors = np.asarray(m.wv.vectors)[indices]

        # Replace the key <-> index mappings with ones matching the new rows.
        # Each model gets its own list/dict so later edits to one do not leak into the other.
        m.wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.wv.index_to_key = list(common_vocab)

        print(len(m.wv.key_to_index), len(m.wv.vectors))

    return (m1, m2)

In [None]:
# Make two independent copies of model_rise to use as alignment bases, so that the
# vocabulary intersection performed during alignment does not modify the original.
#
# A deep copy is used instead of building a fresh Word2Vec and calling
# build_vocab()/reset_from(): build_vocab() expects an iterable of tokenised
# sentences (not a set of words), and reset_from() re-initialises the weights, so
# that approach does not carry over the trained vectors of model_rise.
import copy

model_w2v_new = copy.deepcopy(model_rise)
model_w2v_new2 = copy.deepcopy(model_rise)

In [None]:
# Align model_peak and model_stable with the copies of model_rise.
# Each call gets its own base copy because the alignment trims the base model's
# vocabulary to the words shared with the other model. The function returns its
# second argument, so the *_modified names refer to the same (now modified)
# objects as model_peak / model_stable.
model_peak_modified = smart_procrustes_align_gensim(model_w2v_new, model_peak)
model_stable_modified = smart_procrustes_align_gensim(model_w2v_new2, model_stable)

In [None]:
def fit_tsne(values, perplexity=30.0):
    """
    Project a collection of vectors to 2-D with t-SNE.

    Parameters:
      values     -- sequence (list or array) of equally sized vectors, one per sample.
      perplexity -- requested t-SNE perplexity (30.0 is the scikit-learn default).
                    scikit-learn requires the perplexity to be smaller than the
                    number of samples, so the value actually used is capped at
                    n_samples - 1.

    Returns the fitted coordinates as returned by `TSNE.fit_transform`
    (one 2-D point per input vector), or None when `values` is None or empty.
    A single sample cannot satisfy the perplexity constraint; in that case the
    error raised by scikit-learn is propagated.
    """
    # len() instead of truthiness: `not values` is ambiguous for NumPy arrays
    if values is None or len(values) == 0:
        return None

    mat = np.asarray(values)

    # keep the perplexity strictly below the number of samples
    n_samples = mat.shape[0]
    effective_perplexity = float(min(perplexity, max(n_samples - 1, 1)))

    # fixed random_state so repeated runs give the same layout
    model = TSNE(
        n_components=2,
        random_state=0,
        learning_rate=150,
        init='pca',
        perplexity=effective_perplexity,
    )
    return model.fit_transform(mat)

In [None]:
def get_time_sims(models, word1):
    """
    Collect the nearest neighbours of `word1` in every model.

    `models` maps a label (here: a year) to a model. For each model the 10 most
    similar words to `word1` are requested and those with a similarity strictly
    above 0.3 are kept.

    Returns two dicts keyed by "<neighbour>|<label>":
      -- the neighbour's vector in that model
      -- its similarity to `word1` in that model
    """
    vectors_by_label = {}
    similarity_by_label = {}

    for year, embed in models.items():
        neighbours = embed.wv.most_similar(word1, topn=10)
        for neighbour, score in neighbours:
            # drop weakly related neighbours
            if not score > 0.3:
                continue
            label = "%s|%s" % (neighbour, year)
            vectors_by_label[label] = embed.wv[neighbour]
            similarity_by_label[label] = score

    return vectors_by_label, similarity_by_label

In [None]:
def plot_words(word1, words, fitted, cmap, sims):
    """
    Write every word as a text label at its 2-D position on the current figure.

    Parameters:
      word1  -- the target word; its occurrences are drawn in black at a fixed
                size, labelled "<word> (<decade>)".
      words  -- labels of the form "<word>|<decade>"; words[i] belongs to fitted[i].
      fitted -- array of 2-D coordinates, one row per entry of `words`.
      cmap   -- callable mapping a number to a colour; used for the neighbours.
      sims   -- dict from label to similarity; scales the font size of neighbours.

    Returns the list of (word, decade, point) tuples for the occurrences of
    `word1`, in the order they appear in `words`.
    """
    # TODO: remove this and just set the plot axes directly
    # (fully transparent scatter: nothing is drawn, only the points are registered)
    plt.scatter(fitted[:, 0], fitted[:, 1], alpha=0)
    plt.suptitle("%s" % word1, fontsize=30, y=0.1)
    plt.axis('off')

    # position on the colour map for each decade label
    colors = {1970: 0.2, 1990: 0.5, 2000: 0.7}

    annotations = []
    for i, key in enumerate(words):
        pt = fitted[i]
        ww, decade = [part.strip() for part in key.split("|")]

        if ww != word1:
            # a neighbour of word1: coloured by decade, sized by similarity
            color = cmap(colors[int(decade)])
            label = ww
            sizing = sims[key] * 30
        else:
            # word1 itself: remember its position for the trajectory arrows
            annotations.append((ww, decade, pt))
            label = word1 + " (" + decade + ")"
            color = 'black'
            sizing = 20
            print(ww + ": annotations")

        plt.text(pt[0], pt[1], label, size=int(sizing), color=color)

    return annotations

def plot_annotations(annotations):
    """
    Draw arrows linking the successive positions of the target word.

    `annotations` is a non-empty list of (word, decade, point) tuples. The list is
    sorted IN PLACE by decade, latest first (the decades are compared as given,
    i.e. as strings when they come from `plot_words`). An arrow is then drawn
    between each pair of consecutive entries, with its head on the later entry.
    """
    annotations.sort(key=lambda entry: entry[1], reverse=True)

    # start at the most recent position and walk back in time
    head = annotations[0][-1]
    for _word, _decade, tail in annotations[1:]:
        arrow_style = {
            'facecolor': 'grey',
            'shrink': 0.1,
            'alpha': 0.3,
            'width': 2,
            'headwidth': 10,
        }
        plt.annotate('', xy=head, xytext=tail, arrowprops=arrow_style)
        head = tail

In [None]:
# create the plot of the semantic changes of ecologie for the three epochs
# NOTE: the keys are the labels shown on the plot; plot_words only has colours
# defined for 1970, 1990 and 2000.
models = {1970: model_peak_modified, 1990: model_rise, 2000: model_stable_modified}
word1 = "écologie"

# nearest neighbours of word1 in every epoch
lookups, sims = get_time_sims(models, word1)

# add word1's own vector from every epoch so that its trajectory can be drawn
for decade, model in models.items():
    ww = "%s|%s" % (word1, decade)
    lookups[ww] = model.wv[word1]

words = list(lookups)
values = [lookups[word] for word in words]
fitted = fit_tsne(values)

# pyplot.get_cmap is used because matplotlib.cm.get_cmap (plt.cm.get_cmap) was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("jet", len(sims))

plt.figure(figsize=(15, 15))

annotations = plot_words(word1, words, fitted, cmap, sims)
if annotations:
    plot_annotations(annotations)

plt.savefig("%s_chain.png" % word1)