#### Cross Embedding Alignment

Below is code that aligns the dimensions of multiple embeddings arrayed over time or some other dimension, and allows identification of semantic change as the word vectors change their loadings for focal words. This code comes from the approach piloted at Stanford by William Hamilton, Daniel Jurafsky and Jure Leskovec [here](https://arxiv.org/pdf/1605.09096.pdf). 

In this case we train the models ourselves instead of using a pre-trained model, so this might take some time.

In [1]:
import pandas as pd
import gensim
import copy
import numpy as np

In [2]:
import gensim.models as models

In [3]:
def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Rotate `other_embed` into the vector space of `base_embed` (orthogonal Procrustes),
    so that the same word can be compared across the two models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>,
    updated for newer gensim following a patch by Richard So (https://twitter.com/richardjeanso).

    Steps:
      1. Reduce both models to their shared vocabulary via `intersection_align_gensim`
         (further restricted to `words` when it is given).
      2. Take the SVD of the product of the two length-normalised embedding matrices
         and build the rotation matrix from it.
      3. Overwrite `other_embed.vectors` with the rotated vectors.

    Both arguments are expected to expose `fill_norms`, `get_normed_vectors` and
    `vectors` (the gensim KeyedVectors interface). They may be modified in place
    by the vocabulary intersection. Returns `other_embed`.
    """
    # Shared vocabulary, same row order in both models.
    base_common, other_common = intersection_align_gensim(base_embed, other_embed, words=words)

    # The vocabulary step may have replaced the vector arrays, so recompute the norms.
    for keyed_vectors in (base_common, other_common):
        keyed_vectors.fill_norms(force=True)

    # Length-normalised embedding matrices.
    base_unit = base_common.get_normed_vectors()
    other_unit = other_common.get_normed_vectors()

    # Rotation matrix from the SVD of the cross-product of the two matrices.
    left, _, right = np.linalg.svd(other_unit.T.dot(base_unit))
    rotation = left.dot(right)

    # Apply the rotation to the vectors of the model being aligned.
    other_embed.vectors = other_embed.vectors.dot(rotation)

    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    """
    Intersect the vocabularies of two gensim KeyedVectors, m1 and m2, in place.

    Only the vocabulary shared by both models is kept.
    If 'words' is set (as list or set), the vocabulary is intersected with it as well.
    Keys are re-indexed from 0..N-1 in order of descending frequency
    (= sum of the "count" attribute from both m1 and m2), so that afterwards:
        -- row i of m1.vectors and row i of m2.vectors belong to the same key
        -- m.index_to_key[i] / m.key_to_index[key] describe that shared order
    The per-key "count" attribute is carried over to the new index. Other per-key
    attributes are NOT re-indexed (NOTE(review): confirm none are relied upon).
    Cached norms are cleared; call `fill_norms()` before using normed vectors.

    Returns the tuple (m1, m2) -- the same objects that were passed in.
    """

    # Get the vocab for each model
    vocab_m1 = set(m1.index_to_key)
    vocab_m2 = set(m2.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # Nothing to do only if both models hold exactly the common vocabulary AND
    # list it in the same order. Equal vocabulary sets alone do not guarantee
    # that row i refers to the same key in both models.
    same_vocab = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
    if same_vocab and list(m1.index_to_key) == list(m2.index_to_key):
        return (m1, m2)

    # Read the counts while the old indices are still valid.
    counts_m1 = {w: m1.get_vecattr(w, "count") for w in common_vocab}
    counts_m2 = {w: m2.get_vecattr(w, "count") for w in common_vocab}

    # Sort by frequency (summed for both)
    common_vocab = sorted(common_vocab, key=lambda w: counts_m1[w] + counts_m2[w], reverse=True)

    # Then for each model...
    for m, counts in ((m1, counts_m1), (m2, counts_m2)):
        # Keep only the rows of the common vocabulary, in the shared order
        indices = np.asarray([m.key_to_index[w] for w in common_vocab], dtype=np.intp)
        m.vectors = m.vectors[indices]

        # Replace the key <-> index mappings with the shared ordering
        m.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        m.index_to_key = list(common_vocab)

        # Norms cached for the old rows no longer match the new array
        m.norms = None

        # Re-attach each count at the key's new index
        for key in common_vocab:
            m.set_vecattr(key, "count", counts[key])

        print(len(m.key_to_index), len(m.vectors))

    return (m1, m2)

In [4]:
def compareModels(df, category, sort = True):
    """Train one word2vec model per category of `df` and align them pairwise.

    If you are using time as your category sorting is important.

    df       -- pandas DataFrame; the training sentences are read from its 'Body' column
    category -- name of the column whose distinct values define the sub-corpora
    sort     -- when True (default) categories are processed in sorted order,
                otherwise in order of first appearance in `df`

    Returns (embeddings_raw, embeddings_aligned):
      embeddings_raw[cat]     -- the trained Word2Vec model for that category
      embeddings_aligned[cat] -- list whose first element is that category's keyed
                                 vectors, followed by the result of aligning every
                                 category's keyed vectors in turn onto the previous entry

    The alignment works on each model's `.wv` keyed vectors (the alignment helpers
    use the KeyedVectors interface) and may modify them in place.
    """
    if sort:
        cats = sorted(set(df[category]))
    else:
        # dict.fromkeys de-duplicates while keeping first-appearance order
        cats = list(dict.fromkeys(df[category]))

    embeddings_raw = {}
    for cat in cats:
        #This can take a while
        print("Embedding {}".format(cat), end = '\r')
        subsetDF = df[df[category] == cat]
        #You might want to change the W2V parameters
        embeddings_raw[cat] = gensim.models.word2vec.Word2Vec(subsetDF['Body'].sum())#(subsetDF['normalized_sents'].sum())

    #These are much quicker
    embeddings_aligned = {}
    for catOuter in cats:
        aligned = [embeddings_raw[catOuter].wv]
        embeddings_aligned[catOuter] = aligned
        for catInner in cats:
            aligned.append(smart_procrustes_align_gensim(aligned[-1], embeddings_raw[catInner].wv))
    return embeddings_raw, embeddings_aligned

In [5]:
def compareModels_pretrained(embeddings_raw):
    """Align already-trained embeddings pairwise.

    embeddings_raw -- dict mapping a category label to a model accepted by
                      `smart_procrustes_align_gensim`

    Returns a dict with the same keys. Each value is a list that starts with the
    category's own model, followed by the result of aligning every model of
    `embeddings_raw` (in key order) onto the list's previous entry.
    """
    categories = list(embeddings_raw)
    embeddings_aligned = {}
    for outer in categories:
        chain = [embeddings_raw[outer]]
        embeddings_aligned[outer] = chain
        for inner in categories:
            # Each model is aligned onto whatever was appended last.
            chain.append(smart_procrustes_align_gensim(chain[-1], embeddings_raw[inner]))
    return embeddings_aligned

NOTE: here the ascoDF is a pandas DataFrame with normalized texts and a category column. The compareModels code uses this DataFrame. You can download the data [here](https://drive.google.com/file/d/1R9EiThdJQ3vY84xcoPJGUBeqvemu_o8L/view?usp=sharing).

If we already have pre-trained models, you can use the compareModels_pretrained function, which takes a dictionary in this format: {"category_0":gensim_model_0, "category_1":gensim_model_1, ...}

In [6]:
# Load the pre-trained vectors stored in binary word2vec format from the local file
w2vmodel_gnews_slim = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300-SLIM.bin', binary= True)

### Creating Utopian Embedding Model

In [7]:
import json

In [8]:
from gensim.models import Word2Vec

In [9]:
# Read as UTF-8 explicitly; without it open() falls back to the platform's default encoding
with open('Cleaned-Data/cleaned_texts_pdf.json', encoding='utf-8') as json_file:
    cleaned_texts_pdf = json.load(json_file)

In [10]:
# Read as UTF-8 explicitly; without it open() falls back to the platform's default encoding
with open('Cleaned-Data/cleaned_texts_epub_txt.json', encoding='utf-8') as json_file:
    cleaned_texts_epub_txt = json.load(json_file)

In [11]:
all_cleaned_texts_utopia = []

In [12]:
# Append every cleaned PDF text, in the order the keys are iterated
all_cleaned_texts_utopia.extend(cleaned_texts_pdf[title] for title in cleaned_texts_pdf)

In [13]:
# Append every cleaned EPUB/TXT text, in the order the keys are iterated
all_cleaned_texts_utopia.extend(cleaned_texts_epub_txt[title] for title in cleaned_texts_epub_txt)

In [14]:
# Train a word2vec model on the combined corpus: 300-dimensional vectors, context window of 15.
# NOTE(review): the Procrustes alignment multiplies by a square rotation matrix, so the
# dimensionality presumably has to match the pre-trained model ("negative300" in its file name) -- confirm.
w2vmodel_utopia = Word2Vec(
        all_cleaned_texts_utopia,
        vector_size=300,
        window=15)

In [15]:
w2vmodel_utopia.wv.most_similar("planet")

[('planets', 0.8591040372848511),
 ('galaxy', 0.8357036113739014),
 ('astronomers', 0.7555447816848755),
 ('worlds', 0.7504953742027283),
 ('k_pax', 0.7449854612350464),
 ('planetwide', 0.7405986785888672),
 ('earth', 0.7392650842666626),
 ('thalassa', 0.7343885898590088),
 ('planetary', 0.725549578666687),
 ('moons', 0.7245914340019226)]

### Bootstrapped model

In [16]:
book_lens = []

In [17]:
# Record the length of every text in the corpus
book_lens.extend(len(book) for book in all_cleaned_texts_utopia)

In [18]:
# seaborn is imported here because this cell runs before the file's later
# `import seaborn as sns` cell; without it the call fails with
# "NameError: name 'sns' is not defined".
import seaborn as sns

# Distribution of text lengths in the corpus
sns.distplot(book_lens)

In [None]:
from sklearn.utils import resample

In [None]:
# Draw a resampled version of the corpus for the bootstrapped model.
# NOTE(review): no random_state is passed, so the sample presumably differs between runs -- confirm.
bootstrap_texts = resample(all_cleaned_texts_utopia)

In [None]:
w2vmodel_utopia_b = Word2Vec(
        bootstrap_texts,
        vector_size=300,
        window=15)

In [None]:
w2vmodel_utopia_b.wv.most_similar("planet")

## Comparing with Google News

Note: we have to use the keyed vectors to get it to work. The returned dict is bloated, we only need one pair of aligned models.

In [None]:
w2vmodel_utopia_wv = w2vmodel_utopia.wv

In [None]:
model_dict = {"utopia":w2vmodel_utopia_wv, "google":w2vmodel_gnews_slim}

In [None]:
comparedEmbeddings = compareModels_pretrained(model_dict)

In [None]:
comparedEmbeddings

In [None]:
def embed_distance(word):
    """Cosine distance (1 - cosine similarity) of `word` between entries 2 and 1
    of `comparedEmbeddings['utopia']`.

    Reads the module-level `comparedEmbeddings`. Returns the value as a
    1x1 numpy array, as produced by scikit-learn's pairwise function.
    """
    # Imported locally: the file-level `from sklearn import metrics` sits in a
    # later cell, so `metrics` is undefined when this cell is run in order.
    from sklearn import metrics

    aligned = comparedEmbeddings['utopia']
    val = 1 - metrics.pairwise.cosine_similarity(aligned[2][word].reshape(1, -1), aligned[1][word].reshape(1, -1))
    return val

In [None]:
embed_distance('justice')

In [None]:
embed_distance('planet')

In [None]:
embed_distance('hello')

In [None]:
embed_distance('orange')

In [None]:
embed_distance('factory')

In [None]:
all_dists = {}

In [None]:
all_words = w2vmodel_utopia_wv.index_to_key

In [None]:
# Distance between the two aligned spaces for every word in the vocabulary
all_dists.update((token, embed_distance(token)) for token in all_words)

In [None]:
import seaborn as sns

In [None]:
sns.distplot(list(all_dists.values()))

In [None]:
sorted(all_dists.items(), key=lambda x: x[1])[:30]


In [None]:
sorted(all_dists.items(), key=lambda x: x[1], reverse=True)[:30]


In [None]:
w2vmodel_utopia_wv.most_similar('huckstering')

In [None]:
def embed_distance(model, word1, word2):
    """Cosine distance (1 - cosine similarity) between `word1` and `word2` within one model.

    `model` must support `model[word]` returning a numpy vector.
    Returns the value as a 1x1 numpy array, as produced by scikit-learn's pairwise function.
    """
    # Imported locally: the file-level `from sklearn import metrics` sits in a
    # later cell, so `metrics` is undefined when this cell is run in order.
    from sklearn import metrics

    first = model[word1].reshape(1, -1)
    second = model[word2].reshape(1, -1)
    val = 1 - metrics.pairwise.cosine_similarity(first, second)
    return val

In [None]:
embed_distance(w2vmodel_utopia_wv, "woman", "scientist")

In [None]:
embed_distance(w2vmodel_gnews_slim, "woman", "scientist")

In [None]:
embed_distance(w2vmodel_utopia_wv, "woman", "scientist")

## Embed Time Heatplots

This is the key to our aligning - the smart procrustes align method here is the one doing the aligning, and we are adding the aligned embeddings to the dictionary. 

We need to compare them across all permutations, so we will define another function to help. We will be using 1 - cosine similarity, as that gives a more intuitive range of 0-2, with low values meaning little change and high values meaning lots of change.

In [None]:
def getDivergenceDF(word, embeddingsDict):
    """Build a DataFrame of cosine distances for `word` across aligned embeddings.

    embeddingsDict maps each category to a list of embeddings. For every
    category, the first embedding in its list is compared with each of the
    remaining ones using |1 - cosine similarity|.

    Returns a DataFrame with one column per category (sorted), indexed by the
    same sorted categories.
    """
    cats = sorted(set(embeddingsDict.keys()))
    dists = {
        cat: [
            np.abs(
                1 - metrics.pairwise.cosine_similarity(
                    embeddingsDict[cat][0][word].reshape(1, -1),
                    embed[word].reshape(1, -1),
                )[0, 0]
            )
            for embed in embeddingsDict[cat][1:]
        ]
        for cat in cats
    }
    return pd.DataFrame(dists, index = cats)

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn
from sklearn import metrics

We now check certain words and see their movement in time along these abstracts.

In [None]:
targetWord = 'hello'

# Heatmap of the word's divergence between every pair of aligned embeddings
pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
targetWord = 'combination'

# Heatmap of the word's divergence between every pair of aligned embeddings
pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
def findDiverence(word, embeddingsDict):
    """Total cosine distance of `word` within the first (sorted) category.

    Takes the list of embeddings stored under the smallest key of
    `embeddingsDict`, compares its first embedding with each of the remaining
    ones using 1 - cosine similarity, and returns the sum of those distances.
    """
    # Imported locally: the file never binds the name `sklearn` (it only does
    # `from sklearn import ...`), so `sklearn.metrics` raised NameError here.
    from sklearn import metrics

    cats = sorted(set(embeddingsDict.keys()))
    embeds = embeddingsDict[cats[0]]

    return sum(
        1 - metrics.pairwise.cosine_similarity(embeds[0][word].reshape(1, -1), embed[word].reshape(1, -1))[0,0]
        for embed in embeds[1:]
    )

def findMostDivergent(embeddingsDict):
    """Rank every word found in the embeddings by its divergence.

    Collects the vocabulary of every embedding in `embeddingsDict`, prints how
    many distinct words were found, and returns a list of
    (word, findDiverence(word, embeddingsDict)) tuples sorted from most to
    least divergent.
    """
    words = set()
    for embeds in embeddingsDict.values():
        for embed in embeds:
            # Accept either a full model (vocabulary lives on `.wv`) or keyed
            # vectors directly; gensim 4 exposes the vocabulary as `index_to_key`.
            keyed_vectors = getattr(embed, "wv", embed)
            words.update(keyed_vectors.index_to_key)
    print("Found {} words to compare".format(len(words)))
    return sorted([(w, findDiverence(w, embeddingsDict)) for w in words], key = lambda x: x[1], reverse=True)
    

In [None]:
wordDivergences = findMostDivergent(comparedEmbeddings)

In [None]:
wordDivergences[:20]

In [None]:
wordDivergences[-20:]

In [None]:
# Third entry of the ranking (sorted from most to least divergent)
targetWord = wordDivergences[2][0]

# Heatmap of the word's divergence between every pair of aligned embeddings
pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()

In [None]:
# Last entry of the ranking, i.e. the least divergent word
targetWord = wordDivergences[-1][0]

# Heatmap of the word's divergence between every pair of aligned embeddings
pltDF = getDivergenceDF(targetWord, comparedEmbeddings)
fig, ax = plt.subplots(figsize = (10, 7))
seaborn.heatmap(pltDF, ax = ax, annot = False) #set annot True for a lot more information
ax.set_xlabel("Starting year")
ax.set_ylabel("Final year")
ax.set_title("Yearly linguistic change for: '{}'".format(targetWord))
plt.show()