In [1]:
#!/usr/bin/env python
# coding: utf-8

import os, sys, gzip, csv
from glob import glob

# core nltk
import nltk
from nltk.tokenize import word_tokenize

# gensim magic
import gensim
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

import numpy as np
import collections

import matplotlib.pyplot as plt
%matplotlib inline

# for PCA
from sklearn.decomposition import PCA

# Linear model
from sklearn.linear_model import LinearRegression

# cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# load saved models in order
eebo_models = list()
input_data = [
    "../models/eebo-henry_VIII.w2v",
    "../models/eebo-henry_VII.w2v",
    "../models/eebo-edward_VI.w2v",
    "../models/eebo-mary_I.w2v",
    "../models/eebo-elizabeth_I.w2v",
    "../models/eebo-james_I.w2v",
    "../models/eebo-charles_I.w2v",
    "../models/eebo-oliver_cromwell.w2v",
    "../models/eebo-charles_II.w2v"
]    

for fp in input_data:
    
    model_name = os.path.basename(fp).split(".")[0]
    print("starting: {0}".format(model_name))
    eebo_models.append([model_name,KeyedVectors.load(fp,mmap='r')])

starting: eebo-henry_VIII
starting: eebo-henry_VII
starting: eebo-edward_VI
starting: eebo-mary_I
starting: eebo-elizabeth_I
starting: eebo-james_I
starting: eebo-charles_I
starting: eebo-oliver_cromwell
starting: eebo-charles_II


In [4]:
def get_change_all(model_a, model_b):

    # extract all common vocab
    common_vocab = [word for word in model_a.vocab if word in model_b.vocab]
    common_vectors_a = model_a[common_vocab]
    common_vectors_b = model_b[common_vocab]
    
    # now use Sklearn's LinearRegression to combine vector space
    lin_model = LinearRegression()
    lin_model.fit(common_vectors_a, common_vectors_b)
    
    shared_vectors = collections.OrderedDict()
    
    for word in model_a.vocab:
        word_vector = lin_model.predict(model_a[word].reshape(1, -1))
        shared_vectors[word] = word_vector.reshape(-1)
    
    # now add words only in model b
    #for word in [word for word in model_b.vocab if word not in common_vocab]:
    #    shared_vectors[word] = model_b[word]
        
    vocab = list(shared_vectors.keys())
    shared_embeddings = np.array(list(shared_vectors.values()))
    
    distances = list()
    for word in vocab:
        idx = vocab.index(word)
        distances.append([word,
                          float(cosine_similarity(model_a[word].reshape(1,-1),
                                            shared_embeddings[idx].reshape(1,-1)))])    
    return(distances)

In [5]:
def write_csv(data,model):
    csvfile = gzip.open('../models/' + str(model) + '-drift.csv.gz', 'wt')
    writer = csv.writer(csvfile, delimiter=',')
    for w, d in data:
        writer.writerow([w,d])

In [9]:
# iterate through models
for i, model in enumerate(eebo_models):
    if i >= 1:
        print(eebo_models[i])
        #data = get_change_all(eebo_models[i -1][1], eebo_models[i][1])
        #write_csv(data,eebo_models[i][0])

['eebo-henry_VII', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe253261fd0>]
['eebo-edward_VI', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe253261f90>]
['eebo-mary_I', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe24b3e4750>]
['eebo-elizabeth_I', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe24179f710>]
['eebo-james_I', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe20f485ed0>]
['eebo-charles_I', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe247ab4050>]
['eebo-oliver_cromwell', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe24b4a21d0>]
['eebo-charles_II', <gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x7fe20f485e90>]
