In [6]:
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import euclidean_distances
from sklearn.externals.joblib import Parallel, delayed
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import check_array

from sklearn.preprocessing import normalize
from pyemd import emd
import gensim
from gensim.models import KeyedVectors
import json

In [7]:

# model = gensim.models.KeyedVectors.load_word2vec_format('C:/Users/evannnn/word_embedding/GoogleNews-vectors-negative300.bin', binary=True)
# print("model loaded")

def load_from_json(file_name):
    with open(file_name, "r") as fp:
            json_file=json.load(fp)
    return json_file
  

In [29]:
# load processed movie and book summary data
movie_id_to_summary=load_from_json("movie_summary_not_stemmed.json")
book_id_to_summary=load_from_json("book_summary_not_stemmed.json")

In [9]:
vocab=np.load("vocab.npy")
W=np.load("embeddings.npy")

In [101]:
W=np.array(W,dtype=np.double)

In [10]:
vocab_dict={word:i for i,word in enumerate(vocab)}

In [12]:
vocab_dict['word']

1563

In [20]:
n_moview=len(moive_id_to_summary)
n_books=len(book_id_to_summary)
print(n_moview,n_books)

4317 2065


In [21]:
sample_m=moive_id_to_summary['1']
sample_b=book_id_to_summary['1']

In [23]:
class WordMoversKNN(KNeighborsClassifier):
    """K nearest neighbors classifier using the Word Mover's Distance.

    Parameters
    ----------
    
    W_embed : array, shape: (vocab_size, embed_size)
        Precomputed word embeddings between vocabulary items.
        Row indices should correspond to the columns in the bag-of-words input.

    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for :meth:`k_neighbors` queries.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for Word Mover's Distance computation.
        If ``-1``, then the number of jobs is set to the number of CPU cores.
    
    verbose : int, optional
        Controls the verbosity; the higher, the more messages. Defaults to 0.
        
    References
    ----------
    
    Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger
    From Word Embeddings To Document Distances
    The International Conference on Machine Learning (ICML), 2015
    http://mkusner.github.io/publications/WMD.pdf
    
    """
    _pairwise = False

    def __init__(self, W_embed, n_neighbors=5, n_jobs=4, verbose=False):
        self.W_embed = W_embed
        self.verbose = verbose
        super(WordMoversKNN, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs,
                                            metric='precomputed', algorithm='brute')

    def _wmd(self, i, row, X_train):
        """Compute the WMD between training sample i and given test row.
        
        Assumes that `row` and train samples are sparse BOW vectors summing to 1.
        """
        union_idx = np.union1d(X_train[i].indices, row.indices)
        W_minimal = self.W_embed[union_idx]
        W_dist = euclidean_distances(W_minimal)
        bow_i = X_train[i, union_idx].A.ravel()
        bow_j = row[:, union_idx].A.ravel()
        return emd(bow_i, bow_j, W_dist)
    
    def _wmd_row(self, row, X_train):
        """Wrapper to compute the WMD of a row with all training samples.
        
        Assumes that `row` and train samples are sparse BOW vectors summing to 1.
        Useful for parallelization.
        """
        n_samples_train = X_train.shape[0]
        return [self._wmd(i, row, X_train) for i in range(n_samples_train)]

    def _pairwise_wmd(self, X_test, X_train=None):
        """Computes the word mover's distance between all train and test points.
        
        Parallelized over rows of X_test.
        
        Assumes that train and test samples are sparse BOW vectors summing to 1.
        
        Parameters
        ----------
        X_test: scipy.sparse matrix, shape: (n_test_samples, vocab_size)
            Test samples.
        
        X_train: scipy.sparse matrix, shape: (n_train_samples, vocab_size)
            Training samples. If `None`, uses the samples the estimator was fit with.
        
        Returns
        -------
        dist : array, shape: (n_test_samples, n_train_samples)
            Distances between all test samples and all train samples.
        
        """
        n_samples_test = X_test.shape[0]
        
        if X_train is None:
            X_train = self._fit_X

        dist = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(self._wmd_row)(test_sample, X_train)
            for test_sample in X_test)

        return np.array(dist)

In [24]:
WMD=WordMoversKNN(vocab)

In [40]:
from functools import reduce #python 3


In [41]:
" ".join(reduce(lambda x,y: x+y,movie_id_to_summary['1']))

'milo boyd former nypd detective works bail enforcement agent milo ex wife nicole hurley investigative reporter arrested assaulting police officer nicole receives tip story working apparent suicide may actually murder skips bond hearing meet informant causing judge revoke bail issue warrant arrest unfortunately nicole arrives informant jimmy kidnapped milo ecstatic nicole bail bondsman sid offers job tracking nicole bringing jail bounty questioning nicole mother kitty milo apprehends race track new jersey throws car trunk drives back towards manhattan nicole manages escape briefly catches meanwhile neither aware stalked milo two thugs sent bookie named irene outstanding gambling debts nicole criminal earl mahler connected story investigating nicole lovestruck coworker stuart bent rescuing mahler catches tries kill nicole two narrowly escape milo interested explanations nicole admits found evidence implicates mutual friend bobby milo ex partner police force angry milo decides investigat

In [71]:
movie_corpus=[]
for idx in list(movie_id_to_summary.keys()):
    try:
        
        movie_corpus.append(" ".join(reduce(lambda x,y: x+y,movie_id_to_summary[idx])))
    except:
        
        movie_corpus.append("")


len(movie_corpus)

4317

In [75]:

book_corpus= [" ".join(reduce(lambda x,y: x+y,book_id_to_summary[idx]))  if book_id_to_summary[idx] is not None else "" for idx in list(book_id_to_summary.keys()) ]


In [125]:
vect=CountVectorizer(stop_words=None,dtype=np.double).fit(movie_corpus + book_corpus)

In [126]:
v_1 = vect.transform(sample_m[0])
print(sum(v_1.toarray().ravel()))

v_2 = vect.transform(sample_b[0])
print(sum(v_2.toarray().ravel()))

9.0
9.0


In [113]:
union_idx = np.union1d(v_1.indices, v_2.indices)
print(union_idx.shape)
W_dist=euclidean_distances(W[union_idx])
print(W_dist.shape)

(21,)
(21, 21)


In [107]:
W_dist

array([[0.        , 4.33150786, 3.69190607, 3.51573346, 4.40742536,
        4.2406966 , 3.66776426, 5.32796863, 4.6076104 , 3.87366043,
        4.03292041, 3.68232048, 3.97739283, 4.07089836, 3.78574493,
        4.29225462, 3.90000092, 4.21737327, 4.61094032, 3.78487026,
        4.0699112 ],
       [4.33150786, 0.        , 4.52605024, 4.43754657, 4.79339845,
        4.65680189, 4.32836976, 5.68533231, 5.12321183, 4.64562881,
        4.54897008, 4.68505573, 4.48915913, 5.04203627, 4.56628516,
        4.71209435, 4.57461057, 4.80265174, 5.32921156, 4.60514251,
        4.7585135 ],
       [3.69190607, 4.52605024, 0.        , 3.83930907, 4.27006278,
        4.19449884, 4.00830852, 5.50236743, 4.84945022, 4.16319906,
        4.11225078, 4.19786522, 4.11111468, 4.60681817, 4.03979163,
        4.18830452, 4.06176031, 4.4794481 , 4.6248163 , 4.25884557,
        4.22798775],
       [3.51573346, 4.43754657, 3.83930907, 0.        , 4.3537409 ,
        4.22897121, 3.78518716, 5.26882942, 4.6928491

In [110]:
e1=v_1[:,union_idx].A.ravel()

e2=v_2[:,union_idx].A.ravel()


In [124]:
v_1[:,union_idx].A

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]])

In [119]:
len(e1)

189