# generate lists of words to explain the clusters

### Jan 9 2020, Brendan Chambers

In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Font-type settings applied to figures saved as SVG, PDF and PostScript.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

import pymysql
import pickle
import json

import numpy as np
from sklearn.decomposition import PCA
import networkx as nx

import hdbscan
import umap

import time
import random
import re

import spacy
from gensim.models import Word2Vec

from pmids2vec import pmids2vec
from pmids2corpus import pmids2corpus


### load in word2vec models trained on each coarse cluster

In [None]:
# Temporary shortcut: write out the saved model paths by hand so the
# word2vec models do not have to be re-trained to recover their file names.
model_names = [
    'data_processing/cluster0.model',
    'data_processing/cluster1.model',
    'data_processing/cluster2.model',
]
print(model_names, end='\n\n')

# Load each saved model and index it by its file name, i.e. the last
# '/'-separated component of the path. The vocabulary size is reported
# after each load.
models = dict()
for model_name in model_names:
    short_name = model_name.rsplit('/', 1)[-1]
    print(short_name)

    model = Word2Vec.load(model_name)
    models[short_name] = model
    print('{} words '.format(len(model.wv.vocab)), end='\n\n')

#  ~16,000 words across all 3 clusters (> 25 times)
#  ~3,000 words occurring in all 3 clusters (> 25 times)
#  window size 15

In [None]:
# build a networkx graph for each word similarity network

In [None]:
# todo use unique vocabularies for starters

# todo match vocabulary sizes


# Build one weighted word-similarity graph per cluster model.
#
# For every word2vec model in `models`:
#   1. stack the embedding of each vocabulary word into a matrix,
#   2. centre the matrix and scale every row to unit L2 norm,
#   3. take all pairwise dot products (cosine similarity of the centred rows),
#   4. zero every weight below the 98th percentile,
#   5. wrap the result in a networkx graph whose nodes carry a 'word' and an
#      'eigenvector_centrality' attribute.
# The graphs are appended to `nx_graphs` in the iteration order of `models`.
nx_graphs = []
for idx, model in enumerate(models.values()):

    # Freeze the vocabulary order once so that matrix row i, graph node i and
    # the 'word' attribute of node i all refer to the same word.
    # NOTE(review): `wv.vocab` is the gensim 3.x vocabulary mapping -- confirm
    # against the installed gensim version.
    vocab_words = list(model.wv.vocab)

    # the number of nodes is small so we can do this with mat mul
    # Embedding width comes from the model itself rather than from a probe
    # word that might not be in this model's vocabulary.
    D = model.wv.vector_size

    embedding_samples = np.zeros((len(vocab_words), D))

    # collect samples
    for i_word, word in enumerate(vocab_words):
        embedding_samples[i_word, :] = model.wv[word]

    # reduce dimensionality
    #D_umap = 5
    #reducer = umap.UMAP(n_components=D_umap)
    #um = reducer.fit_transform(embedding_samples)    # concatenated
    #embedding_samples = um

    # demean
    # The column mean must be computed ONCE, before any row is modified;
    # recomputing it inside a per-row loop subtracts a drifting mean.
    embedding_samples -= embedding_samples.mean(axis=0)

    # normalize all vectors to the hypersphere
    row_norms = np.linalg.norm(embedding_samples, ord=2, axis=1, keepdims=True)
    # A row with zero norm cannot be scaled; leave it as all zeros instead of
    # dividing by zero and filling W with NaNs.
    row_norms[row_norms == 0] = 1.0
    embedding_samples /= row_norms

    # compute cosine similarity (projection of normalized vectors)
    W = np.matmul(embedding_samples, embedding_samples.T)
    np.fill_diagonal(W, 0)  # no self-loops
    # Both thresholds come from a single pass over W; thresh_low is only
    # needed by the alternative (commented-out) band mask below.
    thresh_low, thresh_high = np.percentile(W, [1, 98])
    #boolean_mask = np.logical_and(W < thresh_high, W > thresh_low)
    boolean_mask = W < thresh_high
    W[boolean_mask] = 0 # mask out some weights for testing
    print(np.shape(W))

    # heat map of the thresholded similarity matrix
    plt.figure()
    plt.imshow(W)
    plt.colorbar()

    # distribution of the surviving weights; W is symmetric, so only the
    # strict lower triangle is counted to avoid duplicating every edge
    plt.figure()
    lower_triangle = np.tril(W, k=-1)
    plt.hist(lower_triangle[lower_triangle.nonzero()], histtype='step', bins=100)

    #W_binary = (W > 0).tolist()
    #ggg[idx] = igraph.Graph.Adjacency(W_binary)  # define connections
    #ggg[idx].es['weight'] = W[W.nonzero()]
    #G_igraph = igraph.Graph.Weighted_Adjacency(W.tolist(),
    #                                           mode=igraph.ADJ_UNDIRECTED,
    #                                          loops=False,
    #                                          attr='weight')
    #igraphs.append(G_igraph)

    # create networkx version
    # from_numpy_array replaces from_numpy_matrix, which networkx 3.0 removed;
    # nonzero entries of W become edges carrying a 'weight' attribute.
    G_nx = nx.from_numpy_array(W)
    nx.set_node_attributes(G_nx, dict(enumerate(vocab_words)), 'word')
    # no `weight` argument is passed, matching the original call
    d_eigcent = nx.eigenvector_centrality(G_nx, max_iter=200)
    nx.set_node_attributes(G_nx, d_eigcent, 'eigenvector_centrality')
    nx_graphs.append(G_nx)
    