<a href="https://colab.research.google.com/github/ale0xb/keywords-vis/blob/master/keywords_vis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install numpy scipy nltk sklearn iteration_utilities



In [0]:
# Write the currently installed package versions to requirements.txt.
!pip freeze > requirements.txt

In [5]:
!mkdir datasets
!mkdir model
!curl -O https://raw.githubusercontent.com/ale0xb/keywords-vis/master/datasets/dh_papers-complete.json 
!curl -O https://raw.githubusercontent.com/ale0xb/keywords-vis/master/datasets/vispubdata_papers-2018-complete.json
!curl -O https://raw.githubusercontent.com/ale0xb/keywords-vis/master/model/all_paths.pkl
  
!mv dh_papers-complete.json ./datasets
!mv vispubdata_papers-2018-complete.json ./datasets/
!mv all_paths.pkl ./model/

mkdir: cannot create directory ‘datasets’: File exists
mkdir: cannot create directory ‘model’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 87693  100 87693    0     0   457k      0 --:--:-- --:--:-- --:--:--  457k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  871k  100  871k    0     0  3599k      0 --:--:-- --:--:-- --:--:-- 3585k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    15  100    15    0     0     93      0 --:--:-- --:--:-- --:--:--    94


In [13]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


import scipy
import scipy.sparse as sps
from scipy.spatial.distance import pdist, squareform
from scipy.spatial import ConvexHull
from scipy import stats

from collections import Counter



from sklearn.preprocessing import normalize



import shutil
import pickle
import itertools
import copy
from iteration_utilities import flatten
import json

import networkx as nx
from networkx.readwrite import json_graph

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stem, unigrams, skipgrams

In [0]:
def remove_duplicates(seq):
    """Return the items of ``seq`` in first-seen order, with repeats dropped.

    Items must be hashable because membership is tracked in a set.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items
  
def stem_keywords(keywords):
  """Stem the tokens of every keyword phrase, document by document.

  Relies on the module-level ``stop_words``, ``stemmer`` and ``stop_stems``.

  Args:
    keywords: iterable of documents; each document is an iterable of keyword
      phrases (strings). Phrases are split on whitespace into tokens.

  Returns:
    A 3-tuple ``(keywords_stemmed, stems_dictionary, stems_dictionary_inv)``:
      * keywords_stemmed: one list of stems per document, de-duplicated in
        first-seen order.
      * stems_dictionary: stem -> list of the distinct tokens that produced it.
      * stems_dictionary_inv: token -> list of the distinct stems it produced.
  """
  keywords_stemmed = []
  stems_dictionary = {}
  stems_dictionary_inv = {}

  for doc in keywords:
    doc_stemmed = []
    for k in doc:
      for tok in k.split():
        # Tokens are compared against the stop-word list as written.
        if tok in stop_words:
          continue

        stem_tok = stemmer.stem(tok)
        if stem_tok in stop_stems:
          continue

        # stem -> surface forms, first-seen order, no repeats.
        surface_forms = stems_dictionary.setdefault(stem_tok, [])
        if tok not in surface_forms:
          surface_forms.append(tok)

        # token -> stems. The lookup key must be the token itself; the
        # previous code tested the whole phrase `k`, which is never a key
        # for multi-word phrases, so the entry was overwritten each time.
        stems_of_tok = stems_dictionary_inv.setdefault(tok, [])
        if stem_tok not in stems_of_tok:
          stems_of_tok.append(stem_tok)

        doc_stemmed.append(stem_tok)
    keywords_stemmed.append(remove_duplicates(doc_stemmed))

  return (keywords_stemmed, stems_dictionary, stems_dictionary_inv)

In [0]:
def trace_stem(keyword_stem):
  """Classify a stem as 'link', 'query' or 'target'.

  The three collections are read from attributes set on this function object
  (``intersection_keywords_set``, ``source_keywords_set``,
  ``dest_keywords_set``) and are checked in that order.

  Raises:
    RuntimeError: if the stem is in none of the three collections.
  """
  lookup_order = (
      ('intersection_keywords_set', 'link'),
      ('source_keywords_set', 'query'),
      ('dest_keywords_set', 'target'),
  )
  for attribute_name, label in lookup_order:
    if keyword_stem in getattr(trace_stem, attribute_name):
      return label
  raise RuntimeError('Error: keyword %s could not be traced' % keyword_stem)

In [0]:
def build_unigrams(keywords, verbose=False):
  """Count how often each keyword occurs across all documents.

  Args:
    keywords: sequence of documents, each an iterable of keywords.
    verbose: when true, print a progress line every 2000 documents.

  Returns:
    A ``Counter`` mapping keyword -> number of occurrences. A short summary
    (vocabulary size and the 10 most common keywords) is always printed.
  """
  unigram_counts = Counter()

  for doc_number, doc in enumerate(keywords):
    if verbose and doc_number % 2000 == 0:
      print(f'finished {doc_number/len(keywords):.2%} of papers')
    # A generator keeps Counter.update on its "count each element" path.
    unigram_counts.update(keyword for keyword in doc)

  print('done')
  print('vocabulary size: {}'.format(len(unigram_counts)))
  print('most common: {}'.format(unigram_counts.most_common(10)))

  return unigram_counts

In [0]:
def build_td_mat(keywords, keyword2indx):
  """Build the boolean document x keyword incidence matrix.

  Args:
    keywords: iterable of documents, each an iterable of keywords.
    keyword2indx: mapping keyword -> column index.

  Returns:
    A ``scipy.sparse.csr_matrix`` of dtype bool with one row per document and
    one column per index in ``keyword2indx``. The shape is given explicitly:
    when it is inferred from the largest index seen, trailing documents
    without keywords and vocabulary entries that never occur are dropped.

  Raises:
    KeyError: if a document contains a keyword missing from ``keyword2indx``.
  """
  doc_indxs = []
  keyword_indxs = []
  n_docs = 0
  for ii, doc in enumerate(keywords):
    n_docs = ii + 1
    for keyword in doc:
      doc_indxs.append(ii)
      keyword_indxs.append(keyword2indx[keyword])

  # Largest index in the mapping + 1, so every mapped column exists.
  n_keywords = max(keyword2indx.values(), default=-1) + 1
  tdat_values = [True] * len(doc_indxs)

  return sps.csr_matrix((tdat_values, (doc_indxs, keyword_indxs)),
                        shape=(n_docs, n_keywords), dtype=bool)

In [0]:
def build_skipgrams(keywords_stemmed, verbose=False):
  """Count keyword co-occurrences within each document, in both directions.

  Every unordered pair of positions in a document contributes one count to
  ``(a, b)`` and one to ``(b, a)``.

  Args:
    keywords_stemmed: sequence of documents, each a sequence of keywords.
    verbose: when true, print a progress line every 1000 documents.

  Returns:
    A ``Counter`` mapping ``(keyword_a, keyword_b)`` -> co-occurrence count.
    A short summary is always printed.
  """
  n_docs = len(keywords_stemmed)
  skipgram_counts = Counter()
  for idoc, doc in enumerate(keywords_stemmed):
    for pair in itertools.combinations(doc, 2):
      skipgram_counts[pair] += 1
      skipgram_counts[pair[::-1]] += 1
    if verbose and idoc % 1000 == 0:
      print(f'finished {idoc/n_docs:.2%} of documents')

  print('done')
  print('number of skipgrams: {}'.format(len(skipgram_counts)))
  print('most common: {}'.format(skipgram_counts.most_common(20)))

  return skipgram_counts

# Build matrices

In [0]:
def build_wordcounts(skipgram_counts, keyword2indx, verbose=False):
  """Build the sparse keyword x keyword co-occurrence count matrix.

  Args:
    skipgram_counts: mapping ``(keyword1, keyword2)`` -> count.
    keyword2indx: mapping keyword -> row/column index.
    verbose: when true, print a progress line every 1000 skipgrams.

  Returns:
    A square ``scipy.sparse.csr_matrix`` with one row and one column per index
    in ``keyword2indx``. The shape is given explicitly so that keywords which
    never co-occur with anything still get a (zero) row and column; when it is
    inferred, the matrix can be smaller than the vocabulary.

  Raises:
    KeyError: if a skipgram contains a keyword missing from ``keyword2indx``.
  """
  row_indxs = []
  col_indxs = []
  dat_values = []
  n_skipgrams = len(skipgram_counts)
  for ii, ((keyword1, keyword2), sg_count) in enumerate(skipgram_counts.items(), start=1):
    if verbose and ii % 1000 == 0:
      print(f'finished {ii/n_skipgrams:.2%} of skipgrams')
    row_indxs.append(keyword2indx[keyword1])
    col_indxs.append(keyword2indx[keyword2])
    dat_values.append(sg_count)

  # Largest index in the mapping + 1, so every mapped keyword has a row/column.
  n_keywords = max(keyword2indx.values(), default=-1) + 1
  wwcnt_mat = sps.csr_matrix((dat_values, (row_indxs, col_indxs)),
                             shape=(n_keywords, n_keywords))
  print('wwcnt sparse matrix was built (%d, %d)' % wwcnt_mat.shape)
  return wwcnt_mat

In [0]:
def build_pmis(wwcnt_mat, skipgram_counts, keyword2indx, verbose=False):
  """Build the sparse positive PMI matrix with context-distribution smoothing.

  For every skipgram ``(w, c)`` the stored value is
  ``max(log2(P(w, c) / (P(w) * P_alpha(c))), 0)``, where ``P_alpha`` raises the
  context counts to ``alpha = 0.95`` before normalising.

  Args:
    wwcnt_mat: co-occurrence count matrix (as built by ``build_wordcounts``).
    skipgram_counts: mapping ``(keyword1, keyword2)`` -> count; its total must
      equal the sum of ``wwcnt_mat``.
    keyword2indx: mapping keyword -> row/column index.
    verbose: when true, print a progress line every 5000 skipgrams.

  Returns:
    A ``scipy.sparse.csr_matrix`` with the same shape as ``wwcnt_mat``. The
    shape is passed explicitly so both matrices stay aligned with the
    vocabulary instead of being inferred from the largest index seen.
  """
  num_skipgrams = wwcnt_mat.sum()
  assert sum(skipgram_counts.values()) == num_skipgrams, \
      'skipgram_counts and wwcnt_mat disagree on the total count'

  # Coordinates and values for the sparse matrix.
  row_indxs = []
  col_indxs = []
  sppmi_values = []

  # Smoothing: context counts are raised to alpha before normalising.
  alpha = 0.95
  sum_over_words = np.array(wwcnt_mat.sum(axis=0)).flatten()
  sum_over_words_alpha = sum_over_words**alpha
  nca_denom = np.sum(sum_over_words_alpha)
  sum_over_contexts = np.array(wwcnt_mat.sum(axis=1)).flatten()

  n_skipgrams = len(skipgram_counts)
  for ii, ((keyword1, keyword2), sg_count) in enumerate(skipgram_counts.items(), start=1):
    if verbose and ii % 5000 == 0:
      print(f'finished {ii/n_skipgrams:.2%} of skipgrams')

    keyword1_indx = keyword2indx[keyword1]
    keyword2_indx = keyword2indx[keyword2]

    Pwc = sg_count / num_skipgrams
    Pw = sum_over_contexts[keyword1_indx] / num_skipgrams
    Pca = sum_over_words_alpha[keyword2_indx] / nca_denom

    sppmi = max(np.log2(Pwc/(Pw*Pca)), 0)

    row_indxs.append(keyword1_indx)
    col_indxs.append(keyword2_indx)
    sppmi_values.append(sppmi)

  sppmi_mat = sps.csr_matrix((sppmi_values, (row_indxs, col_indxs)),
                             shape=wwcnt_mat.shape)

  print('sparse ppmi matrix was built')

  return sppmi_mat

# SVD and dense word vectors

In [0]:
def svd_reduce(pmi_use, embedding_size):
  """Run a truncated sparse SVD keeping ``embedding_size`` singular triplets.

  Prints the input shape and requested size, then returns whatever
  ``scipy.sparse.linalg.svds`` returns for ``k=embedding_size``.
  """
  n_rows, n_cols = pmi_use.shape[0], pmi_use.shape[1]
  print('Will reduce matrix of shape (%d, %d) with embedding_size=%d' % (n_rows, n_cols, embedding_size))
  factors = sps.linalg.svds(pmi_use, k=embedding_size)
  return factors

In [0]:
def generate_wordvecs(uu):
  """Return the word vectors as given plus an L2-normalised copy.

  Args:
    uu: matrix with one word vector per row.

  Returns:
    ``(word_vecs, word_vecs_norm)`` where ``word_vecs`` is ``uu`` itself and
    ``word_vecs_norm`` is ``normalize(uu, norm='l2')``.
  """
  normalised = normalize(uu, norm='l2')
  return (uu, normalised)

# Extract Similarities

In [0]:
def get_significant_similarities_new(dist_matrix, sep_indx_src, sep_indx_inter, indx2keyword, max_distance=0.3):
  """Find the shortest path from any source keyword to every destination.

  Args:
    dist_matrix: condensed pairwise distance vector (it is expanded with
      ``squareform``).
    sep_indx_src: index of the last source keyword; nodes ``0..sep_indx_src``
      are the Dijkstra sources.
    sep_indx_inter: index of the last intersection keyword; destinations are
      the indices after it.
    indx2keyword: mapping index -> keyword; only its length is used here.
    max_distance: accepted for backward compatibility; not used.

  Returns:
    dict mapping destination index -> ``{'path': [...], 'distance': float}``.
  """
  dist_matrix_sq = np.triu(squareform(dist_matrix))
  # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
  # equivalent constructor and exists since networkx 2.0.
  dists_graph = nx.from_numpy_array(dist_matrix_sq)

  sources = set(range(0, sep_indx_src + 1))
  last_indx = len(indx2keyword) - 1
  path_by_dest = {}

  # The upper bound is inclusive: the previous `len(indx2keyword) - 1` stop
  # value silently skipped the last destination keyword.
  for j in range(sep_indx_inter + 1, last_indx + 1):
    print('Calculating best path for %s/%s' % (j, last_indx))
    dist, path = nx.multi_source_dijkstra(dists_graph, sources, j)
    path_by_dest[j] = {'path' : path, 'distance': dist}

  return path_by_dest
  
  
 

# Main method

## Preprocessing

In [0]:
# Stems that stem_keywords() discards after stemming, in addition to the
# stop-word check it applies to the raw tokens.
stop_stems = ['visual', 'digit', 'human', 'humanit', 'humanist']

In [33]:
# Load both corpora. `with` closes the files; the previous open(...).read()
# calls left the handles open.
with open('./datasets/dh_papers-complete.json') as dh_file:
  dh_papers = json.load(dh_file)
with open('./datasets/vispubdata_papers-2018-complete.json') as vis_file:
  vis_papers = json.load(vis_file)

# Index of the last DH paper once both lists are concatenated.
dh_papers_offset = len(dh_papers) - 1
dh_keywords = [t['keywords'] for t in dh_papers]
vis_keywords = [t['keywords'] for t in vis_papers]

all_papers = dh_papers + vis_papers

keywords = dh_keywords + vis_keywords
keywords_flat = list(flatten(keywords))


dh_docs_stemmed, dh_stems_dictionary, dh_stems_dictionary_inv = stem_keywords(dh_keywords)
vis_docs_stemmed, vis_stems_dictionary, vis_stems_dictionary_inv = stem_keywords(vis_keywords)

keywords_stemmed = dh_docs_stemmed + vis_docs_stemmed

# Merge the stem -> surface-form dictionaries: DH forms first, then the
# visualization forms, without repeats.
stems_dictionary = {}
for key in (dh_stems_dictionary.keys() | vis_stems_dictionary.keys()):
  stems_dictionary[key] = remove_duplicates(
      dh_stems_dictionary.get(key, []) + vis_stems_dictionary.get(key, []))

# On conflicting tokens the visualization entry wins.
stems_dictionary_inv = {**dh_stems_dictionary_inv, **vis_stems_dictionary_inv}


# Split the vocabulary into source-only, shared and destination-only stems,
# preserving first-seen order. Sets are used only for the membership tests
# (the list-in-list tests were quadratic); the results stay ordered lists.
all_source_stems = remove_duplicates(list(flatten(dh_docs_stemmed)))
all_dest_stems = remove_duplicates(list(flatten(vis_docs_stemmed)))

dest_lookup = set(all_dest_stems)
intersection_keywords_set = [k for k in all_source_stems if k in dest_lookup]

intersection_lookup = set(intersection_keywords_set)
source_keywords_set = [k for k in all_source_stems if k not in intersection_lookup]
dest_keywords_set = [k for k in all_dest_stems if k not in intersection_lookup]

# Index layout relied on later: source stems, then shared stems, then
# destination stems.
all_keywords_unique = source_keywords_set + intersection_keywords_set + dest_keywords_set

trace_stem.source_keywords_set = source_keywords_set
trace_stem.intersection_keywords_set = intersection_keywords_set
trace_stem.dest_keywords_set = dest_keywords_set


keyword2indx = {k:v for v,k in enumerate(all_keywords_unique)}
indx2keyword = {indx:keyword for keyword,indx in keyword2indx.items()}


print("There are %s source documents and %s destination documents" % (len(dh_docs_stemmed), len(vis_docs_stemmed)))
print('Vocabulary size is %s: %s source stems, %s intersection stems and %s destination stems' % 
  (len(all_keywords_unique), len(source_keywords_set), len(intersection_keywords_set), len(dest_keywords_set)))


There are 257 source documents and 2123 destination documents
Vocabulary size is 2720: 257 source stems, 320 intersection stems and 2143 destination stems


In [0]:
def trace_color(word, font_size, position, orientation, random_state=None, **kwargs):
  """Return the hex colour for ``word`` based on the trace of its stem.

  Only ``word`` is used; the other parameters are accepted and ignored.
  'query' maps to '#d95f02', 'link' to '#7570b3', anything else to '#1b9e77'.
  """
  palette = {'query': '#d95f02', 'link': '#7570b3'}
  return palette.get(trace_stem(stemmer.stem(word)), '#1b9e77')

## Word embeddings

In [38]:
# Per-stem counts and the boolean document x stem matrix; the dense
# list-of-lists copy is what get_recommendations_for_path iterates over.
unigram_counts = build_unigrams(keywords_stemmed)
tf_bin_mat = build_td_mat(keywords_stemmed, keyword2indx)
tf_bin_list = tf_bin_mat.toarray().tolist()

# Within-document co-occurrence counts and their sparse stem x stem matrix.
skipgram_counts = build_skipgrams(keywords_stemmed)
wwcnt_mat = build_wordcounts(skipgram_counts, keyword2indx)

# Build the smoothed positive PMI matrix from the co-occurrence counts.
sppmi_mat = build_pmis(wwcnt_mat, skipgram_counts, keyword2indx)

# Truncated SVD with 50 components; the rows of `uu` become the word vectors.
uu, ss, vv = svd_reduce(sppmi_mat, 50)
word_vecs, word_vecs_norm = generate_wordvecs(uu)

# Choose which variant feeds the distance computation: the L2-normalised
# vectors (word_vecs_norm) or the raw ones (word_vecs).
use_vecs = word_vecs_norm

done
vocabulary size: 2720
most common: [('data', 492), ('analysi', 316), ('interact', 299), ('volum', 285), ('render', 269), ('inform', 259), ('analyt', 237), ('model', 190), ('design', 170), ('graph', 145)]
done
number of skipgrams: 84158
most common: [(('volum', 'render'), 207), (('render', 'volum'), 207), (('data', 'analysi'), 104), (('analysi', 'data'), 104), (('interact', 'data'), 60), (('data', 'interact'), 60), (('field', 'vector'), 55), (('vector', 'field'), 55), (('data', 'analyt'), 52), (('analyt', 'data'), 52), (('analyt', 'analysi'), 52), (('analysi', 'analyt'), 52), (('interact', 'analyt'), 51), (('analyt', 'interact'), 51), (('analysi', 'interact'), 48), (('interact', 'analysi'), 48), (('design', 'studi'), 48), (('studi', 'design'), 48), (('interact', 'inform'), 47), (('inform', 'interact'), 47)]
wwcnt sparse matrix was built (2720, 2720)
sparse ppmi matrix was built
Will reduce matrix of shape (2720, 2720) with embedding_size=50


## Distance Matrix

In [26]:
# Generate the condensed pairwise cosine-distance vector.
Y = pdist(use_vecs, 'cosine')


print('Finding synonyms...')
distance_matrix = squareform(Y)
dist_matrix_sq = np.triu(distance_matrix)
rows = dist_matrix_sq.shape[0]
cols = dist_matrix_sq.shape[0]

# Two stems whose vectors are at most this far apart are treated as synonyms.
SYNONYM_MAX_DISTANCE = 0.01

# All (x, y) pairs with x < y within the threshold, in row-major order.
# This is the same visiting order as a nested x/y loop, without the Python loop.
synonym_pairs = np.argwhere(np.triu(distance_matrix <= SYNONYM_MAX_DISTANCE, k=1)).tolist()

syns = {}
remove = []
for x, y in synonym_pairs:
  syns.setdefault(x, []).append(y)
  syns.setdefault(y, []).append(x)
  # Of each synonym pair the higher index is dropped.
  remove.append(y)

removed_lookup = set(remove)
keep = [i for i in range(rows) if i not in removed_lookup]

use_vecs_c = use_vecs[keep]

sppmi_mat_c = sppmi_mat[keep,:]
sppmi_mat_c = sppmi_mat_c[:,keep]


print('Found %s synonyms. Operating with %s vectors now' % (len(set(remove)), len(use_vecs_c)))

Y_c = pdist(use_vecs_c, 'cosine')


# Separators in the original index space.
sep_indx_src = len(source_keywords_set) -1
sep_indx_inter = len(source_keywords_set) + len(intersection_keywords_set) - 1

# Separators in the compacted index space. `keep` preserves the
# source / intersection / destination ordering, so counting the kept stems of
# each kind gives the boundaries directly. Unlike scanning for label changes,
# this does not raise IndexError when a whole category has been removed.
kept_traces = [trace_stem(indx2keyword[keep[i]]) for i in range(len(use_vecs_c))]
n_query_kept = kept_traces.count('query')
n_link_kept = kept_traces.count('link')
new_seps = [n_query_kept, n_query_kept + n_link_kept]

sep_indx_src_c = new_seps[0] - 1
sep_indx_inter_c = new_seps[1] - 1
indx2keyword_c = {k:indx2keyword[keep[k]] for k in range(len(use_vecs_c))}


Finding synonyms...
Found 479 synonyms. Operating with 2241 vectors now


In [0]:
# Inverse of indx2keyword_c: stem -> index in the compacted vocabulary.
keyword2indx_c = {stem: compact_indx for compact_indx, stem in indx2keyword_c.items()}

In [28]:
# Bucket the stems that survived synonym removal by their trace.
syn_query, syn_link, syn_target = [], [], []
buckets_by_trace = {'query': syn_query, 'link': syn_link}

for stem in indx2keyword_c.values():
  trace = trace_stem(stem)
  # Anything that is neither 'query' nor 'link' counts as a target.
  buckets_by_trace.get(trace, syn_target).append(stem)

print(len(syn_query), len(syn_link), len(syn_target))

176 320 1745


## Pathfinding

### Definitions

In [0]:
def generate_paths(dist_matrix, sep_indx_src, sep_indx_inter, indx2keyword, paths_location='./model/all_paths.pkl'):
  """Build the distance graph and the best source -> destination paths.

  Paths are loaded from the pickle cache at ``paths_location`` when it can be
  read. Otherwise they are computed with multi-source Dijkstra and written to
  the cache.

  Args:
    dist_matrix: condensed pairwise distance vector (expanded with
      ``squareform``).
    sep_indx_src: index of the last source keyword; nodes ``0..sep_indx_src``
      are the Dijkstra sources.
    sep_indx_inter: index of the last intersection keyword; every later index
      is a destination.
    indx2keyword: mapping index -> keyword.
    paths_location: cache file. The default matches where the download cell
      stores ``all_paths.pkl``; the former hard-coded ``./models/all-paths.pkl``
      pointed at a directory that cell never creates.

  Returns:
    ``(dists_graph, paths_by_dest)`` where ``paths_by_dest`` maps destination
    index -> ``{'path': [...], 'distance': float}``.

  Note:
    A readable cache is returned as-is, whatever arguments are passed; delete
    the file after changing the inputs.
  """
  import os  # local import: `os` is not part of the notebook's import cell

  dist_matrix_sq = np.triu(squareform(dist_matrix))
  # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is the
  # equivalent constructor and exists since networkx 2.0.
  dists_graph = nx.from_numpy_array(dist_matrix_sq)

  try:
    with open(paths_location, 'rb') as infile:
      # NOTE: unpickling can execute arbitrary code; only load a trusted cache.
      paths_by_dest = pickle.load(infile)
  except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Missing or unreadable cache (e.g. a saved HTTP error body): recompute.
    paths_by_dest = {}
    sources = set(range(0, sep_indx_src + 1))
    last_indx = len(indx2keyword) - 1
    for j in range(sep_indx_inter + 1, last_indx + 1):
      print('Calculating best path for %s/%s' % (j, last_indx))
      dist, path = nx.multi_source_dijkstra(dists_graph, sources, target=j)
      paths_by_dest[j] = {'path' : path, 'distance': dist}
      print(indx2keyword[j], paths_by_dest[j]['distance'], [(indx2keyword[n], trace_stem(indx2keyword[n])) for n in paths_by_dest[j]['path']])

    cache_dir = os.path.dirname(paths_location)
    if cache_dir:
      os.makedirs(cache_dir, exist_ok=True)

    # Close the file before moving it so the pickle is completely flushed.
    with open('paths.pkl', 'wb') as output:
      pickle.dump(paths_by_dest, output)
    shutil.move('paths.pkl', paths_location)

  return dists_graph, paths_by_dest
  
  
 

### Main

In [8]:
def sortFn(a):
  """Sort key for a path: the concatenated stems of its node indices.

  Each index in ``a`` is looked up in the module-level ``indx2keyword_c``.
  """
  return ''.join(indx2keyword_c[a[position]] for position in range(len(a)))

def merge_subs(lst_of_lsts):
    """Merge rows that share the same first element.

    The input is deep-copied and never modified. The first row seen for a
    given head is kept, and the tail (everything after the head) of each later
    row with the same head is appended to it. Row order follows first
    appearance of each head.
    """
    merged = []
    for candidate in copy.deepcopy(lst_of_lsts):
        # Position of the already-kept row with the same head, if any.
        match_pos = next(
            (pos for pos, kept in enumerate(merged) if candidate[0] == kept[0]),
            None,
        )
        if match_pos is None:
            merged.append(candidate)
        else:
            merged[match_pos] += candidate[1:]
    return merged

# Build the distance graph over the compacted (synonym-free) vocabulary and
# load or compute the best path from the source stems to every destination.
dists_graph, paths_by_dest = generate_paths(Y_c, sep_indx_src_c, sep_indx_inter_c, indx2keyword_c)


NameError: ignored

In [33]:
# Collect every consecutive (node, next_node) hop over all paths.
all_paths_links = []
for entry in paths_by_dest.values():
  print(entry['path'])
  for hop_start in range(len(entry['path']) - 1):
    hop = (entry['path'][hop_start], entry['path'][hop_start + 1])
    all_paths_links.append(hop)

[99, 496]
[1, 497]
[49, 279, 918, 498]
[5, 1012, 499]
[144, 424, 503, 500]
[5, 501]
[144, 502]
[144, 424, 503]
[148, 470, 2056, 1485, 504]
[165, 505]
[112, 506]
[151, 507]
[80, 508]
[26, 509]
[33, 510]
[134, 511]
[174, 493, 512]
[33, 513]
[172, 514]
[64, 515]
[64, 515, 516]
[127, 1937, 517]
[21, 260, 1784, 518]
[164, 2145, 519]
[174, 520]
[146, 521]
[1, 181, 1551, 522]
[156, 523]
[49, 279, 1523, 524]
[44, 525]
[26, 526]
[169, 527]
[32, 1964, 528]
[128, 453, 2125, 529]
[144, 424, 503, 530]
[86, 531]
[13, 532]
[154, 473, 533]
[154, 534]
[154, 535]
[21, 260, 1784, 536]
[76, 362, 1082, 780, 537]
[5, 538]
[106, 539]
[13, 540]
[110, 541]
[6, 1268, 1281, 542]
[141, 543]
[174, 544]
[113, 1773, 545]
[6, 208, 1220, 1219, 546]
[93, 400, 547]
[21, 260, 1784, 548]
[151, 549]
[76, 550]
[127, 551]
[6, 208, 1220, 1243, 552]
[127, 553]
[144, 424, 189, 2115, 554]
[103, 555]
[118, 556]
[141, 1926, 558, 557]
[141, 1926, 558]
[64, 669, 559]
[18, 203, 560]
[90, 561]
[64, 562]
[88, 563]
[157, 446, 1195, 564]

In [36]:
# Node sequence of every path, in the order stored in paths_by_dest.
all_paths = [entry['path'] for entry in paths_by_dest.values()]

# Merge paths that start at the same node, then order them by sortFn.
merged_paths_sorted = merge_subs(all_paths)
merged_paths_sorted.sort(key=sortFn)

for merged_path in merged_paths_sorted:
  print([(indx2keyword_c[node], trace_stem(indx2keyword_c[node])) for node in merged_path])

[('aborigin', 'query'), ('unifi', 'target'), ('unifi', 'target'), ('input', 'target'), ('teleoper', 'target'), ('atom', 'target'), ('echographi', 'target'), ('levels-of-detail', 'target'), ('teleoper', 'target'), ('haptic', 'target'), ('sphere', 'target'), ('cross', 'target'), ('sphere', 'target'), ('terascal', 'target'), ('world-wide-web', 'target'), ('pdm', 'target'), ('wayfind', 'target'), ('shock', 'target'), ('forcefeedback', 'target'), ('projector', 'target'), ('pc', 'target'), ('invers', 'target'), ('head', 'target'), ('surgic', 'target'), ('hysteroscopi', 'target'), ('shear-warp', 'target'), ('speech', 'target'), ('haptic', 'target'), ('6-dof', 'target'), ('invers', 'target'), ('kinemat', 'target'), ('section', 'target'), ('anatom', 'target'), ('section', 'target'), ('carv', 'target'), ('seam', 'target')]
[('academ', 'query'), ('manipul', 'target'), ('contour', 'target'), ('magnif', 'target'), ('data-driven', 'target'), ('data-driven', 'target'), ('data-min', 'target'), ('oracl

In [38]:
# Index the merged paths by the stem of their first node, and print each path
# (nodes in ascending index order) with surface forms instead of stems.
merged_paths_dict = {}
for merged_path in merged_paths_sorted:
  start_stem = indx2keyword_c[merged_path[0]]
  merged_paths_dict[start_stem] = merged_path

  labelled_nodes = []
  for node in sorted(merged_path):
    node_stem = indx2keyword_c[node]
    labelled_nodes.append(('/'.join(stems_dictionary[node_stem]), trace_stem(node_stem)))
  print(labelled_nodes)

[('aboriginal', 'query'), ('unified', 'target'), ('unified', 'target'), ('input', 'target'), ('atomic', 'target'), ('echography', 'target'), ('levels-of-detail', 'target'), ('teleoperation', 'target'), ('teleoperation', 'target'), ('haptic/haptics', 'target'), ('haptic/haptics', 'target'), ('crossing', 'target'), ('sphere', 'target'), ('sphere', 'target'), ('terascale', 'target'), ('world-wide-web', 'target'), ('pdm', 'target'), ('wayfinding', 'target'), ('shock', 'target'), ('forcefeedback', 'target'), ('projector/projectors', 'target'), ('pc', 'target'), ('inverse', 'target'), ('inverse', 'target'), ('head', 'target'), ('surgical', 'target'), ('hysteroscopy', 'target'), ('shear-warp', 'target'), ('speech', 'target'), ('6-dof', 'target'), ('kinematics', 'target'), ('anatomic', 'target'), ('sections', 'target'), ('sections', 'target'), ('carving', 'target'), ('seam', 'target')]
[('academic', 'query'), ('writing', 'link'), ('writing', 'link'), ('writing', 'link'), ('writing', 'link'), (

## Plot Paths

### Definitions

In [0]:
def get_recommendations_for_path(component):
  """List the papers that contain at least one keyword of a path graph.

  Reads the module-level ``keep``, ``tf_bin_list``, ``all_papers``,
  ``dh_papers_offset`` and ``indx2keyword``.

  Args:
    component: graph whose ``nodes()`` are indices into ``keep`` (the
      compacted vocabulary).

  Returns:
    A list with one dict per matching paper, in document order, with keys
    ``title``, ``keywords``, ``trace`` ('query' for documents up to
    ``dh_papers_offset``, else 'target'), ``tokens`` (stems),
    ``token_indices`` (original vocabulary indices, ascending) and
    ``token_indices_c`` (the corresponding graph nodes).
  """
  nodes = list(component.nodes())
  token_indices = [keep[n] for n in nodes]

  # Original vocabulary index -> graph node; the first node wins on repeats,
  # which matches a list.index() lookup.
  node_by_token = {}
  for node, token_index in zip(nodes, token_indices):
    node_by_token.setdefault(token_index, node)
  wanted_tokens = sorted(node_by_token)

  docs = []
  for doc_index, doc in enumerate(tf_bin_list):
    # Only the path's own columns are inspected, not every vocabulary column.
    doc_tokens = [t for t in wanted_tokens if 0 <= t < len(doc) and doc[t] == True]
    if not doc_tokens:
      continue

    the_doc = all_papers[doc_index]
    docs.append({
        'title': the_doc['title'],
        'keywords': the_doc['keywords'],
        'trace': 'query' if doc_index <= dh_papers_offset else 'target',
        'tokens': [indx2keyword[i] for i in doc_tokens],
        'token_indices': doc_tokens,
        'token_indices_c': [node_by_token[i] for i in doc_tokens],
    })

  return docs

# Visualization

In [0]:
def scale_number(unscaled, to_min, to_max, from_min, from_max):
  """Linearly map ``unscaled`` from [from_min, from_max] to [to_min, to_max].

  When the source range is empty (``from_min == from_max``) ``to_max`` is
  returned.
  """
  if from_min == from_max:
    return to_max
  target_span = to_max - to_min
  offset = unscaled - from_min
  source_span = from_max - from_min
  return target_span * offset / source_span + to_min

def plot_keywords_tree_d3(G, indx2keyword, stems_dictionary):
  """Lay out a keyword graph and serialise it, with its papers, as JSON.

  The layout is Kamada-Kawai, driven by the weighted shortest-path distances
  rescaled to the range [1, 4]. Node positions are written back to ``G`` as
  the node attributes ``x`` and ``y``.

  Args:
    G: networkx graph with a ``weight`` attribute on its edges.
    indx2keyword: accepted for backward compatibility; not used.
    stems_dictionary: accepted for backward compatibility; not used.

  Returns:
    A JSON string ``{'graph': <node-link data>, 'docs': {...}}``. ``docs`` is
    keyed by the '+'-joined sorted node ids a paper touches; each value holds
    the marker position (``cx``, ``cy``), the number of papers (``count``)
    and the paper records (``docs``).
  """
  nodes = G.nodes()

  # All-pairs weighted shortest paths, computed once and reused below.
  path_lengths = [(row, dict(data)) for row, data in nx.shortest_path_length(G, weight='weight')]

  # Both bounds start at 0, so the source range always includes 0.
  min_dist = 0
  max_dist = 0
  for _, data in path_lengths:
    for dist in data.values():
      min_dist = min(min_dist, dist)
      max_dist = max(max_dist, dist)

  df = pd.DataFrame(index=nodes, columns=nodes)
  for row, data in path_lengths:
    for col, dist in data.items():
      df.loc[row, col] = scale_number(dist, 1, 4, min_dist, max_dist)

  # Unreachable pairs get the largest scaled distance present.
  df = df.fillna(df.max().max())

  pos = nx.kamada_kawai_layout(G, scale=1.0, dim=2, weight='weight', dist=df.to_dict())

  # Positions are rounded to 4 decimals before being stored on the nodes.
  x = {k:float('%.4f' % v[0])  for k,v in pos.items()}
  y = {k:float('%.4f' % v[1]) for k,v in pos.items()}

  nx.set_node_attributes(G, x, 'x')
  nx.set_node_attributes(G, y, 'y')

  graph_json = json_graph.node_link_data(G)

  docs = get_recommendations_for_path(G)

  docs_json = {}
  for doc in docs:
    doc_keyword_ids = "+".join([str(k) for k in sorted(doc['token_indices_c'])])
    if doc_keyword_ids in docs_json:
      # Same keyword combination as an earlier paper: share its marker.
      docs_json[doc_keyword_ids]['count'] += 1
      docs_json[doc_keyword_ids]['docs'].append(doc)
      continue

    touched = doc['token_indices_c']
    if len(touched) == 1:
      # One keyword: the marker sits on the node itself.
      cx = pos[touched[0]][0]
      cy = pos[touched[0]][1]
    elif len(touched) == 2:
      # Two keywords: midpoint of the two nodes.
      cx = (pos[touched[0]][0] + pos[touched[1]][0]) / 2
      cy = (pos[touched[0]][1] + pos[touched[1]][1]) / 2
    else:
      # Three or more: mean of the convex-hull vertices.
      # NOTE(review): ConvexHull presumably fails on collinear points - confirm.
      the_points = [ (pos[i][0], pos[i][1]) for i in touched]
      hull = ConvexHull(the_points)
      cx = np.mean(hull.points[hull.vertices,0])
      cy = np.mean(hull.points[hull.vertices,1])

    docs_json[doc_keyword_ids] = {
        'cx': cx,
        'cy': cy,
        'count': 1,
        'docs': [doc]
    }

  json_obj = {'graph' : graph_json, 'docs' : docs_json}

  return json.dumps(json_obj)




In [0]:
# Query stems whose merged paths are drawn together. Other combinations that
# were explored: ['co-retweet', 'cooccurr', 'coword'],
# ['biographi', 'bayerisch', 'jazz'], ['aborigin'], ['palaeographi', 'mediev'],
# ['drama', 'dramat', 'shakespear'], ['shakespear'],
# ['rhyme', 'russian', 'transmedia'], ['women'], ['co-retweet', 'coword'],
# ['coword'].
path_keys = ['willa', 'racial']


query_indxs = []
sum_path = []
for key in path_keys:
  sum_path += merged_paths_dict[key]
  query_indxs.append(merged_paths_dict[key][0])


# Independent copy of the induced subgraph so attributes can be added freely.
path_graph = dists_graph.subgraph(sum_path).copy()


for u,v,d in path_graph.edges(data=True):
  d['sim'] = 1 - d['weight']

# A distinct name is used for the colour: assigning to `trace_color` here
# would overwrite the trace_color() function defined earlier.
TRACE_PALETTE = {'query': '#d95f02', 'link': '#7570b3'}
for i in path_graph.nodes():
  stem = indx2keyword_c[i]
  trace = trace_stem(stem)
  path_graph.nodes[i]['keyword'] = stems_dictionary[stem]
  # unigram_counts is keyed by stem. The previous code looked up the surface
  # forms in it, which yields 0 unless a form happens to equal some stem.
  # `.nodes[...]` replaces `.node[...]`, which was removed in networkx 2.4.
  path_graph.nodes[i]['count'] = unigram_counts[stem]
  path_graph.nodes[i]['trace_color'] = TRACE_PALETTE.get(trace, '#1b9e77')
  path_graph.nodes[i]['trace'] = trace

T=nx.minimum_spanning_tree(path_graph)
json_results = plot_keywords_tree_d3(T, indx2keyword_c, stems_dictionary) 
# print(json_results)


In [6]:
# NOTE(review): these imports sit at the end of the notebook; consider moving
# them to the import cell at the top.
import IPython
import IPython.display as display

# Expose the serialised graph/docs JSON to the browser as window.path_data.
display.display(display.Javascript('window.path_data = %s' % json_results))

# Render the graph with d3 (loaded through requirejs): dashed lines from each
# document marker to its keywords, graph edges, keyword labels on white boxes,
# and one circle plus title per group of documents.
display.display(IPython.core.display.HTML('''
         <div id='vis'></div>
         <script src="/static/components/requirejs/require.js"></script>
         <script>
          requirejs.config({
            paths: {
              "d3": "https://d3js.org/d3.v5.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        <script>
          requirejs(['d3'], function(d3) {
            const width = 2000,
                  height = 2000;

            const svg = d3.select('#vis').append('svg')
                            .attr('id', 'vis-svg')
                            .attr('width', width)
                            .attr('height', height);
            graph_json = window.path_data
            
            console.log(JSON.stringify(graph_json))
            
            const graph = graph_json['graph'];
            const docs = graph_json['docs'];

            const xScale = d3.scaleLinear().domain(d3.extent(graph.nodes, d=>d.x)).range([140, width-140])
            const yScale = d3.scaleLinear().domain(d3.extent(graph.nodes, d=>d.y)).range([140, height-140])

            const linkline = d3.line()
                                  .x(d => xScale(d.x))
                                  .y(d => yScale(d.y));

            const docs_three = Object.keys(docs).filter(d=>d.split('+').length >= 2).map(d => docs[d])

            const doc_lines_data = []

            docs_three.forEach(doc => {
              doc['docs'].forEach(d => {
                d['token_indices_c'].forEach(t=> {
                  const target_node = graph.nodes.filter(r => r.id == t)[0]
                  const line_points = [
                    {
                      'x' : doc.cx,
                      'y' : doc.cy 
                    },
                    {
                      'x' : target_node.x,
                      'y' : target_node.y
                    }
                  ]
                  doc_lines_data.push(line_points)
                })
              })
            })


            const doc_links = svg.append("g")
                            .attr("id", "doc_links")
                          .selectAll("line")
                          .data(doc_lines_data)
                          .enter().append("path")
                            .attr("stroke-dasharray", '5,5')
                            .style("stroke", "black")
                            .attr("stroke-width", 0.5)
                            .attr("opacity", 0.4)
                            .attr("d", d => linkline(d));


            const links = svg.append("g")
                            .attr("id", "links")
                          .selectAll("line")
                          .data(graph.links)
                          .enter().append("path")
                            .style("stroke", "black")
                            .attr("stroke-width", 0.5)
                            .attr("opacity", 0.4)
                            .attr("d", d => linkline(graph.nodes.filter(b=>b.id == d.source || b.id == d.target)));


            function getBB(selection) {
                selection.each(function(d){d.bbox = this.getBBox();})
            }

            const nodes_g = svg.append("g")
                            .attr("id", "nodes")
                          .selectAll('g')
                          .data(graph.nodes)
                          .enter()
                          .append('g');
            nodes_g
                  .append('text')
                    .attr("class", "node")
                    .text(d=> d.keyword[0])
                    .attr('x', d => xScale(d.x))
                    .attr('y', d => yScale(d.y))
                    .style('fill', d=>d.trace_color)
                    .style('font-size', "14px")
                    .style('font-weight', "bold")
                    .style('text-anchor', "middle")
                    .call(getBB);

            nodes_g.insert("rect", "text")
                              .attr('x', d => xScale(d.x) - d.bbox.width/2)
                              .attr('y', d => yScale(d.y) - 3*(d.bbox.height/4))
                              .attr("width", function(d){return d.bbox.width})
                              .attr("height", function(d){return d.bbox.height})
                              .style("fill", "white")
                              // .style("stroke", "black");

            const all_docs = Object.values(docs)                    

            const docs_g = svg.append("g")
                      .attr("id", "docs")
                    .selectAll('g')
                    .data(all_docs)
                    .enter()
                    .append('g')

            const radiusScale = d3.scaleLinear().domain(d3.extent(all_docs, d=>d.count)).range([3,5])
            docs_g.append('circle')
                .attr('r', d=>radiusScale(d.count))
                .attr('cx', d=>xScale(d.cx))
                .attr('cy', d=>yScale(d.cy))
                .style('fill', d => {
                  console.log(d);
                  if (d['docs'][0].trace == 'query')
                    return '#d95f02'
                  else return '#1b9e77'
                })
                .style('opacity', 0.5);

            docs_g.append('text')
                  .attr('class', 'doc-text')
                  .text(d => d.docs[0].title)
                  .attr('x', d => xScale(d.cx))
                  .attr('y', d => 10 + yScale(d.cy))
                  .style('font-size', "9px")
                  .style('text-anchor', "middle")
                  .style('font-weight', "light")
                  .style('font-family', "Open Sans")
          })
        </script>
      
      '''))

NameError: ignored