In [148]:
import gensim
# Restore the trained Word2Vec model from disk.
word_vectors = gensim.models.Word2Vec.load("word2vec.model")

# Vocabulary words, in the order the model stores them, and the matching
# embedding vector for each word.
labels = word_vectors.wv.index_to_key
vectors = [word_vectors.wv[word] for word in labels]

# Projection input: one (label, vector) tuple per vocabulary word.
projection_data = list(zip(labels, vectors))

In [166]:
import os
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

def create_projection(projection_data, path='./tensorboard/'):
    """Export labelled vectors as files for the TensorBoard Embedding Projector.

    Three artefacts are written into ``path``:

    * ``metadata.tsv`` with one label per line, in input order,
    * a TensorFlow checkpoint (prefix ``embedding.ckpt``) holding the
      embedding matrix,
    * the projector configuration linking the checkpoint tensor to the
      metadata file.

    Args:
        projection_data: Sequence of ``(label, vector)`` pairs. The first
            vector determines the embedding dimension; every vector must be
            assignable to a row of that length.
        path: Output directory. It is created if it does not exist yet.
    """
    meta_file = 'metadata.tsv'
    samples = len(projection_data)
    vector_dim = len(projection_data[0][1])
    projection_matrix = np.zeros((samples, vector_dim))

    # The metadata file is opened below, so the directory has to exist first.
    os.makedirs(path, exist_ok=True)

    # Write one label per line and fill the matrix row by row so that row i of
    # the matrix corresponds to line i of the metadata file. UTF-8 is set
    # explicitly so labels with non-ASCII characters do not depend on the
    # platform's default encoding.
    with open(os.path.join(path, meta_file), 'w', encoding='utf-8') as f:
        for i, (label, vector) in enumerate(projection_data):
            projection_matrix[i] = vector
            f.write(f"{label}\n")

    weights = tf.Variable(
        projection_matrix, trainable=False, name='word_embeddings'
    )

    # Saving under the attribute name `embedding` is what makes the tensor
    # addressable as "embedding/.ATTRIBUTES/VARIABLE_VALUE" in the config.
    checkpoint = tf.train.Checkpoint(embedding=weights)
    checkpoint.save(os.path.join(path, "embedding.ckpt"))

    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
    # Relative to `path`, where the projector config itself is written.
    embedding.metadata_path = meta_file
    projector.visualize_embeddings(path, config)

# Write the projector files for the loaded vectors into the default
# output directory ('./tensorboard/').
create_projection(projection_data)

In [167]:
# Load the TensorBoard notebook extension, then open TensorBoard on the
# directory that create_projection writes to by default.
%load_ext tensorboard
%tensorboard --logdir="./tensorboard/"

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 14325), started 0:00:04 ago. (Use '!kill 14325' to kill it.)

In [133]:
%pip install umap
%pip install bokeh
%pip install pandas
%pip install umap-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [149]:
import pandas as pd
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Category20, Turbo256
import umap
from bokeh.models import Label
from bokeh.models import ColumnDataSource, Label, LabelSet, Range1d
from bokeh.plotting import figure, output_file, show

In [150]:
def interactive_plot(umap_embedding_df, words_to_highlight=None, splits=None, fn=None):
    """Show an interactive Bokeh scatter plot of a 2-D word embedding.

    Args:
        umap_embedding_df: DataFrame with the columns ``x``, ``y`` and
            ``Word``; one row per plotted word.
        words_to_highlight: Optional list of words that get their own colour
            via a categorical colour mapper on the ``Word`` column. When it is
            ``None`` or empty, every point is drawn in a single colour.
        splits: Optional list of cumulative end indices into
            ``words_to_highlight``. Group ``i`` covers
            ``words_to_highlight[start:end]`` where ``start`` is the previous
            end (0 for the first group), and all words of a group share one
            colour. With exactly three groups a fixed three-colour scheme is
            used; otherwise colours are taken from ``Category20[20]`` in order.
            Without ``splits`` every highlighted word gets its own colour.
        fn: Optional output file name. When given, the plot is also saved to
            that file before it is shown.

    Returns:
        None. If ``splits`` is not given and 256 or more words are requested,
        a message is printed and nothing is plotted.
    """
    datasource = ColumnDataSource(umap_embedding_df)

    if not words_to_highlight:
        # Nothing to map: a colour mapper needs a list of factors, so fall
        # back to one fixed colour for all points.
        point_color = Category20[20][0]
    else:
        factors = list(words_to_highlight)
        if splits is None:
            n_words = len(factors)
            if n_words >= 256:
                print('too many words to highlight.')
                return
            if n_words > 20:
                # Sample Turbo256 at a regular stride, then trim to n_words.
                palette = list(Turbo256[::256 // n_words][:n_words])
            else:
                # Category20 only provides palettes for 3 to 20 colours, so
                # take the smallest available one and trim it for 1 or 2 words.
                palette = list(Category20[max(n_words, 3)][:n_words])
        else:
            colors = Category20[20]
            if len(splits) == 3:
                colors = [Category20[20][6], Category20[20][0], Category20[20][4]]
            palette = []
            start = 0
            for i, end in enumerate(splits):
                # Repeat the group's colour once per word in [start, end).
                palette.extend([colors[i]] * (end - start))
                start = end

        color_mapping = CategoricalColorMapper(factors=factors, palette=palette)
        point_color = dict(field='Word', transform=color_mapping)

    plot_figure = figure(
        title='UMAP projection of word embeddings',
        tools=('pan, wheel_zoom, reset')
    )

    plot_figure.add_tools(HoverTool(tooltips="""
    <div>
        <div>
            <span style='font-size: 16px; color: #224499'>Word:</span>
            <span style='font-size: 18px'>@Word</span>
        </div>
    </div>
    """))

    if fn is not None:
        output_file(filename=fn, title='UMAP projection of word embeddings')

    # scatter() draws circle markers by default; circle() with a screen-space
    # `size` is deprecated in current Bokeh releases.
    plot_figure.scatter(
        'x',
        'y',
        source=datasource,
        color=point_color,
        line_alpha=0.6,
        fill_alpha=0.6,
        size=7
    )

    if fn is not None:
        save(plot_figure)
    show(plot_figure)


def generate_umap_embedding(labels, word_embeddings, random_state=None):
    """Reduce word embeddings to two dimensions with UMAP.

    Args:
        labels: Words, one per embedding and in the same order as
            ``word_embeddings``.
        word_embeddings: The vectors to project, one per label.
        random_state: Optional seed forwarded to ``umap.UMAP``. The default
            ``None`` leaves the reducer unseeded, so the layout can differ
            between runs; pass an integer for a repeatable layout.

    Returns:
        A DataFrame indexed by label with the columns ``x`` and ``y`` (the
        2-D coordinates) and ``Word`` (the label again, as a regular column).
    """
    # Collapse the embeddings to two dimensions. fit_transform returns the
    # embedding learned during fitting, so no second pass over the data is
    # needed.
    reducer = umap.UMAP(random_state=random_state)
    umap_embedding = reducer.fit_transform(word_embeddings)

    # Assemble the coordinates and labels into one frame.
    umap_df = pd.DataFrame(umap_embedding, columns=['x', 'y'])
    umap_df['Word'] = labels
    umap_df.index = labels

    return umap_df

In [151]:
# Print the model's one-line summary as a quick check of what was loaded.
print(word_vectors)

Word2Vec<vocab=16927, vector_size=150, alpha=0.025>


In [162]:
# Project all word vectors with UMAP; the trailing bare `df` displays the
# resulting frame as the cell output.
df = generate_umap_embedding(labels, vectors)
df

Unnamed: 0,x,y,Word
(,3.234301,2.652190,(
),3.232558,2.471082,)
polymerization,2.711068,1.027165,polymerization
polymer,-0.141094,0.610811,polymer
properties,-1.087538,1.720496,properties
...,...,...,...
expansive,-0.369042,0.043179,expansive
pathology,-0.153758,-0.149799,pathology
dex,0.108974,2.254061,dex
sodium-ion,0.038139,0.061717,sodium-ion


In [163]:
# Terms of interest, grouped by their role.
catalyst_names = ['Cp*Ti(OBz)3','Cp2ZrCl2','Cp2ZrCl','Cp2HfCl2','Cp2TiCl2','EtInd2ZrCl2','(nBuCp)2ZrCl2','Et[Ind]2ZrCl2','Et(Ind)2ZrCl2','(n-BuCp)2ZrCl2','(SBI)ZrMe2', 'Cp2TiMe2','Cp2ZrMe2','Cp2HfMe2','Me2SiInd2ZrCl2','CpZrCl3','CpTiCl3','CpHfCl3','Cl4Ti','Cp*ZrMe3']
activator_names = ['MAO','TIBA','TEA','TIBAO','MMAO','methylaluminoxane','triethylaluminum','triisobutylaluminum','Et3Al','AlEtCl2','AlEt2Cl','tris(pentafluorophenyl)borane', '[CPh3][B(C6F5)4]','CPh3B(C6F5)4','ethylaluminoxane','tetrachloroaluminate','tri-isobutylaluminum','methyl-aluminoxane','tetrakis(pentafluorophenyl)borane']
monomer_names = ['propene', 'ethene', '1-butene', '1,7-octadiene', '1-hexene', '1-dodecene', '1-decene', '1-octene']
keywords = ["polymer", "polymerization", "metallocene", "metallocenes"]

# One flat list in group order: catalysts, activators, monomers, keywords.
highlight = catalyst_names + activator_names + monomer_names + keywords

# Keep only the rows whose word is one of the highlighted terms.
is_highlighted = df["Word"].isin(highlight)
df = df[is_highlighted]

print(df)

                            x         y                 Word
polymerization       2.711068  1.027165       polymerization
polymer             -0.141094  0.610811              polymer
metallocene          3.554819  1.322743          metallocene
methylaluminoxane    3.823787  1.673640    methylaluminoxane
metallocenes         3.746145  1.056605         metallocenes
1-hexene             3.620939  1.581412             1-hexene
propene              3.666639  1.543417              propene
1-octene             3.567669  1.579864             1-octene
ethene               3.502131  1.494786               ethene
1-butene             3.408952  1.714546             1-butene
1-decene             3.585007  1.592730             1-decene
triisobutylaluminum  3.980993  1.562889  triisobutylaluminum
triethylaluminum     4.046002  1.460860     triethylaluminum
1,7-octadiene        2.680674  2.014423        1,7-octadiene


In [164]:
# interactive_plot reads `splits` as cumulative end indices into `highlight`
# (group i is highlight[start:end], with start being the previous end), so the
# group sizes are converted to running totals. Passing the raw sizes would give
# later groups a non-positive length and leave most words without a colour.
group_sizes = [len(catalyst_names), len(activator_names) + len(monomer_names), len(keywords)]
split_ends = [sum(group_sizes[:i + 1]) for i in range(len(group_sizes))]
interactive_plot(df, highlight, split_ends, fn='2dvis')