In [1]:
import gensim
# load word vectors
word_vectors = gensim.models.Word2Vec.load("word2vec.model")

# create list of projection data
# tuple (label, vector)
labels = word_vectors.wv.index_to_key
vectors = [word_vectors.wv[label] for label in labels]

projection_data = [(label, vector) for label, vector in zip(labels, vectors)] 

In [2]:
import os
import numpy as np
import tensorflow as tf
from tensorboard.plugins import projector

def create_projection(projection_data, path='./tensorboard/'):
    meta_file = 'metadata.tsv'
    samples = len(projection_data)
    vector_dim = len(projection_data[0][1])
    projection_matrix = np.zeros((samples, vector_dim))

    # write meta file with labels, create projection_matrix
    with open(os.path.join(path, meta_file), 'w') as f:
        for i, row in enumerate(projection_data):
            label, vector = row[0], row[1]
            projection_matrix[i] = vector
            f.write(f"{label}\n")

    weights = tf.Variable(
        projection_matrix, trainable=False, name='word_embeddings'
    )

    checkpoint = tf.train.Checkpoint(embedding=weights)
    checkpoint.save(os.path.join(path, "embedding.ckpt"))
    writer = tf.summary.create_file_writer(path)
    
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
    embedding.metadata_path = meta_file
    projector.visualize_embeddings(path, config)

create_projection(projection_data)

2023-08-01 14:12:39.724811: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-01 14:12:39.724869: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-08-01 14:12:52.149665: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-08-01 14:12:52.149761: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-08-01 14:12:52.149812: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (august-OptiPlex-3040): /proc/driver/nvidia/version does not exist
2023-08-01 14:12:52.171719: I tensorflow/core/platform/c

In [3]:
%load_ext tensorboard
%tensorboard --logdir="./tensorboard/"

Reusing TensorBoard on port 6006 (pid 98666), started 0:45:23 ago. (Use '!kill 98666' to kill it.)

In [133]:
%pip install umap
%pip install bokeh
%pip install pandas
%pip install umap-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Category20, Turbo256
import umap
from bokeh.models import Label
from bokeh.models import ColumnDataSource, Label, LabelSet, Range1d
from bokeh.plotting import figure, output_file, show

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()
2023-08-01 15:06:25.585106: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-01 15:06:25.585176: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [6]:
def interactive_plot(umap_embedding_df, words_to_highlight=[], splits=[], fn=None):
    datasource = ColumnDataSource(umap_embedding_df)
    # define color mapping
    palette = []
    if words_to_highlight:
        if not splits:
            l = len(words_to_highlight)
            if 256 > l > 20:
                palette = [Turbo256[i] for i in range(0, 256, int(256/l))][:l]
            elif l <= 20:
                palette = Category20[l]
                words_to_highlight = words_to_highlight[:l]
            else:
                print('too many words to highlight.')
                return
        else:
            colors = Category20[20]
            if len(splits) == 3:
                #                red               blue              green
                colors = [Category20[20][6], Category20[20][0], Category20[20][4]]
            palette = [0] * len(words_to_highlight)
            start = 0
            for i, end in enumerate(splits):
                palette[start:end] = [colors[i]] * (end - start)
                start = end

    print(palette)
    color_mapping = CategoricalColorMapper(factors=words_to_highlight, palette=palette)

    plot_figure = figure(
        title='UMAP projection of word embeddings',
        # plot_width=600,
        # plot_height=600,
        tools=('pan, wheel_zoom, reset')
    )

    plot_figure.add_tools(HoverTool(tooltips="""
    <div>
        <div>
            <span style='font-size: 16px; color: #224499'>Word:</span>
            <span style='font-size: 18px'>@Word</span>
        </div>
    </div>
    """))

    if fn is not None:
        output_file(filename=fn, title='UMAP projection of word embeddings')

    plot_figure.circle(
        'x',
        'y',
        source=datasource,
        color=dict(field='Word', transform=color_mapping),
        line_alpha=0.6,
        fill_alpha=0.6,
        size=7
    )
    if fn is not None:
        save(plot_figure)
    else:
        show(plot_figure)


def generate_umap_embedding(labels, word_embeddings):
    # collapse embeddings to two dimensions
    reducer = umap.UMAP()
    reducer.fit(word_embeddings)
    umap_embedding = reducer.transform(word_embeddings)

    # put data in a data frame
    umap_df = pd.DataFrame(umap_embedding, columns=['x', 'y'])
    umap_df.index = labels
    umap_df.index.name = "Word"

    return umap_df

In [7]:
print(word_vectors)

Word2Vec<vocab=14031, vector_size=150, alpha=0.025>


In [8]:
df = generate_umap_embedding(labels, vectors)


In [9]:
# catalyst_names = ['Cp*Ti(OBz)3','Cp2ZrCl2','Cp2ZrCl','Cp2HfCl2','Cp2TiCl2','EtInd2ZrCl2','(nBuCp)2ZrCl2','Et[Ind]2ZrCl2','Et(Ind)2ZrCl2','(n-BuCp)2ZrCl2','(SBI)ZrMe2', 'Cp2TiMe2','Cp2ZrMe2','Cp2HfMe2','Me2SiInd2ZrCl2','CpZrCl3','CpTiCl3','CpHfCl3','Cl4Ti','Cp*ZrMe3']
# activator_names = ['MAO','TIBA','TEA','TIBAO','MMAO','methylaluminoxane','triethylaluminum','triisobutylaluminum','Et3Al','AlEtCl2','AlEt2Cl','tris(pentafluorophenyl)borane', '[CPh3][B(C6F5)4]','CPh3B(C6F5)4','ethylaluminoxane','tetrachloroaluminate','tri-isobutylaluminum','methyl-aluminoxane','tetrakis(pentafluorophenyl)borane']
# monomer_names = ['propene', 'ethene', '1-butene', '1,7-octadiene', '1-hexene', '1-dodecene', '1-decene', '1-octene']
# keywords = ["polymer", "polymerization", "metallocene", "metallocenes"]
# highlight = []
# for cat in catalyst_names:
#     highlight.append(cat)
# for act in activator_names:
#     highlight.append(act)
# for mon in monomer_names:
#     highlight.append(mon)
# for w in keywords:
#     highlight.append(w)

# df = df[df.Word.isin(highlight)]

# print(df)

In [10]:
catalyst_names = ['Cp*Ti(OBz)3','Cp2ZrCl2','Cp2ZrCl','Cp2HfCl2','Cp2TiCl2','EtInd2ZrCl2','(nBuCp)2ZrCl2','Et[Ind]2ZrCl2','Et(Ind)2ZrCl2','(n-BuCp)2ZrCl2','(SBI)ZrMe2', 'Cp2TiMe2','Cp2ZrMe2','Cp2HfMe2','Me2SiInd2ZrCl2','CpZrCl3','CpTiCl3','CpHfCl3','Cl4Ti','Cp*ZrMe3']
activator_names = ['MAO','TIBA','TEA','TIBAO','MMAO','methylaluminoxane','triethylaluminum','triisobutylaluminum','Et3Al','AlEtCl2','AlEt2Cl','tris(pentafluorophenyl)borane', '[CPh3][B(C6F5)4]','CPh3B(C6F5)4','ethylaluminoxane','tetrachloroaluminate','tri-isobutylaluminum','methyl-aluminoxane','tetrakis(pentafluorophenyl)borane']
monomer_names = ['propene', 'ethene', '1-butene', '1,7-octadiene', '1-hexene', '1-dodecene', '1-decene', '1-octene']
keywords = ["polymer", "polymerization", "metallocene", "metallocenes"]
highlight = [catalyst.lower() for catalyst in catalyst_names] + [activator.lower() for activator in activator_names] + [monomer.lower() for monomer in monomer_names] + keywords

for catalyst in catalyst_names:
    print(catalyst.lower() in df.index)

df = df[df.index.isin(highlight)]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [12]:
%pip install arrange_data
from arrange_data import file_to_list
in_data = np.array([catalyst.lower() in df.index for catalyst in catalyst_names])
in_abstracts = np.array([catalyst in ' '.join(file_to_list("data/abstracts.txt")) for catalyst in catalyst_names])

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement arrange_data (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for arrange_data[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'arrange_data'

In [7]:
interactive_plot(df,highlight,[len(catalyst_names), len(catalyst_names) + len(activator_names), len(catalyst_names) + len(activator_names) + len(monomer_names), len(highlight)], fn='2dvis.html')

In [None]:
df.xs('Cp2ZrCl2'.lower())

In [None]:
df