In [10]:
import h5py
import csv 
import numpy
import os
import torch

In [13]:
# Run settings, exported as environment variables so that the shell command
# launching run_embedding.py (and the later cells that read os.environ) can
# pick them up. Every value must be a string.
os.environ.update({
    'MAX_LENGTH': '384',
    'DATA_PATH': 'pubmed_entity_2048.txt',
    'OUTPUT_PATH': 'pubmed_entity_2048.h5',
    'BATCH_SIZE': '64',
    # NOTE(review): presumably set to get past the mkl-service threading-layer
    # check in the child Python process — confirm it is still required.
    'MKL_SERVICE_FORCE_INTEL': '1',
})

In [14]:

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: GeForce GTX 1650 with Max-Q Design


In [15]:
# Shell out to run_embedding.py with the dmis-lab/biobert-base-cased-v1.1 model
# and mean pooling. The ${...} placeholders name the environment variables set
# through os.environ earlier in this notebook (sequence length, input text
# file, output HDF5 file, batch size).
# NOTE(review): IPython also tries to expand $name / {name} from Python
# variables on `!` lines — presumably no Python variable shares these names, so
# the shell performs the substitution; confirm before adding such variables.
!python run_embedding.py \
    --model_name_or_path dmis-lab/biobert-base-cased-v1.1 \
    --max_seq_length  ${MAX_LENGTH} \
    --data_path ${DATA_PATH} \
    --output_path ${OUTPUT_PATH} \
    --batch_size ${BATCH_SIZE} \
    --pooling mean

Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library.
	Try to import numpy first or set the threading layer accordingly. Set MKL_SERVICE_FORCE_INTEL to force it.
04/26/2021 13:00:44 - INFO - utils_embedding -   Creating features from dataset file at pubmed_entity_2048.txt
  0%|                                                  | 0/2048 [00:00<?, ?it/s]04/26/2021 13:00:44 - INFO - utils_embedding -   *** Example ***
04/26/2021 13:00:44 - INFO - utils_embedding -   tokens: [CLS] lo ##hmann selected leg ##horn [SEP]
04/26/2021 13:00:44 - INFO - utils_embedding -   input_ids: 101 25338 28111 2700 3420 9772 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [16]:
# Collect one embedding per entity, in the order the entity names appear in
# the input text file, so that row i of the TSV corresponds to line i of that
# file (the same file is later used as the projector metadata).
embeddings = []

with h5py.File(os.environ['OUTPUT_PATH'], 'r') as f, \
        open(os.environ['DATA_PATH'], 'r') as f_in:
    print("The number of keys in h5: {}".format(len(f)))
    # Named `line`, not `input`: a loop variable in a notebook cell stays in
    # the kernel namespace and would shadow the builtin input().
    for line in f_in:
        entity_name = line.strip()
        # The stripped entity name is the key in the HDF5 file; [:] reads the
        # 'embedding' dataset out while the file is still open.
        embeddings.append(f[entity_name]['embedding'][:])

# Replace only the final extension with ".tsv". Splitting on the first '.'
# would truncate paths such as './data/x.txt' or 'v1.2_entities.txt'.
tensor_name = os.path.splitext(os.environ['DATA_PATH'])[0] + ".tsv"
numpy.savetxt(tensor_name, embeddings, delimiter="\t")


The number of keys in h5: 2048


In [18]:
# Projector plugin API, used below to build and write the embedding projector
# configuration.
from tensorboard.plugins import projector
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [19]:
# Directory that receives the projector configuration.
log_dir = './'

# Register a single embedding with the projector; config.embeddings can hold
# more than one entry.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# Metadata (e.g. labels) comes from the entity list file; the vectors come
# from the TSV file written earlier.
embedding.metadata_path = os.environ['DATA_PATH']
embedding.tensor_path = tensor_name
# Name of the tensor: the TSV file name up to its first dot.
embedding.tensor_name = tensor_name.partition('.')[0]

projector.visualize_embeddings(log_dir, config)

In [21]:
# Start TensorBoard on the current directory, which is where the projector
# configuration was written (log_dir='./').
%tensorboard --logdir .