In [1]:
from abc import ABC, abstractmethod
from flair.embeddings import ELMoEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence
from gensim.models import KeyedVectors

import csv
import flair
import fasttext
import torch

# Run flair on the GPU when one is available; otherwise fall back to the CPU.
# (Previously the device was pinned to the CPU unconditionally, which did not
# match the stated intent.)
flair.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Locations of the pretrained embedding files, grouped by model family.

# fastText: a single binary model file.
FASTTEXT_MODEL = './embeddings/fasttext/reddit-biomed.bin'

# GloVe: vectors saved in gensim format.
GLOVE_MODEL = './embeddings/glove/bioreddit.200.gensim'

# Flair: one file per direction (forward / backward).
FLAIR_MODEL_F = './embeddings/flair/forward.pt'
FLAIR_MODEL_B = './embeddings/flair/backward.pt'

# ELMo: weights file (W) plus options file (O).
ELMO_MODEL_W = './embeddings/elmo/bioreddit.hdf5'
ELMO_MODEL_O = './embeddings/elmo/options.json'

In [3]:
class ModelLoader(ABC):
    """Abstract interface shared by the embedding backends in this notebook.

    A concrete subclass must implement ``load_model`` and ``get_vector``.
    The constructor records a display name and leaves ``model`` unset.
    """

    def __init__(self, model_name):
        """Store ``model_name`` and initialise ``model`` to ``None``."""
        self.model = None
        self.model_name = model_name

    @abstractmethod
    def load_model(self, model_path):
        """Load the backend from ``model_path``; must be overridden."""

    @abstractmethod
    def get_vector(self, text):
        """Return the vector for ``text``; must be overridden."""

In [4]:
class FastTextEmbedder(ModelLoader):
    """Embedding backend that wraps a model loaded with ``fasttext.load_model``."""

    def __init__(self):
        super().__init__('fasttext')

    def load_model(self, model_path):
        """Load the fastText model at ``model_path`` and keep it on ``self.model``."""
        loaded = fasttext.load_model(model_path)
        self.model = loaded

    def get_vector(self, text):
        """Return the result of indexing the loaded model with ``text``."""
        loaded = self.model
        return loaded[text]

In [5]:
class GloVeEmbedder(ModelLoader):
    """Embedding backend that wraps GloVe vectors stored as gensim ``KeyedVectors``."""

    def __init__(self):
        super().__init__('glove')

    def load_model(self, model_path):
        """Load the ``KeyedVectors`` at ``model_path`` (opened with ``mmap='r'``)."""
        vectors = KeyedVectors.load(model_path, mmap='r')
        self.model = vectors

    def get_vector(self, text):
        """Return the result of indexing the loaded vectors with ``text``."""
        vectors = self.model
        return vectors[text]
        

In [6]:
class FlairEmbedder(ModelLoader):
    """Embedding backend that pools a forward and a backward Flair model."""

    def __init__(self):
        super().__init__('flair')

    def load_model(self, model_path):
        """Build the pooled document embedder from two Flair model files.

        ``model_path`` is indexed positionally: item 0 is the forward model,
        item 1 is the backward model.
        """
        forward_lm = FlairEmbeddings(model_path[0])
        backward_lm = FlairEmbeddings(model_path[1])

        # The backward model is listed first, then the forward model.
        pooled = DocumentPoolEmbeddings([backward_lm, forward_lm])
        self.model = pooled

    def get_vector(self, text):
        """Wrap ``text`` in a ``Sentence``, embed it, and return its embedding."""
        sentence = Sentence(text)
        self.model.embed(sentence)
        return sentence.get_embedding()

In [7]:
class ELMoEmbedder(ModelLoader):
    """Embedding backend that pools ELMo embeddings into one document vector."""

    def __init__(self):
        # Fixed copy-paste slip: this backend used to register itself as
        # 'flair', so its printed name was indistinguishable from FlairEmbedder.
        super().__init__('elmo')

    def load_model(self, model_path):
        """Build the pooled document embedder from the ELMo files.

        ``model_path`` is indexed positionally: item 0 is passed as the
        weight file, item 1 as the options file.
        """
        elmo_embeddings = ELMoEmbeddings(weight_file=model_path[0],
                                         options_file=model_path[1])

        self.model = DocumentPoolEmbeddings([elmo_embeddings])

    def get_vector(self, text):
        """Wrap ``text`` in a ``Sentence``, embed it, and return its embedding."""
        s = Sentence(text)
        self.model.embed(s)
        return s.get_embedding()

In [8]:
# One (embedder class, load_model argument) pair per backend.
# Flair and ELMo receive a two-item list because each is loaded from two files.
models = [
    (FastTextEmbedder, FASTTEXT_MODEL),
    (GloVeEmbedder, GLOVE_MODEL),
    (FlairEmbedder, [FLAIR_MODEL_F, FLAIR_MODEL_B]),
    (ELMoEmbedder, [ELMO_MODEL_W, ELMO_MODEL_O]),
]

In [9]:
# Smoke test: instantiate and load every backend, then show its name and the
# first five components of the vector it returns for the word 'cancer'.
for model_type, model_path in models:
    model = model_type()
    model.load_model(model_path)

    print(model.model_name, end='\n***\n')
    cancer_vector = model.get_vector('cancer')
    print(cancer_vector[:5])




fasttext
***
[-0.30783233 -0.21036296 -0.12321309  0.07858984 -0.16739686]
glove
***
[-0.772139  0.560792  0.598145 -0.912497  0.001729]
flair
***
tensor([ 0.2916, -0.0142,  0.0019, -0.1679,  0.2879], grad_fn=<SliceBackward>)
flair
***
tensor([-0.0947, -0.0650, -0.8767, -0.0830, -0.4786], grad_fn=<SliceBackward>)


In [10]:
# Example: ELMo embeddings for the first rows of the SNOMED mappings file

elmo = ELMoEmbedder()
elmo.load_model([ELMO_MODEL_W, ELMO_MODEL_O])

# newline='' is what the csv module asks for when opening files.
# The encoding is explicit because the terms contain non-ASCII characters
# (e.g. 'β', '°'); assumes the file is UTF-8 -- TODO confirm.
with open('./snomed_mappings.tsv', newline='', encoding='utf-8') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    # Stream the rows instead of reading the whole file into a list first.
    for row_number, row in enumerate(reader):
        if row_number >= 50:  # print the first 50 rows only
            break
        if not row:  # blank line in the file: nothing to embed
            continue
        print(row[0])
        # Use the embedder loaded in this cell, not the `model` variable that
        # happens to be left over from the earlier loop over all backends.
        print(elmo.get_vector(row[0]))

11-beta-hydroxylase deficiency
tensor([-0.2940, -0.5838, -0.0213,  ..., -0.1921, -0.1590,  0.4487],
       grad_fn=<CatBackward>)
11β-OHD
tensor([-1.2703, -0.3492,  0.2137,  ..., -0.1244, -0.8361,  0.4300],
       grad_fn=<CatBackward>)
1°HPT
tensor([-1.4337, -0.4270,  0.1452,  ..., -0.0409, -1.0152,  0.4035],
       grad_fn=<CatBackward>)
21-hydroxylase deficiency
tensor([-0.1441, -0.4978, -0.0346,  ..., -0.2250, -0.3382,  0.4941],
       grad_fn=<CatBackward>)
21-hydroxylase deficiency
tensor([-0.1441, -0.4978, -0.0346,  ..., -0.2934, -0.3345,  0.4075],
       grad_fn=<CatBackward>)
21-OHD
tensor([-9.7342e-01, -4.6498e-01,  1.7340e-01,  ...,  2.0582e-02,
        -3.4103e-01,  4.1576e-04], grad_fn=<CatBackward>)
21-OHD
tensor([-0.9734, -0.4650,  0.1734,  ...,  0.0499, -0.3469, -0.0108],
       grad_fn=<CatBackward>)
A1ATD
tensor([-0.8987, -0.4080,  0.3829,  ..., -0.0452,  0.2464,  0.2301],
       grad_fn=<CatBackward>)
AA amyloidosis
tensor([ 0.2633, -0.3631, -0.8415,  ...,  0.3193,  