In [None]:
import torch

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, BertEmbeddings

In [None]:
# Path of the gold-standard file handed to get_gold_sentences(), which splits
# each line on tab characters (despite the .csv extension).
gold_filename = "goldstandard_baskisch.csv"

def get_gold_sentences(filename, encoding='utf-8-sig'):
    """Read the gold-standard file and map each gold sentence to its candidates.

    The file is parsed as tab-separated text, one example per line: the first
    column is the gold sentence and up to ten following columns are the
    sentences it is compared against. All double quotes are removed from a
    line before it is split. A line starting with ``"origin"`` is treated as
    the header and skipped, as are blank lines and rows whose first column is
    empty.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file. The default
            ``'utf-8-sig'`` decodes UTF-8 with or without a byte-order mark,
            independently of the platform's locale encoding.

    Returns:
        dict mapping each gold sentence (str) to a list of at most ten
        sentences (str). If a gold sentence occurs on several lines, the last
        occurrence wins.
    """
    gold_sentences = {}

    # An explicit encoding keeps non-ASCII characters intact on platforms whose
    # default encoding is not UTF-8; the "-sig" variant also drops a leading
    # BOM so the header check below still matches on BOM-prefixed files.
    with open(filename, 'rt', encoding=encoding) as f_p:
        for line in f_p:
            if line.startswith('"origin"'):  # header
                continue

            # Lines yielded by a file object always carry their newline, so
            # the emptiness check is only meaningful after stripping.
            line = line.rstrip()
            if not line:
                continue

            splitted = line.replace('"', '').split('\t')

            gold = splitted[0]
            if gold:
                # Columns 1..10 hold the sentences compared against the gold one.
                gold_sentences[gold] = splitted[1:11]

    return gold_sentences



In [None]:
def calculate_similarities(gold, sim_sentences, embeddings):
    """Score each candidate sentence by cosine similarity to the gold sentence.

    Args:
        gold: Text of the reference sentence.
        sim_sentences: Iterable of sentence texts to compare against ``gold``.
        embeddings: Object with an ``embed(sentence)`` method that populates
            ``sentence.embedding`` (e.g. the ``DocumentPoolEmbeddings``
            instances built in this notebook).

    Returns:
        List of floats, one per entry of ``sim_sentences`` and in the same
        order: the cosine similarity between the gold and candidate
        embeddings, rounded to 4 decimal places.

    Raises:
        AssertionError: If a candidate embedding's shape differs from the
            gold embedding's shape.
    """
    candidates = list(sim_sentences)
    if not candidates:
        # Nothing to compare against, so skip embedding the gold sentence too.
        return []

    # The gold sentence is the same for every comparison: embed it once
    # instead of once per candidate.
    q = Sentence(gold)
    embeddings.embed(q)

    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    similarities = []
    for sentence in candidates:
        s = Sentence(sentence)
        embeddings.embed(s)

        assert q.embedding.shape == s.embedding.shape

        prox = cos(q.embedding, s.embedding)
        similarities.append(round(prox.item(), 4))

    return similarities

def print_similarities(gold, sim_sentences, similarities):
    """Print the gold sentence followed by one "<sentence> - <score>" line per pair.

    Sentences and scores are paired positionally; pairing stops at the shorter
    of the two sequences. A blank line is printed after the last pair.

    Args:
        gold: The reference sentence shown on the "Example:" line.
        sim_sentences: Sentences that were compared against ``gold``.
        similarities: Scores, in the same order as ``sim_sentences``.
    """
    report = [f"Example: {gold}"]
    report.extend(
        f"{candidate} - {score}"
        for candidate, score in zip(sim_sentences, similarities)
    )
    # The trailing empty entry yields the blank separator line after the block.
    report.append("")
    print("\n".join(report))
    

In [None]:
# Experiment driver: to compare models, change the embedding passed to
# calculate_similarities() in the loop at the bottom of this cell.

# Maps each gold sentence to the list of sentences it is compared against.
gold_sentences = get_gold_sentences(gold_filename)

# Pooled document embedding over a forward and a backward Flair model
# ('eu' is presumably the Basque model id -- confirm against the flair docs).
flair_embeddings = DocumentPoolEmbeddings([FlairEmbeddings('eu-forward'), 
                                     FlairEmbeddings('eu-backward')
                                    ])

# Layer selection string handed to both BertEmbeddings below (presumably the
# last four layers -- confirm). See BERT paper, section 5.3 and table 7.
bert_layers = '-1,-2,-3,-4'

bert_cased_embeddings = DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-cased',
                                                               layers=bert_layers)])

bert_uncased_embeddings = DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased',
                                                                 layers=bert_layers)])

# ELMo and ELMo Transformer are coming soon ;)

# NOTE(review): only bert_uncased_embeddings is used below; flair_embeddings
# and bert_cased_embeddings are constructed above but not used in this cell.
for gold, sim_sentences in gold_sentences.items():
    similarities = calculate_similarities(gold, sim_sentences, bert_uncased_embeddings)
    
    print_similarities(gold, sim_sentences, similarities)