In [1]:
import torch

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## 1. Load embeddings

In [2]:
# Embedding models grouped by language: slot 0 collects the Basque (eu) models,
# slot 1 the English (en) models. Later cells append to both slots in the order
# listed in `embedding_names`, giving
# [[flair_eu, bert_cased_eu, bert_uncased_eu, elmo_eu], [flair_en, bert_cased_en, bert_uncased_en, elmo_en]]
embeddings_all = [[] for _ in range(2)]
embedding_names = "flair bert_cased bert_uncased elmo".split()

In [3]:
# Flair: for each language slot, pool the forward and backward language models
# into one document embedding (slot 0 uses the 'eu-*' models, slot 1 the 'mix-*' models).
flair_model_names = (('eu-forward', 'eu-backward'), ('mix-forward', 'mix-backward'))
for lang_slot, (forward_name, backward_name) in enumerate(flair_model_names):
    flair_pair = [FlairEmbeddings(forward_name), FlairEmbeddings(backward_name)]
    embeddings_all[lang_slot].append(DocumentPoolEmbeddings(flair_pair))

In [4]:
# BERT: add one cased and one uncased document embedding to each language slot.

# Layers fed into the embedding (see BERT paper, section 5.3 and table 7)
bert_layers = '-1,-2,-3,-4'
bert_type = 'base'  # or 'large'; only used in the name of the slot-1 (English) model

# (slot-0 multilingual model, slot-1 English model), cased pair first, then uncased
bert_model_pairs = (
    ('bert-base-multilingual-cased', f'bert-{bert_type}-cased'),
    ('bert-base-multilingual-uncased', f'bert-{bert_type}-uncased'),
)
for multilingual_name, english_name in bert_model_pairs:
    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings(multilingual_name, layers=bert_layers)]))
    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings(english_name, layers=bert_layers)]))

2019-07-10 12:20:39,926 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-10 12:20:55,764 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [5]:
# ELMo: the Basque slot gets a model built from explicit option/weight files,
# the English slot gets ELMoEmbeddings constructed with its defaults.
elmo_eu_files = {
    "options_file": "https://schweter.eu/cloud/eu-elmo/options.json",
    "weight_file": "https://schweter.eu/cloud/eu-elmo/weights.hdf5",
}
embeddings_all[0].append(DocumentPoolEmbeddings([ELMoEmbeddings(**elmo_eu_files)]))
embeddings_all[1].append(DocumentPoolEmbeddings([ELMoEmbeddings()]))

## 2. Read file and extract sentences

In [6]:
# Gold-standard input files, one per language (eu = Basque, en = English).
# Both are given as bare file names, so they are resolved against the
# current working directory; get_gold_sentences() parses them as
# tab-separated text.
test_eu = "goldstandard_eu_lexicover.tsv"
test_en = "goldstandard_en_lexicover.tsv"

def get_gold_sentences(filename):
    """Parse a tab-separated gold-standard file.

    Each data row holds the gold sentence in its first column followed by the
    sentences it is compared against. Double quotes are removed from the row
    before it is split on tabs.

    Args:
        filename: Path of the file to read (decoded as UTF-8).

    Returns:
        A list with one single-key dict per usable row, in file order:
        ``{gold_sentence: [similar sentences]}``, where the value holds at
        most ten entries (columns 2 to 11 of the row).

    Rows starting with ``"origin"`` (the header), blank rows and rows whose
    first column is empty are skipped.
    """
    gold_sentences = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(filename, 'rt', encoding='utf-8') as f_p:
        for line in f_p:
            if line.startswith('"origin"'):  # header
                continue

            # Strip first, then test for emptiness: lines read from a file
            # still carry their newline, so testing the raw line never fires.
            line = line.rstrip().replace('"', '')
            if not line:
                continue

            splitted = line.split('\t')
            gold = splitted[0]
            if gold:
                # first column is the key, the next (up to) ten columns its candidates
                gold_sentences.append({gold: splitted[1:11]})

    return gold_sentences

In [13]:
# Parse both gold-standard files: Basque first, then English.
sent_eu, sent_en = (get_gold_sentences(gold_path) for gold_path in (test_eu, test_en))

In [8]:
def initialize_vectors(sent):
    """Build the empty per-sentence result containers.

    Args:
        sent: Sized collection; only its length is used.

    Returns:
        A tuple ``(similarities_all, scores_all)``. Each element is a list
        containing one fresh, independent empty list per entry of ``sent``.
    """
    n_entries = len(sent)
    similarities_all = [[] for _ in range(n_entries)]
    scores_all = [list() for _ in range(n_entries)]
    return similarities_all, scores_all

In [14]:
# One (similarities, scores) container pair per language, Basque first.
(similarities_all_eu, scores_all_eu), (similarities_all_en, scores_all_en) = map(
    initialize_vectors, (sent_eu, sent_en)
)

## 3. Calculate similarities

In [10]:
def calculate_similarities(gold, sim_sentences, embeddings):
    """Compare one gold sentence with each of its candidate sentences.

    Args:
        gold: Text of the reference sentence.
        sim_sentences: Sequence of candidate sentence texts.
        embeddings: Object whose ``embed(sentence)`` call fills in
            ``sentence.embedding``.

    Returns:
        A tuple ``(similarities, score)``. ``similarities`` holds the cosine
        similarity between the gold sentence and each candidate, rounded to
        four decimals, in input order. ``score`` counts the candidates (from
        the second one on) whose similarity is less than or equal to that of
        the candidate right before them.

    Raises:
        AssertionError: If a candidate embedding's shape differs from the
            gold embedding's shape.
    """
    q = Sentence(gold)
    embeddings.embed(q)

    # Loop-invariant: build the similarity module once instead of per candidate.
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    similarities = []
    score = 0
    for i, candidate in enumerate(sim_sentences):
        s = Sentence(candidate)
        embeddings.embed(s)

        assert q.embedding.shape == s.embedding.shape

        prox = cos(q.embedding, s.embedding)
        similarities.append(round(prox.item(), 4))

        # A point is earned whenever similarity does not increase from the
        # previous candidate to this one.
        if i > 0 and similarities[i] <= similarities[i - 1]:
            score += 1

    return similarities, score

def calculate(gold_sentences, embeddings, similarities_all, scores_all):
    """Score every gold sentence with one embedding model.

    For the entry at each position of ``gold_sentences`` the first key is
    taken as the gold sentence and its value as the candidate list. The
    resulting score is appended to ``scores_all[position]`` and the list of
    similarities to ``similarities_all[position]``.

    Both containers are modified in place and also returned, as the tuple
    ``(similarities_all, scores_all)``.
    """
    for position, entry in enumerate(gold_sentences):
        # only the first key/value pair of the entry is used
        gold, sim_sentences = list(entry.items())[0]

        similarities, score = calculate_similarities(gold, sim_sentences, embeddings)

        # record this model's results for the sentence at `position`
        scores_all[position].append(score)
        similarities_all[position].append(similarities)

    return similarities_all, scores_all

In [15]:
# Run every embedding variant over both corpora. calculate() appends each
# variant's results to the per-sentence lists, so the variants end up in the
# same order as `embedding_names`.
for variant, eu_embedding in enumerate(embeddings_all[0]):
    variant_name = embedding_names[variant]

    print(f"Calculating {variant_name} embeddings for basque")
    similarities_all_eu, scores_all_eu = calculate(sent_eu, eu_embedding, similarities_all_eu, scores_all_eu)

    print(f"Calculating {variant_name} embeddings for english")
    similarities_all_en, scores_all_en = calculate(sent_en, embeddings_all[1][variant], similarities_all_en, scores_all_en)

Calculating flair embeddings for basque
Calculating flair embeddings for english
Calculating bert_cased embeddings for basque
Calculating bert_cased embeddings for english
Calculating bert_uncased embeddings for basque
Calculating bert_uncased embeddings for english
Calculating elmo embeddings for basque
Calculating elmo embeddings for english


## 4. Plot similarities

In [16]:
# Shared one-row figure; plot_similarities() adds each heatmap to column index + 1.
# NOTE(review): the grid has 10 columns, so indices of 10 or more do not fit here.
fig = plotly.tools.make_subplots(rows=1, cols=10)

def plot_similarities(sent, similarities_all, scores_all, index=None):
    """Print and plot the results for one gold sentence.

    Prints the gold sentence, every candidate with its similarity under each
    embedding in ``embedding_names``, and the per-embedding scores. Then draws
    a heatmap of the similarities (one row per embedding), adds it to the
    module-level ``fig`` and displays it with ``iplot``.

    Args:
        sent: List of ``{gold sentence: [candidates]}`` dicts.
        similarities_all: Per sentence, one list of similarities per embedding.
        scores_all: Per sentence, one score per embedding.
        index: Position of the sentence to show. When omitted, the
            notebook-level loop variable ``i`` is used, which keeps the old
            three-argument call style working.
    """
    if index is None:
        # Legacy behaviour: rely on the global loop variable.
        index = i

    entry = sent[index]
    origin = list(entry.keys())[0]
    candidates = entry[origin]

    # print origin sentence
    print(origin + '\n')
    for j, candidate in enumerate(candidates):
        # print each similar sentence
        print(f"{j}. {candidate}")
        for k, name in enumerate(embedding_names):
            # print each similarity value for each variant
            print(f"\t {name} similarity: {similarities_all[index][k][j]}")
    # print scores for all variants
    print("Scores: " + ", ".join(f"{embed}: {scor}" for embed, scor in zip(embedding_names, scores_all[index])))

    # plot similarity heatmap
    trace = go.Heatmap(z=similarities_all[index], y=embedding_names, colorscale='Blues')
    fig.append_trace(trace, 1, index + 1)
    iplot([trace], filename='basic-heatmap' + str(index))

for i in range(len(sent_eu)):

    print(f"Sentence #{i}")
    print("Basque")
    # pass the position explicitly instead of relying on the global `i`
    plot_similarities(sent_eu, similarities_all_eu, scores_all_eu, index=i)
    print("English")
    plot_similarities(sent_en, similarities_all_en, scores_all_en, index=i)

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]     [ (1,3) x3,y3 ]     [ (1,4) x4,y4 ]     [ (1,5) x5,y5 ]     [ (1,6) x6,y6 ]     [ (1,7) x7,y7 ]     [ (1,8) x8,y8 ]     [ (1,9) x9,y9 ]     [ (1,10) x10,y10 ]

Sentence #0
Basque
medikuak gaixoa bazkaltzera gonbidatu zuen

0. gaixoak medikua bazkaltzera gonbidatu zuen
	 flair similarity: 0.9796
	 bert_cased similarity: 0.9869
	 bert_uncased similarity: 0.987
	 elmo similarity: 0.9864
1. bazkariak medikua gaixo batera gonbidatu zuen
	 flair similarity: 0.8786
	 bert_cased similarity: 0.9737
	 bert_uncased similarity: 0.9681
	 elmo similarity: 0.8772
2. batera zuen gaixoak medikua gonbidatu bazkari
	 flair similarity: 0.7096
	 bert_cased similarity: 0.9458
	 bert_uncased similarity: 0.9579
	 elmo similarity: 0.7741
Scores: flair: 2, bert_cased: 2, bert_uncased: 2, elmo: 2


English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 flair similarity: 0.9852
	 bert_cased similarity: 0.9965
	 bert_uncased similarity: 0.984
	 elmo similarity: 0.962
1. the lunch invited the doctor for the patient
	 flair similarity: 0.9427
	 bert_cased similarity: 0.9698
	 bert_uncased similarity: 0.8576
	 elmo similarity: 0.8774
2. for invited patient the doctor the lunch
	 flair similarity: 0.8903
	 bert_cased similarity: 0.9317
	 bert_uncased similarity: 0.7869
	 elmo similarity: 0.8197
Scores: flair: 2, bert_cased: 2, bert_uncased: 2, elmo: 2


Sentence #1
Basque
medikuak gaixoa bazkaltzera gonbidatu zuen

0. gaixoak medikua bazkaltzera gonbidatu zuen
	 flair similarity: 0.9796
	 bert_cased similarity: 0.9869
	 bert_uncased similarity: 0.987
	 elmo similarity: 0.9864
1. medikuak ez zuen gaixoa bazkaltzera gonbidatu
	 flair similarity: 0.7504
	 bert_cased similarity: 0.9613
	 bert_uncased similarity: 0.9657
	 elmo similarity: 0.8515
2. umeak aitona gonbidatu zuen bazkaltzera
	 flair similarity: 0.843
	 bert_cased similarity: 0.9269
	 bert_uncased similarity: 0.902
	 elmo similarity: 0.7864
3. medikuak gaixoari esan zion, fraude bat zela
	 flair similarity: 0.6556
	 bert_cased similarity: 0.9148
	 bert_uncased similarity: 0.9284
	 elmo similarity: 0.7262
4. gai hori medikuaren eta gaixoaren artekoa da
	 flair similarity: 0.426
	 bert_cased similarity: 0.8215
	 bert_uncased similarity: 0.8578
	 elmo similarity: 0.5235
5. umeak eta aitona bazkaltzera gonbidatuak izan ziren
	 flair similarity: 0.7815
	 bert_cased similarity: 0.917

English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 flair similarity: 0.9852
	 bert_cased similarity: 0.9965
	 bert_uncased similarity: 0.984
	 elmo similarity: 0.9634
1. the doctor did not invite the patient for lunch
	 flair similarity: 0.9295
	 bert_cased similarity: 0.9707
	 bert_uncased similarity: 0.9105
	 elmo similarity: 0.8994
2. the child invited the grandfather for lunch
	 flair similarity: 0.9297
	 bert_cased similarity: 0.9736
	 bert_uncased similarity: 0.8806
	 elmo similarity: 0.8481
3. the doctor told the patient he was a fraud
	 flair similarity: 0.8371
	 bert_cased similarity: 0.9349
	 bert_uncased similarity: 0.8181
	 elmo similarity: 0.7105
4. that is a matter between the doctor and the patient
	 flair similarity: 0.7775
	 bert_cased similarity: 0.8978
	 bert_uncased similarity: 0.6924
	 elmo similarity: 0.6286
5. the child and the grandfather got invited for lunch
	 flair similarity: 0.8854
	 bert_cased similarity: 0.9584

Sentence #2
Basque
Medikuak gaixoa bazkaltzera gonbidatu zuen

0. Medikuak gaixoa afari batera gonbidatu zuen
	 flair similarity: 0.888
	 bert_cased similarity: 0.9626
	 bert_uncased similarity: 0.9691
	 elmo similarity: 0.9184
1. Medikuak gaixoa bazkaltzera gonbidatu du
	 flair similarity: 0.9254
	 bert_cased similarity: 0.969
	 bert_uncased similarity: 0.9498
	 elmo similarity: 0.9669
2. Medikuak gaixoari esan zion, sendatuko zela
	 flair similarity: 0.6711
	 bert_cased similarity: 0.9261
	 bert_uncased similarity: 0.9367
	 elmo similarity: 0.7402
3. Irakasleak ikaslea bazkaltzera gonbidatu zuen
	 flair similarity: 0.8798
	 bert_cased similarity: 0.9415
	 bert_uncased similarity: 0.9275
	 elmo similarity: 0.8622
4. Suhiltzaileak zientzialaria bazkaltzera gonbidatu zuen
	 flair similarity: 0.8834
	 bert_cased similarity: 0.8932
	 bert_uncased similarity: 0.944
	 elmo similarity: 0.8883
5. Medikuak ez zuen gaixoa bazkaltzera gonbidatu
	 flair similarity: 0.7508
	 bert_cased similarity:

English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 flair similarity: 0.9852
	 bert_cased similarity: 0.9965
	 bert_uncased similarity: 0.984
	 elmo similarity: 0.9621
1. the doctor invited the patient for dinner
	 flair similarity: 0.979
	 bert_cased similarity: 0.9943
	 bert_uncased similarity: 0.9781
	 elmo similarity: 0.983
2. the doctor has invited the patient for lunch
	 flair similarity: 0.9799
	 bert_cased similarity: 0.9846
	 bert_uncased similarity: 0.9366
	 elmo similarity: 0.9655
3. the patient did not invite the doctor for lunch
	 flair similarity: 0.9127
	 bert_cased similarity: 0.9667
	 bert_uncased similarity: 0.8959
	 elmo similarity: 0.8667
4. the doctor examined the patient before lunch
	 flair similarity: 0.9134
	 bert_cased similarity: 0.9579
	 bert_uncased similarity: 0.8866
	 elmo similarity: 0.8298
5. the doctor thought of inviting the patient for lunch
	 flair similarity: 0.9302
	 bert_cased similarity: 0.9763
	 bert_

## 5. Calculate total scores

In [13]:
def calculate_total_score(scores_all, names=None):
    """Print the per-embedding score totals summed over all sentences.

    Args:
        scores_all: Sequence of per-sentence score lists, one score per
            embedding variant. May be empty.
        names: Labels for the columns of ``scores_all``. Defaults to the
            notebook-level ``embedding_names``.

    Prints one line of the form ``Total scores: <name>: <total>, ...``.
    Totals are computed for however many columns the rows have, rather than
    for a fixed four; labels and totals are paired up to the shorter of the two.
    """
    if names is None:
        names = embedding_names

    # Column-wise sums; zip(*rows) yields nothing for empty input, so no
    # special case is needed.
    total_scores = [sum(column) for column in zip(*scores_all)]
    print("Total scores: " + ", ".join(f"{embed}: {scor}" for embed, scor in zip(names, total_scores)))

In [14]:
# Totals per embedding variant: Basque first, then English.
for language_scores in (scores_all_eu, scores_all_en):
    calculate_total_score(language_scores)

Total scores: flair: 6, bert_cased: 5, bert_uncased: 4, elmo: 6
Total scores: flair: 5, bert_cased: 6, bert_uncased: 6, elmo: 6
