In [1]:
import torch

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Put plotly into offline notebook mode before any iplot() call is made further down.
# NOTE(review): connected=True is documented to load plotly.js from a CDN rather than
# embedding it in the notebook -- confirm this is wanted for offline use.
init_notebook_mode(connected=True)

## 1. Load embeddings

In [2]:
# Display labels, in the same order in which the embedders are appended to each
# language slot by the loading cells.
embedding_names = ["flair", "bert_cased", "bert_uncased", "elmo"]
# Alternative labels for the per-layer BERT experiment:
#embedding_names = ["bert4c", "bert4u", "bert3c", "bert3u", "bert2c","bert2u", "bert1c", "bert1u"]

# One list of embedders per language:
#   embeddings_all[0] -> Basque:  [flair_eu, bert_cased_eu, bert_uncased_eu, elmo_eu]
#   embeddings_all[1] -> English: [flair_en, bert_cased_en, bert_uncased_en, elmo_en]
embeddings_all = [[] for _ in range(2)]

In [3]:
# load flair embeddings
# Each language slot receives one pooled document embedder built from a
# forward/backward pair of flair language models (slot 0: 'eu-*', slot 1: 'mix-*').
_flair_model_pairs = [
    ('eu-forward', 'eu-backward'),
    ('mix-forward', 'mix-backward'),
]
for _slot, (_forward_name, _backward_name) in enumerate(_flair_model_pairs):
    _flair_pair = [FlairEmbeddings(_forward_name), FlairEmbeddings(_backward_name)]
    embeddings_all[_slot].append(DocumentPoolEmbeddings(_flair_pair))

In [4]:
# load BERT embeddings

# See BERT paper, section 5.3 and table 7
#bert_layers = list(range(-9, -13, -1))
bert_layers = '-1,-2,-3,-4'
bert_type = 'base' # 'large'


def _pooled_bert(model_name):
    """Wrap the named BERT model, restricted to `bert_layers`, in a DocumentPoolEmbeddings."""
    return DocumentPoolEmbeddings([BertEmbeddings(model_name, layers=bert_layers)])


# (slot-0 model, slot-1 model) per casing. The slot-0 side always uses the
# multilingual base checkpoint; only the slot-1 side follows `bert_type`.
_bert_model_pairs = [
    ('bert-base-multilingual-cased', 'bert-'+bert_type+'-cased'),      # BERT cased
    ('bert-base-multilingual-uncased', 'bert-'+bert_type+'-uncased'),  # BERT uncased
]
for _model_slot0, _model_slot1 in _bert_model_pairs:
    embeddings_all[0].append(_pooled_bert(_model_slot0))
    embeddings_all[1].append(_pooled_bert(_model_slot1))

# bert_layer case: disabled experiment that appends one embedder per BERT layer
# (cased and uncased, for both language slots) instead of one embedder over all
# layers. It is wrapped in a string literal so that it does not execute.
# NOTE(review): with the string form of `bert_layers` assigned above, the loop
# would iterate over single characters ('-', '1', ',', ...); it needs the
# commented-out list form of `bert_layers` to work as intended.
# NOTE(review): as the last expression of the cell, this string literal is
# echoed as the cell's output.
'''

for layer in bert_layers:

    # BERT cased
    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-cased', layers=str(layer))]))
    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-cased', layers=str(layer))]))

    # BERT uncased
    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased', layers=str(layer))]))
    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-uncased', layers=str(layer))]))
    
'''

2019-07-18 09:41:55,747 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-18 09:42:17,759 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


"\n\nfor layer in bert_layers:\n\n    # BERT cased\n    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-cased', layers=str(layer))]))\n    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-cased', layers=str(layer))]))\n\n    # BERT uncased\n    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased', layers=str(layer))]))\n    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-uncased', layers=str(layer))]))\n    \n"

In [5]:
# load ELMo embeddings
# Slot 0 uses explicitly given options/weights files, slot 1 the ELMoEmbeddings defaults.
_elmo_slot0 = ELMoEmbeddings(
    options_file="https://schweter.eu/cloud/eu-elmo/options.json",
    weight_file="https://schweter.eu/cloud/eu-elmo/weights.hdf5",
)
embeddings_all[0].append(DocumentPoolEmbeddings([_elmo_slot0]))

_elmo_slot1 = ELMoEmbeddings()
embeddings_all[1].append(DocumentPoolEmbeddings([_elmo_slot1]))

## 2. Read file and extract sentences

In [6]:
def get_gold_sentences(filename):
    """Read a tab-separated gold-standard file into a list of single-entry dicts.

    Every usable row yields ``{gold: [similar_1, ..., similar_n]}`` where
    ``gold`` is the first column and the list holds at most the ten columns
    that follow it. All double quotes are removed. The header row (a line
    starting with ``"origin"``), blank lines and rows whose first column is
    empty are skipped.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        A list with one ``{gold: similar_sentences}`` dict per row, in file order.
    """
    gold_sentences = []
    # Decode explicitly instead of relying on the platform's locale encoding.
    # NOTE(review): assumes the gold-standard files are stored as UTF-8.
    with open(filename, 'rt', encoding='utf-8') as f_p:
        for line in f_p:
            if line.startswith('"origin"'):  # header
                continue

            # Iterated lines still carry their newline, so a blank line can
            # only be recognised after stripping the trailing whitespace.
            line = line.rstrip()
            if not line:
                continue

            splitted = line.replace('"', '').split('\t')
            gold = splitted[0]
            sim_sentences = splitted[1:11]

            if gold:
                gold_sentences.append({gold: sim_sentences})

    return gold_sentences

def initialize_vectors(sent):
    """Create the empty per-sentence accumulators for similarities and scores.

    Args:
        sent: Sized collection of gold-sentence entries; only its length is used.

    Returns:
        A ``(similarities_all, scores_all)`` tuple of two separate lists, each
        holding ``len(sent)`` independent empty lists.
    """
    count = len(sent)
    per_sentence_similarities = [[] for _ in range(count)]
    per_sentence_scores = [[] for _ in range(count)]
    return per_sentence_similarities, per_sentence_scores

In [7]:
# Basque gold standard: file name, parsed sentences and empty result accumulators.
test_eu = "goldstandard_eu_lexicover.tsv"
sent_eu = get_gold_sentences(test_eu)
similarities_all_eu, scores_all_eu = initialize_vectors(sent_eu)

# English gold standard: same three steps.
test_en = "goldstandard_en_lexicover.tsv"
sent_en = get_gold_sentences(test_en)
similarities_all_en, scores_all_en = initialize_vectors(sent_en)

## 3. Calculate similarities

In [8]:
def calculate_similarities(gold, sim_sentences, embeddings):
    """Compare one gold sentence with its candidate sentences by cosine similarity.

    Args:
        gold: Text of the reference sentence.
        sim_sentences: List of candidate sentence texts, in their expected
            order of decreasing similarity to ``gold``.
        embeddings: Object whose ``embed(sentence)`` call sets
            ``sentence.embedding`` on the given ``Sentence``.

    Returns:
        A ``(similarities, score)`` tuple. ``similarities`` holds the cosine
        similarity of every candidate to ``gold``, rounded to four decimals.
        ``score`` is the integer percentage of candidates whose similarity is
        not higher than that of the candidate before them. For an empty
        candidate list the result is ``([], 0)``.

    Raises:
        AssertionError: If a candidate embedding has a different shape than
            the gold embedding.
    """
    # Nothing to compare against: return early instead of dividing by zero below.
    if not sim_sentences:
        return [], 0

    q = Sentence(gold)
    embeddings.embed(q)

    # The similarity module does not depend on the candidate, so build it once.
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    similarities = []
    score = 0
    for i, candidate in enumerate(sim_sentences):
        s = Sentence(candidate)
        embeddings.embed(s)

        assert q.embedding.shape == s.embedding.shape

        prox = cos(q.embedding, s.embedding)
        similarities.append(round(prox.item(), 4))

        # One point for every candidate that does not beat its predecessor.
        if i > 0 and similarities[i] <= similarities[i-1]:
            score += 1

    # NOTE: the first candidate can never earn a point, yet the total is divided
    # by the number of candidates; a perfectly ordered list of n candidates
    # therefore scores int((n-1)/n*100), not 100.
    return similarities, int(score/float(len(sim_sentences))*100)

def calculate(gold_sentences, embeddings, similarities_all, scores_all):
    """Run ``calculate_similarities`` for every gold sentence with one embedder.

    Args:
        gold_sentences: List of single-entry dicts ``{gold: [similar, ...]}``.
        embeddings: Embedder handed through to ``calculate_similarities``.
        similarities_all: Per-sentence accumulator; entry ``n`` gets the
            similarity list of sentence ``n`` appended.
        scores_all: Per-sentence accumulator; entry ``n`` gets the score of
            sentence ``n`` appended.

    Returns:
        The same ``(similarities_all, scores_all)`` objects, extended in place.
    """
    for position, entry in enumerate(gold_sentences):
        # every entry maps exactly one gold sentence to its similar sentences
        gold = list(entry)[0]
        similarities, score = calculate_similarities(gold, entry[gold], embeddings)

        # record the result of this embedder under the sentence's position
        scores_all[position].append(score)
        similarities_all[position].append(similarities)

    return similarities_all, scores_all

In [9]:
# Start from empty accumulators so that re-running this cell replaces the
# results instead of appending a second set of similarities/scores to them.
similarities_all_eu, scores_all_eu = initialize_vectors(sent_eu)
similarities_all_en, scores_all_en = initialize_vectors(sent_en)

# Embedders are processed in the order given by embedding_names, so position k
# of every per-sentence result list belongs to embedding_names[k].
for i, embedder_eu in enumerate(embeddings_all[0]):
    label = embedding_names[i]
    print(f"Calculating {label} embeddings for basque")
    similarities_all_eu, scores_all_eu = calculate(sent_eu, embedder_eu, similarities_all_eu, scores_all_eu)
    print(f"Calculating {label} embeddings for english")
    similarities_all_en, scores_all_en = calculate(sent_en, embeddings_all[1][i], similarities_all_en, scores_all_en)

Calculating flair embeddings for basque
Calculating flair embeddings for english
Calculating bert_cased embeddings for basque
Calculating bert_cased embeddings for english
Calculating bert_uncased embeddings for basque
Calculating bert_uncased embeddings for english
Calculating elmo embeddings for basque
Calculating elmo embeddings for english


## 4. Plot similarities

In [10]:
# Shared figure with a 1x10 subplot grid; plot_similarities() appends one heatmap
# trace per call, at row 1 and column <sentence index>+1.
# NOTE(review): with cols=10 this presumably only accommodates sentence indices 0-9,
# and the figure itself is never displayed in the visible cells -- confirm intent.
fig = plotly.tools.make_subplots(rows=1, cols=10)

def plot_similarities(sent, similarities_all, scores_all, index=None):
    """Print and plot the similarity values of one gold sentence.

    Prints the gold sentence, each similar sentence with its similarity under
    every embedding in ``embedding_names``, and the per-embedding scores. It
    then draws a heatmap of the similarities and also appends that heatmap to
    the module-level ``fig`` at row 1, column ``index + 1``.

    Args:
        sent: List of single-entry dicts ``{gold: [similar, ...]}``.
        similarities_all: ``similarities_all[index][k][j]`` is the similarity
            of similar sentence ``j`` under embedding ``k``.
        scores_all: ``scores_all[index][k]`` is the score of embedding ``k``.
        index: Position of the sentence to show. When omitted, the
            module-level loop variable ``i`` is used, which keeps existing
            three-argument calls working.
    """
    if index is None:
        # Legacy behaviour: fall back on the caller's global loop variable.
        index = i

    origin = list(sent[index].keys())[0]
    candidates = sent[index][origin]
    sentence_similarities = similarities_all[index]

    # print origin sentence
    print(origin + '\n')
    for j, candidate in enumerate(candidates):
        # print each similar sentence
        print(f"{j}. {candidate}")
        for k, name in enumerate(embedding_names):
            # print each similarity value for each variant
            print(f"\t {name} similarity: {sentence_similarities[k][j]}")
    # print scores for all variants
    print("Scores: " + ", ".join(f"{embed}: {scor}%" for embed, scor in zip(embedding_names, scores_all[index])))

    # plot similarity heatmap (one row per embedding, one column per similar sentence)
    trace = go.Heatmap(z=sentence_similarities, y=embedding_names, colorscale='Blues')
    data = [trace]
    fig.append_trace(trace, 1, index+1)

    iplot(data, filename='basic-heatmap' + str(index))

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]     [ (1,3) x3,y3 ]     [ (1,4) x4,y4 ]     [ (1,5) x5,y5 ]     [ (1,6) x6,y6 ]     [ (1,7) x7,y7 ]     [ (1,8) x8,y8 ]     [ (1,9) x9,y9 ]     [ (1,10) x10,y10 ]



In [11]:
# Print and plot every gold sentence, first for Basque and then for English.
# NOTE: the loop variable has to be named `i` -- plot_similarities() is called
# without a sentence index here and reads the module-level `i` instead.
# NOTE(review): the range follows the English list only; if the Basque list is
# shorter, the Basque call will index past its end (the disabled line below
# would guard against that).
#for i in range(min(len(sent_eu), len(sent_en))):
for i in range(len(sent_en)):
    
    print(f"\nSentence #{i}")
    print("Basque")
    plot_similarities(sent_eu, similarities_all_eu, scores_all_eu)
    print("English")
    plot_similarities(sent_en, similarities_all_en, scores_all_en)


Sentence #0
Basque
medikuak gaixoa bazkaltzera gonbidatu zuen

0. gaixoak medikua bazkaltzera gonbidatu zuen
	 flair similarity: 0.9796
	 bert_cased similarity: 0.9869
	 bert_uncased similarity: 0.987
	 elmo similarity: 0.9863
1. bazkariak medikua gaixo batera gonbidatu zuen
	 flair similarity: 0.8786
	 bert_cased similarity: 0.9737
	 bert_uncased similarity: 0.9681
	 elmo similarity: 0.8772
2. batera zuen gaixoak medikua gonbidatu bazkari
	 flair similarity: 0.7096
	 bert_cased similarity: 0.9458
	 bert_uncased similarity: 0.9579
	 elmo similarity: 0.7741
Scores: flair: 66%, bert_cased: 66%, bert_uncased: 66%, elmo: 66%


English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 flair similarity: 0.9852
	 bert_cased similarity: 0.9965
	 bert_uncased similarity: 0.984
	 elmo similarity: 0.9624
1. the lunch invited the doctor for the patient
	 flair similarity: 0.9427
	 bert_cased similarity: 0.9698
	 bert_uncased similarity: 0.8576
	 elmo similarity: 0.877
2. for invited patient the doctor the lunch
	 flair similarity: 0.8903
	 bert_cased similarity: 0.9317
	 bert_uncased similarity: 0.7869
	 elmo similarity: 0.8204
Scores: flair: 66%, bert_cased: 66%, bert_uncased: 66%, elmo: 66%



Sentence #1
Basque
medikuak gaixoa bazkaltzera gonbidatu zuen

0. gaixoak medikua bazkaltzera gonbidatu zuen
	 flair similarity: 0.9796
	 bert_cased similarity: 0.9869
	 bert_uncased similarity: 0.987
	 elmo similarity: 0.9864
1. medikuak ez zuen gaixoa bazkaltzera gonbidatu
	 flair similarity: 0.7504
	 bert_cased similarity: 0.9613
	 bert_uncased similarity: 0.9657
	 elmo similarity: 0.8514
2. umeak aitona gonbidatu zuen bazkaltzera
	 flair similarity: 0.843
	 bert_cased similarity: 0.9269
	 bert_uncased similarity: 0.902
	 elmo similarity: 0.7864
3. medikuak gaixoa hil zuen
	 flair similarity: 0.7973
	 bert_cased similarity: 0.9383
	 bert_uncased similarity: 0.9464
	 elmo similarity: 0.8315
4. gai hori medikuaren eta gaixoaren artekoa da
	 flair similarity: 0.426
	 bert_cased similarity: 0.8215
	 bert_uncased similarity: 0.8578
	 elmo similarity: 0.5234
5. asko gustatuko litzaidake bazkaltzera gonbidatua izatea
	 flair similarity: 0.6617
	 bert_cased similarity: 0.9131
	 bert_uncase

English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 flair similarity: 0.9852
	 bert_cased similarity: 0.9965
	 bert_uncased similarity: 0.984
	 elmo similarity: 0.9634
1. the doctor did not invite the patient for lunch
	 flair similarity: 0.9295
	 bert_cased similarity: 0.9707
	 bert_uncased similarity: 0.9105
	 elmo similarity: 0.8994
2. the child invited the grandfather for lunch
	 flair similarity: 0.9297
	 bert_cased similarity: 0.9736
	 bert_uncased similarity: 0.8806
	 elmo similarity: 0.8481
3. the doctor killed the patient after lunch
	 flair similarity: 0.9317
	 bert_cased similarity: 0.9433
	 bert_uncased similarity: 0.8406
	 elmo similarity: 0.8121
4. that is a matter between the doctor and the patient
	 flair similarity: 0.7775
	 bert_cased similarity: 0.8978
	 bert_uncased similarity: 0.6924
	 elmo similarity: 0.6282
5. I wish I got invited for lunch
	 flair similarity: 0.738
	 bert_cased similarity: 0.8808
	 bert_uncased similar


Sentence #2
Basque
medikuak pazientea bazkaltzera gonbidatu zuen

0. kirurgialariak pazientea bazkaltzera gonbidatu zuen
	 flair similarity: 0.9623
	 bert_cased similarity: 0.9823
	 bert_uncased similarity: 0.9812
	 elmo similarity: 0.9659
1. medikuak gaixoa bazkaltzera gonbidatu zuen
	 flair similarity: 0.9538
	 bert_cased similarity: 0.9737
	 bert_uncased similarity: 0.9813
	 elmo similarity: 0.9604
2. katedradunak pazientea bazkaltzera gonbidatu zuen
	 flair similarity: 0.9348
	 bert_cased similarity: 0.9667
	 bert_uncased similarity: 0.9598
	 elmo similarity: 0.9293
3. medikuak gaixoa otordu batera gonbidatu zuen
	 flair similarity: 0.8536
	 bert_cased similarity: 0.9579
	 bert_uncased similarity: 0.9593
	 elmo similarity: 0.8885
4. medikuak gaixoa eraman zuen tea hartzera
	 flair similarity: 0.7391
	 bert_cased similarity: 0.9255
	 bert_uncased similarity: 0.9407
	 elmo similarity: 0.792
5. medikuak pazientearen bazkaria ordaindu zuen
	 flair similarity: 0.8423
	 bert_cased simil

English
the doctor invited the patient for lunch

0. the surgeon invited the patient for lunch
	 flair similarity: 0.9853
	 bert_cased similarity: 0.995
	 bert_uncased similarity: 0.9792
	 elmo similarity: 0.9665
1. the doctor invited the doctor for lunch
	 flair similarity: 0.9801
	 bert_cased similarity: 0.9901
	 bert_uncased similarity: 0.9546
	 elmo similarity: 0.9535
2. the professor invited the patient for lunch
	 flair similarity: 0.9783
	 bert_cased similarity: 0.9917
	 bert_uncased similarity: 0.9724
	 elmo similarity: 0.9373
3. the doctor invited the patient for a meal
	 flair similarity: 0.9546
	 bert_cased similarity: 0.9886
	 bert_uncased similarity: 0.9621
	 elmo similarity: 0.9545
4. the doctor took the patient out for tea
	 flair similarity: 0.8749
	 bert_cased similarity: 0.9721
	 bert_uncased similarity: 0.8992
	 elmo similarity: 0.8061
5. the doctor paid for the patient's lunch
	 flair similarity: 0.8895
	 bert_cased similarity: 0.966
	 bert_uncased similarity: 0.885


Sentence #3
Basque
medikuak pazientea bazkaltzera gonbidatu zuen

0. medikuak pazientea bazkaltzera gonbiddatu zuen
	 flair similarity: 0.9763
	 bert_cased similarity: 0.9915
	 bert_uncased similarity: 0.9934
	 elmo similarity: 0.9743
1. medikuak pazientea bazkaltzera patatatu zuen
	 flair similarity: 0.9144
	 bert_cased similarity: 0.9773
	 bert_uncased similarity: 0.9829
	 elmo similarity: 0.9276
2. Stefanek pazientea bazkaltzera gonbidatu zuen
	 flair similarity: 0.9215
	 bert_cased similarity: 0.9535
	 bert_uncased similarity: 0.9582
	 elmo similarity: 0.9174
3. medikuak gaixoa sushira gonbidatu zuen
	 flair similarity: 0.8469
	 bert_cased similarity: 0.9528
	 bert_uncased similarity: 0.9621
	 elmo similarity: 0.8915
4. medikuak patentea bazkaltzera gonbidatu zuen medikuak pazientea jotzera gonbidatu zuen
	 flair similarity: 0.9465
	 bert_cased similarity: 0.9569
	 bert_uncased similarity: 0.9436
	 elmo similarity: 0.9506
Scores: flair: 40%, bert_cased: 60%, bert_uncased: 60%, elm

English
the doctor invited the patient for lunch

0. the doctor invitted the patient for lunch
	 flair similarity: 0.9851
	 bert_cased similarity: 0.9749
	 bert_uncased similarity: 0.9186
	 elmo similarity: 0.9704
1. the doctor kartoffeled the patient for lunch
	 flair similarity: 0.9346
	 bert_cased similarity: 0.9712
	 bert_uncased similarity: 0.918
	 elmo similarity: 0.8896
2. Stefan invited the patient for lunch
	 flair similarity: 0.9444
	 bert_cased similarity: 0.976
	 bert_uncased similarity: 0.936
	 elmo similarity: 0.8437
3. the doctor invited the patient for sushi
	 flair similarity: 0.9544
	 bert_cased similarity: 0.9745
	 bert_uncased similarity: 0.8963
	 elmo similarity: 0.9178
4. the doctor invited the patent for lunch
	 flair similarity: 0.9415
	 bert_cased similarity: 0.9808
	 bert_uncased similarity: 0.9114
	 elmo similarity: 0.8626
5. the doctor invited the patent for linch
	 flair similarity: 0.8963
	 bert_cased similarity: 0.9265
	 bert_uncased similarity: 0.7207
	 

## 5. Calculate total scores

In [12]:
def calculate_total_score(scores_all):
    """Print the per-embedding sum of the scores over all sentences.

    Args:
        scores_all: List with one entry per sentence; each entry is the list
            of scores of that sentence, one per embedding, in the order of
            ``embedding_names``. The number of embeddings is taken from the
            first entry. May be empty, in which case no totals are printed
            after the label.

    Returns:
        None. The totals are only printed, labelled with ``embedding_names``.
    """
    # Size the totals from the data instead of assuming exactly four embeddings.
    embedding_count = len(scores_all[0]) if scores_all else 0
    total_scores = [
        sum(sentence_scores[k] for sentence_scores in scores_all)
        for k in range(embedding_count)
    ]
    print("Total scores: " + ", ".join(f"{embed}: {scor}" for embed, scor in zip(embedding_names, total_scores)))

In [13]:
# Print the summed per-sentence scores for each embedding.
# Only the English totals are printed; the Basque call is disabled.
#calculate_total_score(scores_all_eu)
calculate_total_score(scores_all_en)

Total scores: flair: 232, bert_cased: 248, bert_uncased: 265, elmo: 248
