In [1]:
import torch

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, BertEmbeddings, ELMoEmbeddings

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

## 1. Load embeddings

In [2]:
# [[flair_eu, bert_cased_eu, bert_uncased eu, elmo_eu], [flair_en, bert_cased_en, bert_uncased en, elmo_en]]
embeddings_all = [[], []]
embedding_names = ["flair", "bert_cased", "bert_uncased", "elmo"]
#embedding_names = ["bert4c", "bert4u", "bert3c", "bert3u", "bert2c","bert2u", "bert1c", "bert1u"]

In [3]:
# load flair embeddings
embeddings_all[0].append(DocumentPoolEmbeddings([FlairEmbeddings('eu-forward'), FlairEmbeddings('eu-backward')]))
embeddings_all[1].append(DocumentPoolEmbeddings([FlairEmbeddings('mix-forward'), FlairEmbeddings('mix-backward')]))

In [15]:
# load BERT embeddings

# See BERT paper, section 5.3 and table 7
#bert_layers = list(range(-9, -13, -1))
bert_layers = '-1,-2,-3,-4'
bert_type = 'base' # 'large'

# BERT cased
embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-cased', layers=bert_layers)]))
embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-cased', layers=bert_layers)]))

# BERT uncased
embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased', layers=bert_layers)]))
embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-uncased', layers=bert_layers)]))

# bert_layer case
'''

for layer in bert_layers:

    # BERT cased
    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-cased', layers=str(layer))]))
    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-cased', layers=str(layer))]))

    # BERT uncased
    embeddings_all[0].append(DocumentPoolEmbeddings([BertEmbeddings('bert-base-multilingual-uncased', layers=str(layer))]))
    embeddings_all[1].append(DocumentPoolEmbeddings([BertEmbeddings('bert-'+bert_type+'-uncased', layers=str(layer))]))
    
'''

2019-07-17 18:37:31,777 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-17 18:37:57,467 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-17 18:38:52,903 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-17 18:39:20,593 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.
2019-07-17 18:40:36,884 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_c

In [5]:
# load ELMo embeddings
embeddings_all[0].append(DocumentPoolEmbeddings([ELMoEmbeddings(options_file="https://schweter.eu/cloud/eu-elmo/options.json", 
                                                                weight_file="https://schweter.eu/cloud/eu-elmo/weights.hdf5")]))
embeddings_all[1].append(DocumentPoolEmbeddings([ELMoEmbeddings()]))

## 2. Read file and extract sentences

In [4]:
def get_gold_sentences(filename):
    gold_sentences = []
    with open(filename, 'rt') as f_p:
        for line in f_p:
            if line.startswith('"origin"'): # header
                continue
            
            if not line:
                continue
            
            line = line.rstrip()
            line = line.replace('"', '')
            splitted = line.split('\t')
            gold = splitted[0]
            sim_sentences = splitted[1:11]
            
            if gold:
                gold_sentences_simple = {}
                gold_sentences_simple[gold] = sim_sentences
                gold_sentences.append(gold_sentences_simple)
            
    return gold_sentences

def initialize_vectors(sent):
    similarities_all = []
    for i in range(len(sent)):
        similarities_all.append([])

    scores_all = []
    for i in range(len(sent)):
        scores_all.append([])
        
    return similarities_all, scores_all

In [5]:
test_eu = "goldstandard_eu_lexicover.tsv"
test_en = "goldstandard_en_lexicover.tsv"

sent_eu = get_gold_sentences(test_eu)
sent_en = get_gold_sentences(test_en)

similarities_all_eu, scores_all_eu = initialize_vectors(sent_eu)
similarities_all_en, scores_all_en = initialize_vectors(sent_en)

## 3. Calculate similarities

In [16]:
def calculate_similarities(gold, sim_sentences, embeddings):
    
    similarities = []
    query = gold

    q = Sentence(query)
    embeddings.embed(q)
    
    print(q)
    print(q.embedding)
    
    score = 0
    
    for i in range(len(sim_sentences)):
        
        s = Sentence(sim_sentences[i])
        embeddings.embed(s)

        assert q.embedding.shape == s.embedding.shape
        
        cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
        prox = cos(q.embedding, s.embedding)
        similarities.append(round(prox.item(), 4))
        
        if i > 0 and similarities[i] <= similarities[i-1]:
            score += 1
        
    return similarities, int(score/float(len(sim_sentences))*100)

def calculate(gold_sentences, embeddings, similarities_all, scores_all):
    
    for i in range(len(gold_sentences)):
        
        # obtain gold sentence and similar sentences from global list
        gold = list(gold_sentences[i].keys())[0]
        sim_sentences = gold_sentences[i][gold]
        
        # Calculate similarities for each 'gold' sentence and accumulated score
        similarities, score = calculate_similarities(gold, sim_sentences, embeddings)

        # append current similarity values and score to the global data structures
        scores_all[i].append(score)
        similarities_all[i].append(similarities)

        
    return similarities_all, scores_al

In [17]:
for i in range(len(embeddings_all[0])):
    print(f"Calculating {embedding_names[i]} embeddings for basque")
    similarities_all_eu, scores_all_eu = calculate(sent_eu, embeddings_all[0][i], similarities_all_eu, scores_all_eu)
    print(f"Calculating {embedding_names[i]} embeddings for english")
    similarities_all_en, scores_all_en = calculate(sent_en, embeddings_all[1][i], similarities_all_en, scores_all_en)

Calculating flair embeddings for basque
Sentence: "medikuak gaixoa bazkaltzera gonbidatu zuen" - 5 Tokens
tensor([-0.0302,  0.3141,  0.0454,  ..., -0.0182,  0.0214,  0.1127],
       grad_fn=<CatBackward>)
Sentence: "gaixoak medikua bazkaltzera gonbidatu zuen" - 5 Tokens
tensor([-0.0405,  0.3071,  0.0508,  ..., -0.0208,  0.0203,  0.0724],
       grad_fn=<CatBackward>)


TypeError: cannot unpack non-iterable NoneType object

## 4. Plot similarities

In [11]:
fig = plotly.tools.make_subplots(rows=1, cols=10)

def plot_similarities(sent, similarities_all, scores_all):
    origin = list(sent[i].keys())[0]
    # print origin sentence
    print(origin + '\n')
    for j in range(len(sent[i][origin])):
        # print each similar sentence
        print(f"{j}. {sent[i][origin][j]}")
        for k in range(len(embedding_names)):
            # print each similarity value for each variant
            print(f"\t {embedding_names[k]} similarity: {similarities_all[i][k][j]}")
    # print scores for all variants
    print(f"Scores: " + ", ".join(f"{embed}: {scor}%" for embed, scor in zip(embedding_names, scores_all[i])))
    
    # plot similarity heatmap
    trace = go.Heatmap(z=similarities_all[i], y=embedding_names, colorscale='Blues')
    data=[trace]
    fig.append_trace(trace, 1, i+1)

    iplot(data, filename='basic-heatmap' + str(i))

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]     [ (1,3) x3,y3 ]     [ (1,4) x4,y4 ]     [ (1,5) x5,y5 ]     [ (1,6) x6,y6 ]     [ (1,7) x7,y7 ]     [ (1,8) x8,y8 ]     [ (1,9) x9,y9 ]     [ (1,10) x10,y10 ]



In [18]:
#for i in range(min(len(sent_eu), len(sent_en))):
for i in range(len(sent_en)):
    
    print(f"\nSentence #{i}")
    print("Basque")
    plot_similarities(sent_eu, similarities_all_eu, scores_all_eu)
    print("English")
    plot_similarities(sent_en, similarities_all_en, scores_all_en)


Sentence #0
English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 bert4c similarity: 0.9982
	 bert4u similarity: 0.9962
	 bert3c similarity: 0.9988
	 bert3u similarity: 0.9977
	 bert2c similarity: 0.9992
	 bert2u similarity: 0.9987
	 bert1c similarity: 0.9998
	 bert1u similarity: 0.9989
1. the lunch invited the doctor for the patient
	 bert4c similarity: 0.984
	 bert4u similarity: 0.9732
	 bert3c similarity: 0.986
	 bert3u similarity: 0.9821
	 bert2c similarity: 0.9887
	 bert2u similarity: 0.9882
	 bert1c similarity: 0.9897
	 bert1u similarity: 0.9877
2. for invited patient the doctor the lunch
	 bert4c similarity: 0.9607
	 bert4u similarity: 0.9111
	 bert3c similarity: 0.9765
	 bert3u similarity: 0.9556
	 bert2c similarity: 0.9907
	 bert2u similarity: 0.9848
	 bert1c similarity: 0.999
	 bert1u similarity: 0.9946
Scores: bert4c: 66%, bert4u: 66%, bert3c: 66%, bert3u: 66%, bert2c: 33%, bert2u: 66%, bert1c: 33%, bert1u: 33%



Sentence #1
English
the doctor invited the patient for lunch

0. the patient invited the doctor for lunch
	 bert4c similarity: 0.9982
	 bert4u similarity: 0.9962
	 bert3c similarity: 0.9988
	 bert3u similarity: 0.9977
	 bert2c similarity: 0.9992
	 bert2u similarity: 0.9987
	 bert1c similarity: 0.9998
	 bert1u similarity: 0.9989
1. the doctor did not invite the patient for lunch
	 bert4c similarity: 0.9567
	 bert4u similarity: 0.9373
	 bert3c similarity: 0.9485
	 bert3u similarity: 0.9249
	 bert2c similarity: 0.9436
	 bert2u similarity: 0.9277
	 bert1c similarity: 0.9323
	 bert1u similarity: 0.9036
2. the child invited the grandfather for lunch
	 bert4c similarity: 0.9268
	 bert4u similarity: 0.9024
	 bert3c similarity: 0.8998
	 bert3u similarity: 0.8626
	 bert2c similarity: 0.8708
	 bert2u similarity: 0.8421
	 bert1c similarity: 0.8292
	 bert1u similarity: 0.7853
3. the doctor killed the patient after lunch
	 bert4c similarity: 0.9343
	 bert4u similarity: 0.9087
	 bert3c similarity: 0


Sentence #2
English
the doctor invited the patient for lunch

0. the surgeon invited the patient for lunch
	 bert4c similarity: 0.9842
	 bert4u similarity: 0.9699
	 bert3c similarity: 0.9777
	 bert3u similarity: 0.9622
	 bert2c similarity: 0.9706
	 bert2u similarity: 0.9514
	 bert1c similarity: 0.9566
	 bert1u similarity: 0.9358
1. the doctor invited the doctor for lunch
	 bert4c similarity: 0.9747
	 bert4u similarity: 0.9502
	 bert3c similarity: 0.9662
	 bert3u similarity: 0.9436
	 bert2c similarity: 0.9619
	 bert2u similarity: 0.9441
	 bert1c similarity: 0.9511
	 bert1u similarity: 0.9248
2. the professor invited the patient for lunch
	 bert4c similarity: 0.974
	 bert4u similarity: 0.9627
	 bert3c similarity: 0.9653
	 bert3u similarity: 0.9519
	 bert2c similarity: 0.9552
	 bert2u similarity: 0.9442
	 bert1c similarity: 0.9391
	 bert1u similarity: 0.9289
3. the doctor invited the patient for a meal
	 bert4c similarity: 0.9742
	 bert4u similarity: 0.9578
	 bert3c similarity: 0.9635
	 


Sentence #3
English
the doctor invited the patient for lunch

0. the doctor invitted the patient for lunch
	 bert4c similarity: 0.9619
	 bert4u similarity: 0.9288
	 bert3c similarity: 0.9474
	 bert3u similarity: 0.9193
	 bert2c similarity: 0.944
	 bert2u similarity: 0.9171
	 bert1c similarity: 0.9229
	 bert1u similarity: 0.8857
1. the doctor kartoffeled the patient for lunch
	 bert4c similarity: 0.9432
	 bert4u similarity: 0.9151
	 bert3c similarity: 0.9207
	 bert3u similarity: 0.8927
	 bert2c similarity: 0.912
	 bert2u similarity: 0.884
	 bert1c similarity: 0.8813
	 bert1u similarity: 0.8494
2. Stefan invited the patient for lunch
	 bert4c similarity: 0.9436
	 bert4u similarity: 0.92
	 bert3c similarity: 0.9309
	 bert3u similarity: 0.9095
	 bert2c similarity: 0.9228
	 bert2u similarity: 0.9042
	 bert1c similarity: 0.9008
	 bert1u similarity: 0.8827
3. the doctor invited the patient for sushi
	 bert4c similarity: 0.9481
	 bert4u similarity: 0.9329
	 bert3c similarity: 0.9266
	 bert3u 

## 5. Calculate total scores

In [13]:
def calculate_total_score(scores_all):
    total_scores = [0] * len(scores_all[0])
    for i in range(len(scores_all)):
        total_scores[0] += scores_all[i][0]
        total_scores[1] += scores_all[i][1]
        total_scores[2] += scores_all[i][2]
        total_scores[3] += scores_all[i][3]
    print(f"Total scores: " + ", ".join(f"{embed}: {scor}" for embed, scor in zip(embedding_names, total_scores)))

In [14]:
#calculate_total_score(scores_all_eu)
calculate_total_score(scores_all_en)

Total scores: bert8c: 231, bert8u: 264, bert7c: 231, bert7u: 264, bert6c: 0, bert6u: 0, bert6c: 0, bert6u: 0
