In [1]:
import torch

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, BertEmbeddings

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
language = 'en'
gold_filename = "goldstandard_"+language+".tsv"

def get_gold_sentences(filename):
    gold_sentences = {}
    with open(filename, 'rt') as f_p:
        for line in f_p:
            if line.startswith('"origin"'): # header
                continue
            
            if not line:
                continue
            
            line = line.rstrip()
            line = line.replace('"', '')
            splitted = line.split('\t')
            gold = splitted[0]
            sim_sentences = splitted[1:11]
            
            if gold:
                gold_sentences[gold] = sim_sentences
            
    return gold_sentences

In [3]:
gold_sentences = get_gold_sentences(gold_filename)

In [4]:
def calculate_similarities(gold, sim_sentences, embeddings):
    
    similarities = []
    query = gold

    q = Sentence(query)
    embeddings.embed(q)
    score = 0
    i = 0
    
    for sentence in sim_sentences:
        
        s = Sentence(sentence)
        embeddings.embed(s)

        assert q.embedding.shape == s.embedding.shape
        
        cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
        prox = cos(q.embedding, s.embedding)
    
        similarities.append(round(prox.item(), 4))
        if i > 0 and similarities[i] <= similarities[i-1]:
            score += 1
        i += 1
        
    return similarities, score

def print_similarities(gold, sim_sentences, similarities, score):
        
    print(f"Example: {gold}")
    for sentence, similarity in zip(sim_sentences, similarities):
        print(f"{sentence} - {similarity}")
    print(f"Score: {score}\n")  
    

In [5]:
# load flair embeddings
if language == 'eu':
    flair_embeds = 'eu'
elif language == 'en':
    flair_embeds = 'mix'
flair_embeddings = DocumentPoolEmbeddings([FlairEmbeddings(flair_embeds+'-forward'), FlairEmbeddings(flair_embeds+'-backward')])

In [6]:
# load BERT embeddings
# See BERT paper, section 5.3 and table 7
bert_layers = '-1,-2,-3,-4'

if language == 'en':
    bert_embed = 'bert-base-'
else:
    bert_embed = 'bert-base-multilingual-'

bert_cased_embeddings = DocumentPoolEmbeddings([BertEmbeddings(bert_embed + 'cased', layers=bert_layers)])
bert_uncased_embeddings = DocumentPoolEmbeddings([BertEmbeddings(bert_embed + 'uncased', layers=bert_layers)])


2019-07-03 21:14:07,584 The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [7]:
# load ELMo and ELMo multilingual embeddings

In [8]:
def calculate_and_print(embeddings, similarities_all, scores_all):
    i = 0
    for gold, sim_sentences in gold_sentences.items():
        similarities, score = calculate_similarities(gold, sim_sentences, embeddings)
        scores_all[i].append(score)
        print_similarities(gold, sim_sentences, similarities, score)
        similarities_all[i].append(similarities)
        i += 1
    return similarities_all, scores_all

In [9]:
similarities_all = []
for i in range(len(gold_sentences)):
    similarities_all.append([])

In [10]:
scores_all = []
for i in range(len(gold_sentences)):
    scores_all.append([])

In [11]:
# calculate similarities based on flair embeddings
similarities_all, scores_all = calculate_and_print(flair_embeddings, similarities_all, scores_all)

Example: Jokin invited Amaia for lunch
Jokin offered an invitation to Amaia for lunch - 0.9032
Amaia received the question from Jokin, if she would like to go for lunch - 0.7663
Jokin offered an invitation to Amaia for a meal - 0.8483
Jokin and Amaia met for lunch - 0.9063
Amaia invited Jokin for lunch - 0.9816
Amaia and Jokin are friends - 0.7295
I went for lunch yesterday - 0.7229
I invited my mom to a trip - 0.7025
Jokin enjoys having lunch - 0.8233
I saw some ducks at the park - 0.6765
Score: 5

Example: Mikel went to the mountain
Up the mountain went Mikel - 0.7971
Mikel was in the mountain - 0.8545
Mikel made the plan to go to the mountain - 0.8723
Mikel made the comment that maybe he'd go to the mountain - 0.8243
Mikel didn't go to the mountain - 0.9017
Mikel likes hiking - 0.6638
The mountain looks beautiful today - 0.675
The bird went to the park - 0.8453
There are very good films in the cinema now - 0.5813
Score: 3

Example: Yesterday you came to our house for dinner
You came

In [12]:
# calculate similarities based on bert uncased embeddings
similarities_all, scores_all = calculate_and_print(bert_uncased_embeddings, similarities_all, scores_all)

Example: Jokin invited Amaia for lunch
Jokin offered an invitation to Amaia for lunch - 0.9196
Amaia received the question from Jokin, if she would like to go for lunch - 0.805
Jokin offered an invitation to Amaia for a meal - 0.8579
Jokin and Amaia met for lunch - 0.8632
Amaia invited Jokin for lunch - 0.9889
Amaia and Jokin are friends - 0.7658
I went for lunch yesterday - 0.6821
I invited my mom to a trip - 0.6734
Jokin enjoys having lunch - 0.7937
I saw some ducks at the park - 0.6475
Score: 5

Example: Mikel went to the mountain
Up the mountain went Mikel - 0.788
Mikel was in the mountain - 0.8283
Mikel made the plan to go to the mountain - 0.8574
Mikel made the comment that maybe he'd go to the mountain - 0.7929
Mikel didn't go to the mountain - 0.871
Mikel likes hiking - 0.6668
The mountain looks beautiful today - 0.6549
The bird went to the park - 0.7609
There are very good films in the cinema now - 0.5628
Score: 4

Example: Yesterday you came to our house for dinner
You came f

In [13]:
# calculate similarities based on bert cased embeddings
similarities_all, scores_all = calculate_and_print(bert_cased_embeddings, similarities_all, scores_all)

Example: Jokin invited Amaia for lunch
Jokin offered an invitation to Amaia for lunch - 0.9666
Amaia received the question from Jokin, if she would like to go for lunch - 0.9206
Jokin offered an invitation to Amaia for a meal - 0.9576
Jokin and Amaia met for lunch - 0.9646
Amaia invited Jokin for lunch - 0.9947
Amaia and Jokin are friends - 0.9214
I went for lunch yesterday - 0.8847
I invited my mom to a trip - 0.9068
Jokin enjoys having lunch - 0.9354
I saw some ducks at the park - 0.8632
Score: 4

Example: Mikel went to the mountain
Up the mountain went Mikel - 0.9144
Mikel was in the mountain - 0.9617
Mikel made the plan to go to the mountain - 0.9644
Mikel made the comment that maybe he'd go to the mountain - 0.9407
Mikel didn't go to the mountain - 0.9594
Mikel likes hiking - 0.8797
The mountain looks beautiful today - 0.8884
The bird went to the park - 0.9375
There are very good films in the cinema now - 0.8628
Score: 3

Example: Yesterday you came to our house for dinner
You cam

In [14]:
fig = plotly.tools.make_subplots(rows=1, cols=10)

for i in range(len(gold_sentences)):
    print(list(gold_sentences.keys())[i] + '\n')
    print("\n".join(gold_sentences[list(gold_sentences.keys())[i]]))
    print("Scores: Flair: " + str(scores_all[i][0]) + ", BERT uncased: " + str(scores_all[i][1]) + ", BERT cased: " + str(scores_all[i][2]))
    
    trace = go.Heatmap(z=similarities_all[i], y=['Flair', 'BERT uncased', 'BERT cased'])
    data=[trace]
    fig.append_trace(trace, 1, i+1)
    iplot(data, filename='basic-heatmap' + str(i))

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y2 ]     [ (1,3) x3,y3 ]     [ (1,4) x4,y4 ]     [ (1,5) x5,y5 ]     [ (1,6) x6,y6 ]     [ (1,7) x7,y7 ]     [ (1,8) x8,y8 ]     [ (1,9) x9,y9 ]     [ (1,10) x10,y10 ]

Jokin invited Amaia for lunch

Jokin offered an invitation to Amaia for lunch
Amaia received the question from Jokin, if she would like to go for lunch
Jokin offered an invitation to Amaia for a meal
Jokin and Amaia met for lunch
Amaia invited Jokin for lunch
Amaia and Jokin are friends
I went for lunch yesterday
I invited my mom to a trip
Jokin enjoys having lunch
I saw some ducks at the park
Scores: Flair: 5, BERT uncased: 5, BERT cased: 4


Mikel went to the mountain

Up the mountain went Mikel
Mikel was in the mountain
Mikel made the plan to go to the mountain
Mikel made the comment that maybe he'd go to the mountain
Mikel didn't go to the mountain
Mikel likes hiking
The mountain looks beautiful today
The bird went to the park
There are very good films in the cinema now
Scores: Flair: 3, BERT uncased: 4, BERT cased: 3


Yesterday you came to our house for dinner

You came for dinner to our house yesterday
You were in our house having dinner yesterday
We all had dinner together yesterday
Yesterday you came for dinner
Yesterday they will come to your house for dinner
The dinner yesterday was very tasty
You came to our town yesterday
We renovated our house
I don't like this weather
Scores: Flair: 5, BERT uncased: 6, BERT cased: 6


Happines isn't doing what you like, but liking what you do

Liking what you do makes you happy, not doing what you want
The source of happines is liking what you do
What is done gladly is happiness
What you do, not what you like, is happiness
Happiness isn't liking what you do, but doing what you like
Happines is shown in actions
Doing what you like is very pretty
Not everything you do can be good
I turned on the computer today
Scores: Flair: 5, BERT uncased: 5, BERT cased: 5


Because I love, I create

I create because I love
I love: therefore, I create
The origin of my creating is love
Because I create I love
Take care of what you love
I love you
What I created has become very famous
I tell my brother I love him every day
My city is very pretty
Scores: Flair: 5, BERT uncased: 5, BERT cased: 4


In [15]:
total_scores = [0] * len(scores_all)
for i in range(len(scores_all)):
    total_scores[0] += scores_all[i][0]
    total_scores[1] += scores_all[i][1]
    total_scores[2] += scores_all[i][2]
print("Total scores: Flair: " + str(total_scores[0]) + ", BERT uncased: " + str(total_scores[1]) + ", BERT cased: " + str(total_scores[2]))


Total scores: Flair: 23, BERT uncased: 25, BERT cased: 22
