In [None]:
ptbr 0.626~0.71~0.740
ptpt 0.706~0.73~0.784

Outras similaridades: Cos, Ovl, Jacc, Dice

Semantic Similarity

These models find semantically similar sentences within one language or across languages:

**distiluse-base-multilingual-cased-v2**: Multilingual knowledge distilled version of multilingual Universal Sentence Encoder. While the original mUSE model only supports 16 languages, this multilingual knowledge distilled version supports 50+ languages.

**xlm-r-distilroberta-base-paraphrase-v1** - Multilingual version of distilroberta-base-paraphrase-v1, trained on parallel data for 50+ languages.

**xlm-r-bert-base-nli-stsb-mean-tokens**: Produces embeddings similar to those of the bert-base-nli-stsb-mean-tokens model. Trained on parallel data for 50+ languages.

**distilbert-multilingual-nli-stsb-quora-ranking** - Multilingual version of distilbert-base-nli-stsb-quora-ranking. Fine-tuned with parallel data for 50+ languages.

**T-Systems-onsite/cross-en-de-roberta-sentence-transformer** - Multilingual model for English and German. [More]

# Imports e métodos necessários

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
import pandas as pd 
import xml.etree.ElementTree as et 

from scipy.stats import pearsonr

def parse_xml(xml_file):
    """Parse an ASSIN-style XML file into a pandas DataFrame.

    Every child element of the document root becomes one row. The ``id`` and
    ``similarity`` values are read from the element's attributes, and the
    sentence pair from the text of its ``<t>`` and ``<h>`` children.

    Parameters
    ----------
    xml_file : str or file-like
        Path (or open file) accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``['id', 't', 'h', 'similarity']``. ``id`` and ``similarity``
        are float64 (missing attributes become NaN); ``t`` and ``h`` hold the
        sentence text.

    Raises
    ------
    AttributeError
        If a child of the root has no ``<t>`` or ``<h>`` element.
    ValueError
        If an ``id`` or ``similarity`` attribute is not numeric.
    """
    xroot = et.parse(xml_file).getroot()

    df_cols = ['id', 't', 'h', 'similarity']
    rows = [
        {
            "id": node.attrib.get("id"),
            "t": node.find("t").text,
            "h": node.find("h").text,
            "similarity": node.attrib.get("similarity"),
        }
        for node in xroot
    ]

    df = pd.DataFrame(rows, columns=df_cols)
    # Cast only the numeric columns. Passing dtype=float to the constructor
    # also targets the text columns, which older pandas ignored silently and
    # newer pandas rejects with an error.
    for col in ('id', 'similarity'):
        df[col] = pd.to_numeric(df[col]).astype(float)
    return df

def eval_similarity(pairs_gold, pairs_sys):
    """
    Compare system similarity scores against gold scores and print a report.

    Both arguments are equal-length sequences of numbers. The Pearson
    correlation and the mean squared error between them are written to
    stdout as a small two-column table. Nothing is returned.
    """
    gold = np.array(pairs_gold)
    predicted = np.array(pairs_sys)

    pearson, _ = pearsonr(gold, predicted)  # second item is the p-value, unused
    residuals = gold - predicted
    mse = np.mean(np.square(residuals))

    report = [
        '',  # leading blank line
        'Similarity evaluation',
        'Pearson\t\tMean Squared Error',
        '-------\t\t------------------',
        f'{pearson:7.3f}\t\t{mse:18.2f}',
    ]
    print('\n'.join(report))

# Carregando os dados

In [3]:
!ls ../data/assin

assin-ptbr-dev.xml   assin-ptbr-train.xml  assin-ptpt-test.xml
assin-ptbr-test.xml  assin-ptpt-dev.xml    assin-ptpt-train.xml


In [4]:
# Parse the train / dev / test splits of each Portuguese variant, in that order.
df_ptbr_train, df_ptbr_dev, df_ptbr_test = map(parse_xml, (
    '../data/assin/assin-ptbr-train.xml',
    '../data/assin/assin-ptbr-dev.xml',
    '../data/assin/assin-ptbr-test.xml',
))

df_ptpt_train, df_ptpt_dev, df_ptpt_test = map(parse_xml, (
    '../data/assin/assin-ptpt-train.xml',
    '../data/assin/assin-ptpt-dev.xml',
    '../data/assin/assin-ptpt-test.xml',
))

In [5]:
# Report (rows, columns) for every split; the empty string yields the blank separator line.
print(
    f'assin-ptbr-train: {df_ptbr_train.shape}',
    f'assin-ptbr-dev: {df_ptbr_dev.shape}',
    f'assin-ptbr-test: {df_ptbr_test.shape}',
    '',
    f'assin-ptpt-train: {df_ptpt_train.shape}',
    f'assin-ptpt-dev: {df_ptpt_dev.shape}',
    f'assin-ptpt-test: {df_ptpt_test.shape}',
    sep='\n',
)

assin-ptbr-train: (2500, 4)
assin-ptbr-dev: (500, 4)
assin-ptbr-test: (2000, 4)

assin-ptpt-train: (2500, 4)
assin-ptpt-dev: (500, 4)
assin-ptpt-test: (2000, 4)


In [6]:
# Preview the first rows of the pt-br training split (columns: id, t, h, similarity).
df_ptbr_train.head()

Unnamed: 0,id,t,h,similarity
0,1.0,"A gente faz o aporte financeiro, é como se a e...",Fernando Moraes afirma que não tem vínculo com...,2.0
1,2.0,"Em 2013, a história de como Walt Disney conven...",P.L.Travers era completamente contra a adaptaç...,2.25
2,3.0,"David Silva bateu escanteio, Kompany escalou a...","David Silva cobrou escanteio, o zagueiro se ap...",3.75
3,4.0,"Para os ambientalistas, as metas anunciadas pe...","Dilma aproveitou seu discurso ontem, na Confer...",2.75
4,5.0,"De acordo com a PM, por volta das 10h30 havia ...",O protesto encerrou por volta de 12h15 (horári...,2.0


# Testes pt-br

## Sentence-BERT

### distiluse-base-multilingual-cased-v2

In [70]:
# Load the pre-trained 'distiluse-base-multilingual-cased-v2' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [71]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [72]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [73]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [74]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.704		              0.50


In [75]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.682


### xlm-r-distilroberta-base-paraphrase-v1

In [8]:
# Load the pre-trained 'xlm-r-distilroberta-base-paraphrase-v1' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

100%|██████████| 1.01G/1.01G [03:34<00:00, 4.73MB/s]  


In [9]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [10]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [11]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [12]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.713		              0.42


In [13]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.700


### xlm-r-bert-base-nli-stsb-mean-tokens

In [20]:
# Load the pre-trained 'xlm-r-bert-base-nli-stsb-mean-tokens' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

100%|██████████| 1.01G/1.01G [02:42<00:00, 6.23MB/s]  


In [21]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [22]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [23]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [24]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.668		              0.65


In [25]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.651


### distilbert-multilingual-nli-stsb-quora-ranking

In [28]:
# Load the pre-trained 'distilbert-multilingual-nli-stsb-quora-ranking' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

In [29]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [30]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [31]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [32]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.590		              3.03


In [33]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.607


### T-Systems-onsite/cross-en-de-roberta-sentence-transformer

In [64]:
# Load the pre-trained 'T-Systems-onsite/cross-en-de-roberta-sentence-transformer' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
# NOTE(review): the recorded output shows a 404 from sbert.net, yet later cells produce scores,
# so the model was presumably fetched from a fallback source — confirm.
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

Exception when trying to download https://sbert.net/models/T-Systems-onsite/cross-en-de-roberta-sentence-transformer.zip. Response 404


In [65]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [66]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [67]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [68]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.734		              0.59


In [69]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.720


## Fine-tuning Sentence-BERT

https://www.sbert.net/docs/training/overview.html

https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py

### distiluse-base-multilingual-cased-v2

In [7]:
"""
Fine-tune the pre-trained SentenceTransformer 'distiluse-base-multilingual-cased-v2'
on the ASSIN pt-br sentence pairs.

The cell builds InputExample pairs from the train/dev/test DataFrames, trains with
CosineSimilarityLoss while an EmbeddingSimilarityEvaluator is attached for the dev
pairs, passes `model_save_path` to fit() as the output path, then reloads the model
from that path and runs the evaluator on the test pairs.
Adapted from the sentence-transformers example
training_stsbenchmark_continue_training.py.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
from sentence_transformers import models, SentenceTransformer



# Training hyperparameters and output directory.
# The timestamp suffix is commented out, so every run uses the same path.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_finetuning_assin_distiluse-base-multilingual-cased-v2'# + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load the pre-trained sentence transformer model by name
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

train_samples = []
dev_samples = []
test_samples = []

# One InputExample per sentence pair. The gold similarity is divided by 5 for the label
# (presumably to map a 1-5 score into the range the cosine loss expects — confirm).
for i, row in df_ptbr_train.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    train_samples.append(inp_example)

for i, row in df_ptbr_dev.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    dev_samples.append(inp_example)

for i, row in df_ptbr_test.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    test_samples.append(inp_example)



train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Dev-split evaluator handed to fit() below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm-up length: 10% of (training samples * epochs / batch size), rounded up
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 exceeds the 157 iterations per epoch shown in this
# cell's progress output, so mid-epoch evaluation presumably never triggers — confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the ASSIN pt-br test pairs
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

  return torch._C._cuda_getDeviceCount() > 0


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.732622450729509

In [8]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [9]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [10]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [11]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.752		              0.34


In [12]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.733


### xlm-r-distilroberta-base-paraphrase-v1

In [40]:
"""
Fine-tune the pre-trained SentenceTransformer 'xlm-r-distilroberta-base-paraphrase-v1'
on the ASSIN pt-br sentence pairs.

The cell builds InputExample pairs from the train/dev/test DataFrames, trains with
CosineSimilarityLoss while an EmbeddingSimilarityEvaluator is attached for the dev
pairs, passes `model_save_path` to fit() as the output path, then reloads the model
from that path and runs the evaluator on the test pairs.
Adapted from the sentence-transformers example
training_stsbenchmark_continue_training.py.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
from sentence_transformers import models, SentenceTransformer



# Training hyperparameters and output directory.
# The timestamp suffix is commented out, so every run uses the same path.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_finetuning_assin_xlm-r-distilroberta-base-paraphrase-v1'# + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load the pre-trained sentence transformer model by name
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

train_samples = []
dev_samples = []
test_samples = []

# One InputExample per sentence pair. The gold similarity is divided by 5 for the label
# (presumably to map a 1-5 score into the range the cosine loss expects — confirm).
for i, row in df_ptbr_train.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    train_samples.append(inp_example)

for i, row in df_ptbr_dev.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    dev_samples.append(inp_example)

for i, row in df_ptbr_test.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    test_samples.append(inp_example)



train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Dev-split evaluator handed to fit() below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm-up length: 10% of (training samples * epochs / batch size), rounded up
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 exceeds the 157 iterations per epoch shown in this
# cell's progress output, so mid-epoch evaluation presumably never triggers — confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the ASSIN pt-br test pairs
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.772252063839633

In [41]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [42]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [43]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [44]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.786		              0.30


In [45]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.772


### xlm-r-bert-base-nli-stsb-mean-tokens

In [13]:
"""
Fine-tune the pre-trained SentenceTransformer 'xlm-r-bert-base-nli-stsb-mean-tokens'
on the ASSIN pt-br sentence pairs.

The cell builds InputExample pairs from the train/dev/test DataFrames, trains with
CosineSimilarityLoss while an EmbeddingSimilarityEvaluator is attached for the dev
pairs, passes `model_save_path` to fit() as the output path, then reloads the model
from that path and runs the evaluator on the test pairs.
Adapted from the sentence-transformers example
training_stsbenchmark_continue_training.py.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
from sentence_transformers import models, SentenceTransformer



# Training hyperparameters and output directory.
# The timestamp suffix is commented out, so every run uses the same path.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_finetuning_assin_xlm-r-bert-base-nli-stsb-mean-tokens'# + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load the pre-trained sentence transformer model by name
model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

train_samples = []
dev_samples = []
test_samples = []

# One InputExample per sentence pair. The gold similarity is divided by 5 for the label
# (presumably to map a 1-5 score into the range the cosine loss expects — confirm).
for i, row in df_ptbr_train.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    train_samples.append(inp_example)

for i, row in df_ptbr_dev.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    dev_samples.append(inp_example)

for i, row in df_ptbr_test.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    test_samples.append(inp_example)



train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Dev-split evaluator handed to fit() below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm-up length: 10% of (training samples * epochs / batch size), rounded up
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 exceeds the 157 iterations per epoch shown in this
# cell's progress output, so mid-epoch evaluation presumably never triggers — confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the ASSIN pt-br test pairs
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7369298012434606

In [14]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [15]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [16]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [17]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.754		              0.35


In [18]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.737


### distilbert-multilingual-nli-stsb-quora-ranking

In [19]:
"""
Fine-tune the pre-trained SentenceTransformer
'distilbert-multilingual-nli-stsb-quora-ranking' on the ASSIN pt-br sentence pairs.

The cell builds InputExample pairs from the train/dev/test DataFrames, trains with
CosineSimilarityLoss while an EmbeddingSimilarityEvaluator is attached for the dev
pairs, passes `model_save_path` to fit() as the output path, then reloads the model
from that path and runs the evaluator on the test pairs.
Adapted from the sentence-transformers example
training_stsbenchmark_continue_training.py.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
from sentence_transformers import models, SentenceTransformer



# Training hyperparameters and output directory.
# The timestamp suffix is commented out, so every run uses the same path.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_finetuning_assin_distilbert-multilingual-nli-stsb-quora-ranking'# + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load the pre-trained sentence transformer model by name
model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

train_samples = []
dev_samples = []
test_samples = []

# One InputExample per sentence pair. The gold similarity is divided by 5 for the label
# (presumably to map a 1-5 score into the range the cosine loss expects — confirm).
for i, row in df_ptbr_train.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    train_samples.append(inp_example)

for i, row in df_ptbr_dev.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    dev_samples.append(inp_example)

for i, row in df_ptbr_test.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    test_samples.append(inp_example)



train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Dev-split evaluator handed to fit() below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm-up length: 10% of (training samples * epochs / batch size), rounded up
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 exceeds the 157 iterations per epoch shown in this
# cell's progress output, so mid-epoch evaluation presumably never triggers — confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the ASSIN pt-br test pairs
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7222656932270834

In [20]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [21]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [22]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [23]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.734		              0.39


In [24]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.722


### T-Systems-onsite/cross-en-de-roberta-sentence-transformer

In [25]:
"""
Fine-tune the pre-trained SentenceTransformer
'T-Systems-onsite/cross-en-de-roberta-sentence-transformer' on the ASSIN pt-br
sentence pairs.

The cell builds InputExample pairs from the train/dev/test DataFrames, trains with
CosineSimilarityLoss while an EmbeddingSimilarityEvaluator is attached for the dev
pairs, passes `model_save_path` to fit() as the output path, then reloads the model
from that path and runs the evaluator on the test pairs.
Adapted from the sentence-transformers example
training_stsbenchmark_continue_training.py.
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv
from sentence_transformers import models, SentenceTransformer



# Training hyperparameters and output directory.
# The timestamp suffix is commented out, so every run uses the same path.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_finetuning_assin_T-Systems-onsite_cross-en-de-roberta-sentence-transformer'# + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# Load the pre-trained sentence transformer model by name
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

train_samples = []
dev_samples = []
test_samples = []

# One InputExample per sentence pair. The gold similarity is divided by 5 for the label
# (presumably to map a 1-5 score into the range the cosine loss expects — confirm).
for i, row in df_ptbr_train.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    train_samples.append(inp_example)

for i, row in df_ptbr_dev.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    dev_samples.append(inp_example)

for i, row in df_ptbr_test.iterrows():
    inp_example = InputExample(texts=[row['t'], row['h']], label=row['similarity'] / 5)
    test_samples.append(inp_example)



train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Dev-split evaluator handed to fit() below
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm-up length: 10% of (training samples * epochs / batch size), rounded up
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 exceeds the 157 iterations per epoch shown in this
# cell's progress output, so mid-epoch evaluation presumably never triggers — confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the ASSIN pt-br test pairs
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

Exception when trying to download https://sbert.net/models/T-Systems-onsite/cross-en-de-roberta-sentence-transformer.zip. Response 404


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.766929781605059

In [26]:
# Embed both sentence columns ('t' and 'h') of the pt-br test split with the current `model`.
t_embeddings = model.encode(df_ptbr_test['t'].tolist())
h_embeddings = model.encode(df_ptbr_test['h'].tolist())

In [27]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [28]:
# Gold scores come from the pt-br test split; system scores are the `similarities` list.
pairs_gold = df_ptbr_test['similarity'].tolist()
pairs_sys = similarities

In [29]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.782		              0.31


In [30]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.767


# Testes pt-pt

## Sentence-BERT

### distiluse-base-multilingual-cased-v2

In [76]:
# Load the pre-trained 'distiluse-base-multilingual-cased-v2' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [77]:
# Embed both sentence columns ('t' and 'h') of the pt-pt test split with the current `model`.
t_embeddings = model.encode(df_ptpt_test['t'].tolist())
h_embeddings = model.encode(df_ptpt_test['h'].tolist())

In [78]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [79]:
# Gold scores come from the pt-pt test split; system scores are the `similarities` list.
pairs_gold = df_ptpt_test['similarity'].tolist()
pairs_sys = similarities

In [80]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.724		              0.58


In [81]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.716


### xlm-r-distilroberta-base-paraphrase-v1

In [14]:
# Load the pre-trained 'xlm-r-distilroberta-base-paraphrase-v1' encoder; this rebinds the notebook-wide `model`.
# (`models` is imported but not used in this cell.)
from sentence_transformers import models, SentenceTransformer

model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [15]:
# Embed both sentence columns ('t' and 'h') of the pt-pt test split with the current `model`.
t_embeddings = model.encode(df_ptpt_test['t'].tolist())
h_embeddings = model.encode(df_ptpt_test['h'].tolist())

In [16]:
from sklearn.metrics.pairwise import  cosine_similarity

# One score per (t, h) pair: cosine similarity of the two embeddings times 5.0
# (presumably to put it on the scale of the gold scores — confirm).
similarities = [5.0 * cosine_similarity([t], [h])[0][0] for t, h in zip(t_embeddings, h_embeddings)]

In [17]:
# Gold scores come from the pt-pt test split; system scores are the `similarities` list.
pairs_gold = df_ptpt_test['similarity'].tolist()
pairs_sys = similarities

In [18]:
# Print Pearson correlation and mean squared error between gold and system scores.
eval_similarity(pairs_gold, pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.738		              0.65


In [19]:
from scipy.stats import spearmanr
# Rank correlation alongside Pearson; [0] keeps the coefficient and drops the p-value.
print('Spearman correlation: {:7.3f}'.format(spearmanr(pairs_gold, pairs_sys)[0]))

Spearman correlation:   0.735


### xlm-r-bert-base-nli-stsb-mean-tokens

In [46]:
from sentence_transformers import SentenceTransformer, models

# Pre-trained checkpoint, loaded as-is for this section.
model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

In [47]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [48]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [49]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [50]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.707		              0.68


In [51]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.708


### distilbert-multilingual-nli-stsb-quora-ranking

In [52]:
from sentence_transformers import SentenceTransformer, models

# Pre-trained checkpoint, loaded as-is for this section.
model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

In [53]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [55]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [56]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.582		              4.05


In [57]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.619


### T-Systems-onsite/cross-en-de-roberta-sentence-transformer

In [58]:
from sentence_transformers import SentenceTransformer, models

# Pre-trained checkpoint, loaded as-is for this section.
# NOTE(review): the recorded output shows a 404 from sbert.net followed by
# download progress bars, presumably a fallback to another host -- confirm.
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

Exception when trying to download https://sbert.net/models/T-Systems-onsite/cross-en-de-roberta-sentence-transformer.zip. Response 404


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=541.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1112261175.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=150.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=188.0, style=ProgressStyle(description_…




In [59]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [60]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [61]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [62]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.766		              0.48


In [63]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.760


## Fine-tuning Sentence-BERT

https://www.sbert.net/docs/training/overview.html

https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py

### distiluse-base-multilingual-cased-v2

In [31]:
"""
This cell loads the pre-trained SentenceTransformer model 'distiluse-base-multilingual-cased-v2' from the server.
It then fine-tunes this model for some epochs on the ASSIN pt-PT training split (df_ptpt_train).
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv


# Training hyper-parameters and the directory where the fine-tuned model is stored.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_ptpt_finetuning_assin_distiluse-base-multilingual-cased-v2'  # + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Load the pre-trained sentence transformer model that will be fine-tuned.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

# One InputExample per (t, h) pair for each split. The gold similarity is
# divided by 5 to form the label used by the loss and the evaluators.
train_samples, dev_samples, test_samples = (
    [
        InputExample(texts=[t, h], label=similarity / 5)
        for t, h, similarity in zip(split['t'], split['h'], split['similarity'])
    ]
    for split in (df_ptpt_train, df_ptpt_dev, df_ptpt_test)
)


train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluator built from the dev split; it is handed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 is larger than the 157 iterations per
# epoch shown in the recorded progress output, so mid-epoch evaluation
# presumably never triggers and the evaluator only runs at epoch end -- confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the df_ptpt_test split
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7430155184373304

In [32]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [34]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [35]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.757		              0.60


In [36]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.743


### xlm-r-distilroberta-base-paraphrase-v1

In [37]:
"""
This cell loads the pre-trained SentenceTransformer model 'xlm-r-distilroberta-base-paraphrase-v1' from the server.
It then fine-tunes this model for some epochs on the ASSIN pt-PT training split (df_ptpt_train).
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv


# Training hyper-parameters and the directory where the fine-tuned model is stored.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_ptpt_finetuning_assin_xlm-r-distilroberta-base-paraphrase-v1'  # + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Load the pre-trained sentence transformer model that will be fine-tuned.
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

# One InputExample per (t, h) pair for each split. The gold similarity is
# divided by 5 to form the label used by the loss and the evaluators.
train_samples, dev_samples, test_samples = (
    [
        InputExample(texts=[t, h], label=similarity / 5)
        for t, h, similarity in zip(split['t'], split['h'], split['similarity'])
    ]
    for split in (df_ptpt_train, df_ptpt_dev, df_ptpt_test)
)


train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluator built from the dev split; it is handed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 is larger than the 157 iterations per
# epoch shown in the recorded progress output, so mid-epoch evaluation
# presumably never triggers and the evaluator only runs at epoch end -- confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the df_ptpt_test split
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7881276217505235

In [38]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [40]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [41]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.799		              0.52


In [42]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.788


### xlm-r-bert-base-nli-stsb-mean-tokens

In [43]:
"""
This cell loads the pre-trained SentenceTransformer model 'xlm-r-bert-base-nli-stsb-mean-tokens' from the server.
It then fine-tunes this model for some epochs on the ASSIN pt-PT training split (df_ptpt_train).
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv


# Training hyper-parameters and the directory where the fine-tuned model is stored.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_ptpt_finetuning_assin_xlm-r-bert-base-nli-stsb-mean-tokens'  # + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Load the pre-trained sentence transformer model that will be fine-tuned.
model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

# One InputExample per (t, h) pair for each split. The gold similarity is
# divided by 5 to form the label used by the loss and the evaluators.
train_samples, dev_samples, test_samples = (
    [
        InputExample(texts=[t, h], label=similarity / 5)
        for t, h, similarity in zip(split['t'], split['h'], split['similarity'])
    ]
    for split in (df_ptpt_train, df_ptpt_dev, df_ptpt_test)
)


train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluator built from the dev split; it is handed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 is larger than the 157 iterations per
# epoch shown in the recorded progress output, so mid-epoch evaluation
# presumably never triggers and the evaluator only runs at epoch end -- confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the df_ptpt_test split
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7722189427688181

In [44]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [46]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [47]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.779		              0.54


In [48]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.772


### distilbert-multilingual-nli-stsb-quora-ranking

In [49]:
"""
This cell loads the pre-trained SentenceTransformer model 'distilbert-multilingual-nli-stsb-quora-ranking' from the server.
It then fine-tunes this model for some epochs on the ASSIN pt-PT training split (df_ptpt_train).
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv


# Training hyper-parameters and the directory where the fine-tuned model is stored.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_ptpt_finetuning_assin_distilbert-multilingual-nli-stsb-quora-ranking'  # + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Load the pre-trained sentence transformer model that will be fine-tuned.
model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

# One InputExample per (t, h) pair for each split. The gold similarity is
# divided by 5 to form the label used by the loss and the evaluators.
train_samples, dev_samples, test_samples = (
    [
        InputExample(texts=[t, h], label=similarity / 5)
        for t, h, similarity in zip(split['t'], split['h'], split['similarity'])
    ]
    for split in (df_ptpt_train, df_ptpt_dev, df_ptpt_test)
)


train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluator built from the dev split; it is handed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 is larger than the 157 iterations per
# epoch shown in the recorded progress output, so mid-epoch evaluation
# presumably never triggers and the evaluator only runs at epoch end -- confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the df_ptpt_test split
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7293983872974558

In [50]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [52]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [53]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.738		              0.67


In [54]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.729


### T-Systems-onsite/cross-en-de-roberta-sentence-transformer

In [55]:
"""
This cell loads the pre-trained SentenceTransformer model 'T-Systems-onsite/cross-en-de-roberta-sentence-transformer' from the server.
It then fine-tunes this model for some epochs on the ASSIN pt-PT training split (df_ptpt_train).
Note: In this example, you must specify a SentenceTransformer model.
If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py
"""
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import csv


# Training hyper-parameters and the directory where the fine-tuned model is stored.
train_batch_size = 16
num_epochs = 4
model_save_path = '/mnt/data/sbert_ptpt_finetuning_assin_T-Systems-onsite_cross-en-de-roberta-sentence-transformer'  # + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


# Load the pre-trained sentence transformer model that will be fine-tuned.
model = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')

# One InputExample per (t, h) pair for each split. The gold similarity is
# divided by 5 to form the label used by the loss and the evaluators.
train_samples, dev_samples, test_samples = (
    [
        InputExample(texts=[t, h], label=similarity / 5)
        for t, h, similarity in zip(split['t'], split['h'], split['similarity'])
    ]
    for split in (df_ptpt_train, df_ptpt_dev, df_ptpt_test)
)


train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


# Evaluator built from the dev split; it is handed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')


# Warm up over 10% of the total number of training steps.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)


# Train the model.
# NOTE(review): evaluation_steps=1000 is larger than the 157 iterations per
# epoch shown in the recorded progress output, so mid-epoch evaluation
# presumably never triggers and the evaluator only runs at epoch end -- confirm.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


##############################################################################
#
# Reload the stored model and evaluate it on the df_ptpt_test split
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

Exception when trying to download https://sbert.net/models/T-Systems-onsite/cross-en-de-roberta-sentence-transformer.zip. Response 404


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=157.0, style=ProgressStyle(description_wi…





0.7867330968712181

In [56]:
# Embed the 't' and 'h' text columns of the pt-PT test frame with the current model.
t_embeddings, h_embeddings = [
    model.encode(df_ptpt_test[column].tolist()) for column in ('t', 'h')
]

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine between matching t/h embeddings, multiplied by 5.0 before scoring.
similarities = [
    5.0 * cosine_similarity([t_vec], [h_vec])[0][0]
    for t_vec, h_vec in zip(t_embeddings, h_embeddings)
]

In [58]:
# Gold similarity column next to the system scores held in `similarities`.
pairs_gold, pairs_sys = df_ptpt_test['similarity'].tolist(), similarities

In [59]:
# Report Pearson and mean squared error of the system scores against gold.
eval_similarity(pairs_gold=pairs_gold, pairs_sys=pairs_sys)


Similarity evaluation
Pearson		Mean Squared Error
-------		------------------
  0.793		              0.55


In [60]:
from scipy.stats import spearmanr

# First element of the spearmanr result is the rank correlation.
spearman_rho = spearmanr(pairs_gold, pairs_sys)[0]
print('Spearman correlation: {:7.3f}'.format(spearman_rho))

Spearman correlation:   0.787
