In [1]:
import os
import numpy as np
import pandas as pd
import math
import csv

import transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import STSDataReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers.InputExample import InputExample

from torch.utils.data import DataLoader

from scipy.spatial.distance import cdist


In [20]:
# To increase the swap file size (Fedora), see:
# https://superuser.com/questions/1024064/change-swap-file-size-fedora-23

In [2]:
# Load the SentenceTransformer model named 'bert-base-wikipedia-sections-mean-tokens'.
model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

In [3]:
# Load the SentenceTransformer model named 'roberta-large-nli-stsb-mean-tokens';
# this is the model that is fine-tuned further below.
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [4]:
# Display the maximum sequence length configured for model_1 (recorded output: 128).
model_1.get_max_seq_length()

128

In [5]:
# Example sentences: the first one is the query that gets compared
# against the remaining ones in the cells below.
sentences = [
    "A fox lives in a zoo together with dogs.",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]


In [6]:
# Embed the example sentences with model_wiki.
embeddings = model_wiki.encode(sentences)

# Compare the first sentence (as a single-row matrix) against all the others.
# NOTE: scipy's "cosine" metric is the cosine *distance* (1 - cosine similarity),
# so despite the variable name, smaller values mean "more similar".
sims = cdist(
    embeddings[0].reshape(1, -1),
    embeddings[1:],
    metric="cosine",
)[0]
sims

array([0.01991861, 0.01082202])

In [7]:
# Embed the example sentences with model_1 (before fine-tuning).
embeddings = model_1.encode(sentences)

# Compare the first sentence (as a single-row matrix) against all the others.
# NOTE: scipy's "cosine" metric is the cosine *distance* (1 - cosine similarity),
# so despite the variable name, smaller values mean "more similar".
sims = cdist(
    embeddings[0].reshape(1, -1),
    embeddings[1:],
    metric="cosine",
)[0]
sims

array([0.95112492, 0.58440415])

In [8]:
############

In [9]:
class MyDataReader(STSDataReader):
    """
    STSDataReader variant that can skip a header row.

    ``get_examples`` is reimplemented because our csv file has a header.
    All constructor arguments are forwarded unchanged to ``STSDataReader``.
    """

    def __init__(self, dataset_folder, **kwargs):
        super().__init__(dataset_folder, **kwargs)

    def get_examples(self, filename, header=True, max_examples=0):
        """
        Read sentence pairs and their scores from one CSV file.

        Args:
            filename: name of the file inside ``self.dataset_folder``; it selects
                which data split to use (train.csv, dev.csv, test.csv).
            header: if True, the first row of the file is skipped.
            max_examples: stop after this many examples; values <= 0 mean no limit.

        Returns:
            A list of ``InputExample`` objects whose ``guid`` is the filename
            followed by the row index, whose ``texts`` are the two sentences and
            whose ``label`` is the (optionally normalized) score.

        Raises:
            ValueError: if a score cell cannot be converted with ``float``.
        """
        path = os.path.join(self.dataset_folder, filename)
        examples = []

        # The context manager closes the file even if a row fails to parse;
        # newline="" is what the csv module expects for files passed to csv.reader.
        with open(path, encoding="utf-8", newline="") as csv_file:
            rows = csv.reader(csv_file, delimiter=self.delimiter, quoting=self.quoting)
            if header:
                next(rows, None)  # skip the header (no error on an empty file)

            for row_idx, row in enumerate(rows):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score - self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(
                    InputExample(guid=filename + str(row_idx), texts=[s1, s2], label=score)
                )

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples


In [10]:
# Reader for the comma-separated files in the msmarco data folder:
# sentence 1 is read from column 1, sentence 2 from column 4 and the score
# from column 2; scores are used as-is (no normalization).
myreader = MyDataReader(
    dataset_folder='/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/',
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    s1_col_idx=1,
    s2_col_idx=4,
    score_col_idx=2,
    normalize_scores=False,
    min_score=0,
    max_score=1,
)

# Smoke test: parse only the first two rows of the file.
myreader.get_examples('queries_od.csv', max_examples=2)

[<sentence_transformers.readers.InputExample.InputExample at 0x7f382bd049e8>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f382bd04a58>]

In [11]:
# Build the training dataset for model_1 from the first 100 examples
# of queries_od.csv.
my_data = SentencesDataset(
    show_progress_bar=True,
    model=model_1,
    examples=myreader.get_examples("queries_od.csv", max_examples=100),
)


Convert dataset: 100%|██████████| 100/100 [00:00<00:00, 462.88it/s]


In [None]:
# SentenceLabelDataset is not brought into scope by the notebook's import cell,
# so it is imported here; without this the cell fails with a NameError.
from sentence_transformers.datasets import SentenceLabelDataset

# Label-grouped variant of the dataset, built for model_wiki from the first
# 100 examples of queries_od.csv.
# NOTE(review): this cell has no recorded execution; confirm that
# SentenceLabelDataset accepts the float scores produced by MyDataReader.
my_data_l = SentenceLabelDataset(examples=myreader.get_examples("queries_od.csv", max_examples=100),
                                 model=model_wiki,
                                 show_progress_bar=True)

In [12]:
# Training schedule.
num_epochs = 1
train_batch_size = 8

# Loss used to fine-tune model_1.
train_loss = losses.CosineSimilarityLoss(model=model_1)

# Optimizer class and the keyword arguments handed to it by `fit`.
optimizer_class = transformers.AdamW
optimizer_params = {
    'lr': 2e-5,
    'eps': 1e-6,
    'correct_bias': False,
}

# 10% of train data for warm-up, expressed as a number of steps.
warmup_steps = math.ceil(len(my_data) * num_epochs / train_batch_size * 0.1)

In [13]:
# Batches are drawn in dataset order (shuffle=False), train_batch_size examples at a time.
my_dataloader = DataLoader(
    dataset=my_data,
    batch_size=train_batch_size,
    shuffle=False,
)

In [14]:
# Directory under which fine-tuned models are stored.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
MODEL_DIR = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/models'

In [15]:
# Output folder for this training run, inside MODEL_DIR.
my_output_path = os.path.join(MODEL_DIR, 'test_model_3')

In [16]:
# Fine-tune model_1 on the custom data with the loss, schedule and optimizer
# settings defined above. No evaluator is used.
model_1.fit(
    train_objectives=[(my_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    optimizer_class=optimizer_class,
    optimizer_params=optimizer_params,
    evaluator=None,
    # output_path works only when you have an evaluator, so the model is
    # saved explicitly in a later cell.
    output_path=my_output_path,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/13 [00:00<?, ?it/s][A
Iteration:   8%|▊         | 1/13 [00:13<02:39, 13.28s/it][A
Iteration:  15%|█▌        | 2/13 [00:56<02:42, 14.77s/it][A
Iteration:  23%|██▎       | 3/13 [01:39<02:41, 16.17s/it][A
Iteration:  31%|███       | 4/13 [01:59<02:27, 16.37s/it][A
Iteration:  38%|███▊      | 5/13 [02:20<02:12, 16.59s/it][A
Iteration:  46%|████▌     | 6/13 [02:39<01:57, 16.73s/it][A
Iteration:  54%|█████▍    | 7/13 [02:55<01:40, 16.68s/it][A
Iteration:  62%|██████▏   | 8/13 [03:14<01:24, 16.84s/it][A
Iteration:  69%|██████▉   | 9/13 [03:35<01:08, 17.00s/it][A
Iteration:  77%|███████▋  | 10/13 [03:55<00:51, 17.16s/it][A
Iteration:  85%|████████▍ | 11/13 [04:11<00:34, 17.11s/it][A
Iteration:  92%|█████████▏| 12/13 [04:31<00:17, 17.25s/it][A
Iteration: 100%|██████████| 13/13 [04:43<00:00, 21.81s/it][A
Epoch: 100%|██████████| 1/1 [04:43<00:00, 283.55s/it]


In [17]:
# Save the fine-tuned model explicitly to my_output_path.
model_1.save(my_output_path)

In [18]:
# To reload the saved model later, uncomment:
# model_1 = SentenceTransformer(my_output_path)

In [19]:
# Embed the example sentences again with model_1, now after fine-tuning.
embeddings = model_1.encode(sentences)

# Compare the first sentence (as a single-row matrix) against all the others.
# NOTE: scipy's "cosine" metric is the cosine *distance* (1 - cosine similarity),
# so despite the variable name, smaller values mean "more similar".
sims = cdist(
    embeddings[0].reshape(1, -1),
    embeddings[1:],
    metric="cosine",
)[0]
sims

array([0.27484048, 0.14570002])