In [78]:
import os
import numpy as np
import pandas as pd
import math
import csv

import transformers
from tqdm.notebook import trange, tqdm

from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import STSDataReader, TripletReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryEmbeddingSimilarityEvaluator, SequentialEvaluator
from sentence_transformers.readers.InputExample import InputExample

from torch.utils.data import DataLoader, RandomSampler

from scipy.spatial.distance import cdist


In [2]:
TRAIN_SPLITS_DATA_DIR = 'data/msmarco/train_data/splitted'

In [3]:
# increase swap size:
# https://superuser.com/questions/1024064/change-swap-file-size-fedora-23

In [4]:
model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

100%|██████████| 405M/405M [00:24<00:00, 16.7MB/s] 


In [5]:
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

100%|██████████| 1.31G/1.31G [01:21<00:00, 16.1MB/s] 


In [6]:
model_1.get_max_seq_length()

128

In [7]:
sentences = ['A fox lives in a zoo together with dogs.',
            'Sentences are passed as a list of string.', 
            'The quick brown fox jumps over the lazy dog.']


In [8]:
embeddings = model_wiki.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.01991861, 0.01082202])

In [9]:
embeddings = model_1.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.95112494, 0.58440401])

In [10]:
############

In [11]:
#datareader for the regression/raw data
class MyDataReader(STSDataReader):
    '''
    Need to reimplement get_examples method from class STSDataReader because
    our csv file has a header.
    '''
    
    def __init__(self, dataset_folder, **kwargs):
        super(MyDataReader,self).__init__(dataset_folder, **kwargs)
        
    def get_examples(self, filename, header=True, max_examples=0):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).
        """
        data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"),
                          delimiter=self.delimiter, quoting=self.quoting)
        if header:
            next(data, None)  # skip the header
        examples = []
        for id, row in enumerate(data):
            score = float(row[self.score_col_idx])
            if self.normalize_scores:  # Normalize to a 0...1 value
                score = (score - self.min_score) / (self.max_score - self.min_score)

            s1 = row[self.s1_col_idx]
            s2 = row[self.s2_col_idx]
            examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score))

            if max_examples > 0 and len(examples) >= max_examples:
                break

        return examples


In [23]:
my_train_data_path = os.path.join(TRAIN_SPLITS_DATA_DIR, 'queries3_sentences_regression')

In [12]:
myreader_regression = MyDataReader(
                       my_train_data_path,
                       s1_col_idx=1,
                       s2_col_idx=3,
                       score_col_idx=2,
                       delimiter=",",
                       quoting=csv.QUOTE_MINIMAL,
                       normalize_scores=False, min_score=0, max_score=1)

myreader_regression.get_examples('queries3_sentences_regression_dev.csv', max_examples=2)

[<sentence_transformers.readers.InputExample.InputExample at 0x7f207b1fbcc0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f207b1fbd68>]

In [13]:
train_data = myreader_regression.get_examples('queries3_sentences_regression_train.csv', max_examples=100000)
train_dataset = SentencesDataset(train_data, show_progress_bar=True, model=model_1)


Convert dataset: 100%|██████████| 100000/100000 [00:40<00:00, 2463.77it/s]


In [61]:
dev_dataset = SentencesDataset(
    myreader_regression.get_examples('queries3_sentences_regression_dev.csv'), 
    show_progress_bar=True, model=model_1)





Convert dataset:   0%|          | 0/16259 [00:00<?, ?it/s][A[A[A[A



Convert dataset:   2%|▏         | 279/16259 [00:00<00:05, 2781.64it/s][A[A[A[A



Convert dataset:   3%|▎         | 563/16259 [00:00<00:05, 2796.52it/s][A[A[A[A



Convert dataset:   5%|▍         | 770/16259 [00:00<00:06, 2529.77it/s][A[A[A[A



Convert dataset:   6%|▋         | 1024/16259 [00:00<00:06, 2531.54it/s][A[A[A[A



Convert dataset:   8%|▊         | 1303/16259 [00:00<00:05, 2601.71it/s][A[A[A[A



Convert dataset:   9%|▉         | 1540/16259 [00:00<00:05, 2527.12it/s][A[A[A[A



Convert dataset:  11%|█         | 1802/16259 [00:00<00:05, 2551.70it/s][A[A[A[A



Convert dataset:  13%|█▎        | 2068/16259 [00:00<00:05, 2583.15it/s][A[A[A[A



Convert dataset:  14%|█▍        | 2326/16259 [00:00<00:05, 2580.65it/s][A[A[A[A



Convert dataset:  16%|█▌        | 2607/16259 [00:01<00:05, 2642.63it/s][A[A[A[A



Convert dataset:  18%|█▊        | 2901/16259 [00:01<00:

In [35]:
num_epochs = 1
train_batch_size = 8

warmup_steps = math.ceil(len(train_dataset)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

train_loss = losses.CosineSimilarityLoss(model=model_1)

In [42]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, num_workers=1)

In [106]:
dev_dataset_sampler = RandomSampler(dev_dataset, replacement=True, num_samples=5000)

In [107]:
dev_dataloader = DataLoader(dev_dataset, batch_size=train_batch_size, sampler=dev_dataset_sampler)

In [108]:
len(dev_dataset)

16259

In [109]:
len(dev_dataloader)

125

In [111]:
evaluator1 = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)
#evaluator2 = MSEEvaluator(dev_dataloader)
evaluator3 = EmbeddingSimilarityEvaluator(dev_dataloader)
evaluator = SequentialEvaluator([evaluator1, evaluator3])

In [112]:
my_model_path = 'data/msmarco/models/test_model4'

In [113]:
model_1.evaluate(evaluator, output_path=os.path.join(my_model_path, 'dev_set_performance_pretrain'))






Evaluating:   0%|          | 0/125 [00:00<?, ?it/s][A[A[A[A[A




Evaluating:   1%|          | 1/125 [00:00<00:59,  2.09it/s][A[A[A[A[A




Evaluating:   2%|▏         | 2/125 [00:00<00:54,  2.26it/s][A[A[A[A[A




Evaluating:   2%|▏         | 3/125 [00:01<00:48,  2.53it/s][A[A[A[A[A




Evaluating:   3%|▎         | 4/125 [00:01<00:44,  2.75it/s][A[A[A[A[A




Evaluating:   4%|▍         | 5/125 [00:01<00:39,  3.04it/s][A[A[A[A[A




Evaluating:   5%|▍         | 6/125 [00:01<00:36,  3.26it/s][A[A[A[A[A




Evaluating:   6%|▌         | 7/125 [00:02<00:37,  3.13it/s][A[A[A[A[A




Evaluating:   6%|▋         | 8/125 [00:02<00:35,  3.33it/s][A[A[A[A[A




Evaluating:   7%|▋         | 9/125 [00:02<00:33,  3.49it/s][A[A[A[A[A




Evaluating:   8%|▊         | 10/125 [00:03<00:32,  3.58it/s][A[A[A[A[A




Evaluating:   9%|▉         | 11/125 [00:03<00:32,  3.55it/s][A[A[A[A[A




Evaluating:  10%|▉         | 12/125 [00:03<00:30,  3.

0.3438173528698871

In [44]:
my_output_path = os.path.join(MODEL_DIR, 'test_model_4')

In [47]:
model_1.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=BinaryEmbeddingSimilarityEvaluator,
          epochs=num_epochs,
          steps_per_epoch=100,
          warmup_steps=warmup_steps,
          optimizer_class=optimizer_class,
          optimizer_params=optimizer_params,
          output_path=my_output_path) # works only when you have an evaluator




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A



Iteration:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



Iteration:   1%|          | 1/100 [00:14<24:14, 14.69s/it][A[A[A[A



Iteration:   2%|▏         | 2/100 [00:46<25:22, 15.54s/it][A[A[A[A



Iteration:   3%|▎         | 3/100 [01:00<25:01, 15.48s/it][A[A[A[A



Iteration:   4%|▍         | 4/100 [01:16<24:49, 15.52s/it][A[A[A[A



Iteration:   5%|▌         | 5/100 [01:30<24:22, 15.40s/it][A[A[A[A



Iteration:   6%|▌         | 6/100 [01:54<24:51, 15.87s/it][A[A[A[A



Iteration:   7%|▋         | 7/100 [02:08<24:24, 15.75s/it][A[A[A[A



Iteration:   8%|▊         | 8/100 [02:24<24:09, 15.76s/it][A[A[A[A



Iteration:   9%|▉         | 9/100 [02:37<23:44, 15.65s/it][A[A[A[A



Iteration:  10%|█         | 10/100 [02:51<23:21, 15.57s/it][A[A[A[A



Iteration:  11%|█         | 11/100 [03:15<23:41, 15.98s/it][A[A[A[A



Iteration:  12%|█▏        | 12/100 [03:34<23:36, 16.09s/it

KeyboardInterrupt: 

In [None]:
model_1.evaluate(evaluator)


In [17]:
#model_1.save(my_output_path)

In [18]:
# load model
# model_1 = SentenceTransformer(my_output_path)

In [19]:
embeddings = model_1.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.27484048, 0.14570002])