In [41]:
import os
import numpy as np
import pandas as pd
import math
import csv

import transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import STSDataReader, TripletReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryEmbeddingSimilarityEvaluator
from sentence_transformers.readers.InputExample import InputExample

from torch.utils.data import DataLoader

from scipy.spatial.distance import cdist


In [14]:
TRAIN_SPLITS_DATA_DIR = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_data/splitted'

In [2]:
# increase swap size:
# https://superuser.com/questions/1024064/change-swap-file-size-fedora-23

In [3]:
model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

In [4]:
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [5]:
model_1.get_max_seq_length()

128

In [6]:
sentences = ['A fox lives in a zoo together with dogs.',
            'Sentences are passed as a list of string.', 
            'The quick brown fox jumps over the lazy dog.']


In [7]:
embeddings = model_wiki.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.01991861, 0.01082202])

In [8]:
embeddings = model_1.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.95112492, 0.58440415])

In [9]:
############

In [16]:
#datareader for the regression/raw data
class MyDataReader(STSDataReader):
    '''
    Need to reimplement get_examples method from class STSDataReader because
    our csv file has a header.
    '''
    
    def __init__(self, dataset_folder, **kwargs):
        super(MyDataReader,self).__init__(dataset_folder, **kwargs)
        
    def get_examples(self, filename, header=True, max_examples=0):
        """
        filename specified which data split to use (train.csv, dev.csv, test.csv).
        """
        data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"),
                          delimiter=self.delimiter, quoting=self.quoting)
        if header:
            next(data, None)  # skip the header
        examples = []
        for id, row in enumerate(data):
            score = float(row[self.score_col_idx])
            if self.normalize_scores:  # Normalize to a 0...1 value
                score = (score - self.min_score) / (self.max_score - self.min_score)

            s1 = row[self.s1_col_idx]
            s2 = row[self.s2_col_idx]
            examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score))

            if max_examples > 0 and len(examples) >= max_examples:
                break

        return examples


In [17]:
myreader_regression = MyDataReader(
                       os.path.join(TRAIN_SPLITS_DATA_DIR, 'queries3_sentences_regression'),
                       s1_col_idx=1,
                       s2_col_idx=3,
                       score_col_idx=2,
                       delimiter=",",
                       quoting=csv.QUOTE_MINIMAL,
                       normalize_scores=False, min_score=0, max_score=1)

myreader_regression.get_examples('queries3_sentences_regression_dev.csv', max_examples=2)

[<sentence_transformers.readers.InputExample.InputExample at 0x7f5ecd6d8780>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7f5e3ad58208>]

In [28]:
train_data = myreader_regression.get_examples('queries3_sentences_regression_train.csv', max_examples=100000)
train_dataset = SentencesDataset(train_data, show_progress_bar=True, model=model_1)



Convert dataset:   0%|          | 0/100000 [00:00<?, ?it/s][A
Convert dataset:   0%|          | 299/100000 [00:00<00:33, 2986.58it/s][A
Convert dataset:   1%|          | 629/100000 [00:00<00:32, 3073.53it/s][A
Convert dataset:   1%|          | 920/100000 [00:00<00:32, 3021.69it/s][A
Convert dataset:   1%|          | 1234/100000 [00:00<00:32, 3054.85it/s][A
Convert dataset:   2%|▏         | 1526/100000 [00:00<00:32, 3011.26it/s][A
Convert dataset:   2%|▏         | 1847/100000 [00:00<00:32, 3066.16it/s][A
Convert dataset:   2%|▏         | 2156/100000 [00:00<00:31, 3072.59it/s][A
Convert dataset:   2%|▏         | 2463/100000 [00:00<00:31, 3071.73it/s][A
Convert dataset:   3%|▎         | 2841/100000 [00:00<00:29, 3253.54it/s][A
Convert dataset:   3%|▎         | 3163/100000 [00:01<00:29, 3242.88it/s][A
Convert dataset:   3%|▎         | 3482/100000 [00:01<00:30, 3133.01it/s][A
Convert dataset:   4%|▍         | 3793/100000 [00:01<00:30, 3125.18it/s][A
Convert dataset:   4%|▍    

Convert dataset:  34%|███▎      | 33525/100000 [00:11<00:22, 2894.33it/s][A
Convert dataset:  34%|███▍      | 33823/100000 [00:11<00:22, 2916.87it/s][A
Convert dataset:  34%|███▍      | 34120/100000 [00:11<00:22, 2931.30it/s][A
Convert dataset:  34%|███▍      | 34414/100000 [00:11<00:25, 2600.77it/s][A
Convert dataset:  35%|███▍      | 34682/100000 [00:11<00:26, 2473.61it/s][A
Convert dataset:  35%|███▍      | 34973/100000 [00:11<00:25, 2588.81it/s][A
Convert dataset:  35%|███▌      | 35269/100000 [00:11<00:24, 2688.18it/s][A
Convert dataset:  36%|███▌      | 35543/100000 [00:11<00:23, 2702.71it/s][A
Convert dataset:  36%|███▌      | 35817/100000 [00:12<00:25, 2563.92it/s][A
Convert dataset:  36%|███▌      | 36144/100000 [00:12<00:23, 2740.66it/s][A
Convert dataset:  36%|███▋      | 36425/100000 [00:12<00:23, 2712.01it/s][A
Convert dataset:  37%|███▋      | 36706/100000 [00:12<00:23, 2740.63it/s][A
Convert dataset:  37%|███▋      | 37014/100000 [00:12<00:22, 2832.15it/s][A

Convert dataset:  67%|██████▋   | 67175/100000 [00:22<00:11, 2964.13it/s][A
Convert dataset:  67%|██████▋   | 67473/100000 [00:22<00:10, 2967.80it/s][A
Convert dataset:  68%|██████▊   | 67771/100000 [00:22<00:10, 2956.05it/s][A
Convert dataset:  68%|██████▊   | 68094/100000 [00:22<00:10, 3032.36it/s][A
Convert dataset:  68%|██████▊   | 68418/100000 [00:22<00:10, 3091.25it/s][A
Convert dataset:  69%|██████▊   | 68729/100000 [00:22<00:10, 3052.84it/s][A
Convert dataset:  69%|██████▉   | 69061/100000 [00:22<00:09, 3125.59it/s][A
Convert dataset:  69%|██████▉   | 69375/100000 [00:22<00:10, 3015.35it/s][A
Convert dataset:  70%|██████▉   | 69681/100000 [00:23<00:10, 3027.04it/s][A
Convert dataset:  70%|███████   | 70021/100000 [00:23<00:09, 3129.36it/s][A
Convert dataset:  70%|███████   | 70353/100000 [00:23<00:09, 3184.00it/s][A
Convert dataset:  71%|███████   | 70673/100000 [00:23<00:09, 3180.66it/s][A
Convert dataset:  71%|███████   | 70993/100000 [00:23<00:09, 3119.11it/s][A

Convert dataset:  99%|█████████▉| 98961/100000 [00:33<00:00, 2850.73it/s][A
Convert dataset:  99%|█████████▉| 99255/100000 [00:33<00:00, 2876.34it/s][A
Convert dataset: 100%|█████████▉| 99568/100000 [00:33<00:00, 2945.59it/s][A
Convert dataset: 100%|██████████| 100000/100000 [00:33<00:00, 2943.77it/s][A


In [29]:
dev_dataset = SentencesDataset(
    myreader_regression.get_examples('queries3_sentences_regression_dev.csv', max_examples=100000), 
    show_progress_bar=True, model=model_1)


Convert dataset:   0%|          | 0/16259 [00:00<?, ?it/s][A
Convert dataset:   2%|▏         | 334/16259 [00:00<00:04, 3335.80it/s][A
Convert dataset:   4%|▍         | 634/16259 [00:00<00:04, 3225.67it/s][A
Convert dataset:   5%|▌         | 890/16259 [00:00<00:05, 2987.52it/s][A
Convert dataset:   7%|▋         | 1201/16259 [00:00<00:04, 3022.05it/s][A
Convert dataset:   9%|▉         | 1474/16259 [00:00<00:05, 2927.28it/s][A
Convert dataset:  11%|█         | 1750/16259 [00:00<00:05, 2873.98it/s][A
Convert dataset:  13%|█▎        | 2057/16259 [00:00<00:04, 2928.55it/s][A
Convert dataset:  14%|█▍        | 2330/16259 [00:00<00:04, 2865.33it/s][A
Convert dataset:  16%|█▌        | 2606/16259 [00:00<00:04, 2832.47it/s][A
Convert dataset:  18%|█▊        | 2933/16259 [00:01<00:04, 2950.89it/s][A
Convert dataset:  20%|█▉        | 3221/16259 [00:01<00:04, 2820.45it/s][A
Convert dataset:  22%|██▏       | 3500/16259 [00:01<00:04, 2785.12it/s][A
Convert dataset:  23%|██▎       | 3802/1

In [30]:
"""
myreader_rawdata = MyDataReader('/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/',
                       s1_col_idx=1,
                       s2_col_idx=4,
                       score_col_idx=2,
                       delimiter=",",
                       quoting=csv.QUOTE_MINIMAL,
                       normalize_scores=False, min_score=0, max_score=1)

myreader_rawdata.get_examples('queries_od.csv', max_examples=2)
"""

'\nmyreader_rawdata = MyDataReader(\'/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/\',\n                       s1_col_idx=1,\n                       s2_col_idx=4,\n                       score_col_idx=2,\n                       delimiter=",",\n                       quoting=csv.QUOTE_MINIMAL,\n                       normalize_scores=False, min_score=0, max_score=1)\n\nmyreader_rawdata.get_examples(\'queries_od.csv\', max_examples=2)\n'

In [31]:
"""
my_data = SentencesDataset(examples=myreader.get_examples("queries_od.csv", max_examples=100), 
                           model=model_1,
                          show_progress_bar=True)
"""


'\nmy_data = SentencesDataset(examples=myreader.get_examples("queries_od.csv", max_examples=100), \n                           model=model_1,\n                          show_progress_bar=True)\n'

In [None]:
"""
my_data_l = SentenceLabelDataset(examples=myreader.get_examples("queries_od.csv", max_examples=100), 
                           model=model_wiki,
                          show_progress_bar=True)
"""

In [33]:
num_epochs = 1
train_batch_size = 8

warmup_steps = math.ceil(len(train_dataset)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

train_loss = losses.CosineSimilarityLoss(model=model_1)

In [38]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)

In [39]:
dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)

In [42]:
evaluator = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)

In [None]:
model_1.evaluate(evaluator)







Evaluating:   0%|          | 0/2033 [00:00<?, ?it/s][A[A[A[A[A




Evaluating:   0%|          | 1/2033 [00:03<2:00:20,  3.55s/it][A[A[A[A[A




Evaluating:   0%|          | 2/2033 [00:09<2:22:58,  4.22s/it][A[A[A[A[A




Evaluating:   0%|          | 3/2033 [00:12<2:08:42,  3.80s/it][A[A[A[A[A




Evaluating:   0%|          | 4/2033 [00:14<1:55:16,  3.41s/it][A[A[A[A[A




Evaluating:   0%|          | 5/2033 [00:18<1:56:23,  3.44s/it][A[A[A[A[A




Evaluating:   0%|          | 6/2033 [00:23<2:15:15,  4.00s/it][A[A[A[A[A




Evaluating:   0%|          | 7/2033 [00:27<2:19:57,  4.15s/it][A[A[A[A[A




Evaluating:   0%|          | 8/2033 [00:31<2:09:53,  3.85s/it][A[A[A[A[A




Evaluating:   0%|          | 9/2033 [00:39<2:52:44,  5.12s/it][A[A[A[A[A

In [43]:
MODEL_DIR = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/models'

In [44]:
my_output_path = os.path.join(MODEL_DIR, 'test_model_4')

In [47]:
model_1.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=BinaryEmbeddingSimilarityEvaluator,
          epochs=num_epochs,
          steps_per_epoch=100,
          warmup_steps=warmup_steps,
          optimizer_class=optimizer_class,
          optimizer_params=optimizer_params,
          output_path=my_output_path) # works only when you have an evaluator




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A[A



Iteration:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



Iteration:   1%|          | 1/100 [00:14<24:14, 14.69s/it][A[A[A[A



Iteration:   2%|▏         | 2/100 [00:46<25:22, 15.54s/it][A[A[A[A



Iteration:   3%|▎         | 3/100 [01:00<25:01, 15.48s/it][A[A[A[A



Iteration:   4%|▍         | 4/100 [01:16<24:49, 15.52s/it][A[A[A[A



Iteration:   5%|▌         | 5/100 [01:30<24:22, 15.40s/it][A[A[A[A



Iteration:   6%|▌         | 6/100 [01:54<24:51, 15.87s/it][A[A[A[A



Iteration:   7%|▋         | 7/100 [02:08<24:24, 15.75s/it][A[A[A[A



Iteration:   8%|▊         | 8/100 [02:24<24:09, 15.76s/it][A[A[A[A



Iteration:   9%|▉         | 9/100 [02:37<23:44, 15.65s/it][A[A[A[A



Iteration:  10%|█         | 10/100 [02:51<23:21, 15.57s/it][A[A[A[A



Iteration:  11%|█         | 11/100 [03:15<23:41, 15.98s/it][A[A[A[A



Iteration:  12%|█▏        | 12/100 [03:34<23:36, 16.09s/it

KeyboardInterrupt: 

In [None]:
model_1.evaluate(evaluator)


In [17]:
#model_1.save(my_output_path)

In [18]:
# load model
# model_1 = SentenceTransformer(my_output_path)

In [19]:
embeddings = model_1.encode(sentences)
sims = cdist(embeddings[0].reshape(-1,1).T, embeddings[1:], "cosine")[0]
sims

array([0.27484048, 0.14570002])