In [1]:
import os
import numpy as np
import pandas as pd
import math
import csv

import transformers
from tqdm.notebook import trange, tqdm

from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import STSDataReader, TripletReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryEmbeddingSimilarityEvaluator, SequentialEvaluator
from sentence_transformers.readers.InputExample import InputExample

import torch
from torch.utils.data import DataLoader, RandomSampler

from scipy.spatial.distance import cdist


In [2]:
# Root directory (relative path) of the training-data splits; the dataset
# folder used further down is built by joining onto this path.
TRAIN_SPLITS_DATA_DIR = 'msmarco/train_data/splitted'

In [3]:
# increase swap size:
# https://superuser.com/questions/1024064/change-swap-file-size-fedora-23

In [4]:
# Load a pretrained SentenceTransformer by name (judging by the name: BERT-base
# trained on Wikipedia sections with mean-token pooling).
model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

In [4]:
# Load the pretrained SentenceTransformer that the datasets below are built
# against (judging by the name: RoBERTa-large, NLI + STSb, mean-token pooling).
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [5]:
# Inspect the maximum sequence length configured for model_1.
model_1.get_max_seq_length()

128

In [6]:
# Three probe sentences; the first one is compared against the other two
# in the embedding sanity checks that follow.
sentences = [
    'A fox lives in a zoo together with dogs.',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]


In [None]:
# Embed the probe sentences with model_wiki and compare sentence 0 against the
# remaining ones.
# NOTE: scipy's "cosine" metric is a *distance* (1 - cosine similarity), so
# despite the name, smaller values in `sims` mean "more similar".
embeddings = model_wiki.encode(sentences)
sims = cdist(embeddings[0].reshape(1, -1), embeddings[1:], "cosine")[0]
sims

In [7]:
# Embed the probe sentences with model_1 and compare sentence 0 against the
# remaining ones.
# NOTE: scipy's "cosine" metric is a *distance* (1 - cosine similarity), so
# despite the name, smaller values in `sims` mean "more similar".
embeddings = model_1.encode(sentences)
sims = cdist(embeddings[0].reshape(1, -1), embeddings[1:], "cosine")[0]
sims

array([0.95112494, 0.58440401])

In [14]:
############

In [15]:
# Ask PyTorch to release cached, currently unoccupied GPU memory before the
# datasets and loaders below are built.
torch.cuda.empty_cache()

In [8]:
# Data reader for the regression/raw data
class MyDataReader(STSDataReader):
    '''
    STSDataReader variant for CSV files that start with a header row.

    Only get_examples is re-implemented; construction is delegated unchanged to
    STSDataReader. The column indices, delimiter, quoting mode and score
    normalisation settings are read from instance attributes, which are
    presumably set by STSDataReader.__init__ from the keyword arguments
    (verify against the installed sentence_transformers version).
    '''

    def __init__(self, dataset_folder, **kwargs):
        super(MyDataReader, self).__init__(dataset_folder, **kwargs)

    def get_examples(self, filename, header=True, max_examples=0):
        """
        Read sentence pairs and their scores from one CSV split.

        :param filename: file name inside ``self.dataset_folder``; selects the
            data split to use (train.csv, dev.csv, test.csv).
        :param header: when True, the first row is skipped as a header row.
        :param max_examples: stop after this many examples; values <= 0 mean
            "read the whole file".
        :return: list of InputExample, one per data row, with
            ``texts=[s1, s2]`` and ``label`` set to the (optionally normalised)
            float score.
        :raises ValueError: if a score cell cannot be parsed as a float.
        :raises IndexError: if a row has fewer columns than the configured
            column indices require.
        """
        path = os.path.join(self.dataset_folder, filename)
        examples = []

        # Use a context manager so the file handle is always closed (the
        # previous version leaked it), and newline="" as the csv module
        # requires so that quoted fields containing line breaks parse correctly.
        with open(path, encoding="utf-8", newline="") as csv_file:
            rows = csv.reader(csv_file, delimiter=self.delimiter, quoting=self.quoting)
            if header:
                next(rows, None)  # skip the header (no error on an empty file)

            # row_idx counts data rows only (the header is already consumed),
            # so guids stay identical to the previous implementation.
            for row_idx, row in enumerate(rows):
                score = float(row[self.score_col_idx])
                if self.normalize_scores:  # Normalize to a 0...1 value
                    score = (score - self.min_score) / (self.max_score - self.min_score)

                s1 = row[self.s1_col_idx]
                s2 = row[self.s2_col_idx]
                examples.append(InputExample(guid=filename + str(row_idx), texts=[s1, s2], label=score))

                if max_examples > 0 and len(examples) >= max_examples:
                    break

        return examples


In [9]:
# Folder holding the regression CSV splits; passed to the data reader as its
# dataset folder.
my_train_data_path = os.path.join(TRAIN_SPLITS_DATA_DIR, 'queries3_sentences_regression')

In [10]:
# Reader for the regression CSVs: sentence 1 is in column 1, the score in
# column 2 and sentence 2 in column 3. Scores are used as-is (no normalisation).
myreader_regression = MyDataReader(
    my_train_data_path,
    s1_col_idx=1,
    s2_col_idx=3,
    score_col_idx=2,
    delimiter=",",
    quoting=csv.QUOTE_MINIMAL,
    normalize_scores=False,
    min_score=0,
    max_score=1,
)

# Smoke test: read just the first two dev examples.
myreader_regression.get_examples('queries3_sentences_regression_dev.csv', max_examples=2)

[<sentence_transformers.readers.InputExample.InputExample at 0x7fa22f44f4e0>,
 <sentence_transformers.readers.InputExample.InputExample at 0x7fa22f44f2e8>]

In [11]:
# Read at most 100,000 training pairs and wrap them in a SentencesDataset built
# against model_1 (with a progress bar while converting).
train_data = myreader_regression.get_examples('queries3_sentences_regression_train.csv', max_examples=100000)
train_dataset = SentencesDataset(train_data, show_progress_bar=True, model=model_1)


Convert dataset: 100%|██████████| 100000/100000 [00:40<00:00, 2496.65it/s]


In [32]:
# Build the dev dataset from at most 20,000 dev examples, converted against
# model_1. The recorded progress bar below shows 16259 examples, i.e. fewer
# than the cap.
dev_dataset = SentencesDataset(
    myreader_regression.get_examples('queries3_sentences_regression_dev.csv', max_examples=20000), 
    show_progress_bar=True, model=model_1)

Convert dataset: 100%|██████████| 16259/16259 [00:06<00:00, 2588.49it/s]


In [33]:
# Batch size shared by the train and dev DataLoaders in this notebook.
train_batch_size = 8

In [27]:
# Random sampler over the dev set: draws 20,000 samples with replacement.
# NOTE(review): currently unused — the `sampler=` argument is commented out
# where dev_dataloader is created.
dev_dataset_sampler = RandomSampler(dev_dataset, replacement=True, num_samples=20000)

In [28]:
# Random sampler over the training set, without replacement.
# NOTE(review): currently unused — the `sampler=` argument is commented out
# where train_dataloader is created (which uses shuffle=True instead).
train_dataset_sampler = RandomSampler(train_dataset, replacement=False)

In [29]:
# Dev batches of train_batch_size examples; no shuffle or sampler argument is
# passed (the sampler is left commented out).
dev_dataloader = DataLoader(dev_dataset, batch_size=train_batch_size)#, sampler=dev_dataset_sampler)

In [30]:
# Training batches of train_batch_size examples, shuffled (shuffle=True) and
# loaded with num_workers=1; the explicit sampler is left commented out.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size, num_workers=1)#, sampler=train_dataset_sampler)

In [34]:
# Sanity check: number of examples in the dev dataset.
len(dev_dataset)

16259

In [35]:
# Sanity check: number of batches in the dev DataLoader.
# NOTE(review): the recorded output (375) does not match 16259 examples at a
# batch size of 8 (that would be 2033 batches); presumably dev_dataloader was
# built in an earlier, out-of-order run — re-run the notebook top to bottom to
# confirm.
len(dev_dataloader)

375

In [36]:
# Output directory (relative path) into which the DataLoaders are saved below.
my_model_path = 'msmarco/models/test_model5'

In [37]:
# Persist the dev DataLoader (torch.save pickles the whole object).
# Create the output directory first: torch.save does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(my_model_path, exist_ok=True)
torch.save(dev_dataloader, os.path.join(my_model_path, 'dev_dataloader.pth'))

In [38]:
# Persist the training DataLoader (torch.save pickles the whole object).
# Create the output directory first: torch.save does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs(my_model_path, exist_ok=True)
torch.save(train_dataloader, os.path.join(my_model_path, 'train_dataloader.pth'))