In [None]:
# Fetch the ANTIQUE files that the preprocessing cell reads (collection, train
# queries, train qrels). -nc skips files that are already present, so re-running
# this cell does not leave "*.txt.1" duplicates behind.
!wget -nc https://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt -P datasets/Antique
!wget -nc https://ciir.cs.umass.edu/downloads/Antique/antique-train-queries.txt -P datasets/Antique
!wget -nc https://ciir.cs.umass.edu/downloads/Antique/antique-train.qrel -P datasets/Antique

In [None]:
import pandas as pd


ANTIQUE_DIR = 'datasets/Antique'


def read_id_text_file(path):
    """Load a tab-separated ``<id>\\t<text>`` file into an ``{id: text}`` dict.

    Any additional tab-separated fields on a line are joined into the text
    with single spaces, so the text never contains a tab. Blank lines are
    skipped. If an id occurs more than once, the last occurrence wins.
    """
    id_to_text = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if not fields[0]:
                continue
            id_to_text[fields[0]] = ' '.join(fields[1:])
    return id_to_text


# Relevance judgements: four whitespace-separated fields per line. Column 2 is
# the answer id looked up in the collection below and column 3 is the score
# column the readers use (score_col_idx=3). Column 0 is taken to be the query id
# (standard TREC qrel layout) -- NOTE(review): confirm against the dataset readme.
qrel_rows = []
with open(f'{ANTIQUE_DIR}/antique-train.qrel', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        query_id, iteration, doc_id, relevance = fields[:4]
        qrel_rows.append([query_id, iteration, doc_id, relevance])
ant = pd.DataFrame(qrel_rows)

# Keep the query texts and the answer texts in separate dicts: previously the
# collection dict overwrote the query dict, so the query text never reached the
# output file.
ant_queries = read_id_text_file(f'{ANTIQUE_DIR}/antique-train-queries.txt')
ant_texts = read_id_text_file(f'{ANTIQUE_DIR}/antique-collection.txt')

# The readers are configured with s1_col_idx=4 and s2_col_idx=5, so both text
# columns must be present, in this order. __getitem__ keeps a missing id a loud
# KeyError instead of silently writing an empty cell.
ant[4] = ant[0].map(ant_queries.__getitem__)
ant[5] = ant[2].map(ant_texts.__getitem__)
ant.to_csv(f'{ANTIQUE_DIR}/train.tsv', sep='\t', header=None, index=False)

In [None]:
import sys
sys.path.insert(0, "../lib")

from STSDataReaderBinary import STSDataReaderBinary
from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from BSCLoss import BSCLoss, ComboBSCLoss
from BSCShuffler import ShuffledSentencesDataset, ShuffledSentenceTransformer
from BSCShuffler import BSCShuffler, ModelBSCShuffler, ModelExampleBasedShuffler
from torch.utils.data import DataLoader
import math
import os

from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator, SimilarityFunction
from sentence_transformers.readers import *
import pandas as pd
import logging
import csv

# Show INFO-level (and above) log records with a timestamp prefix, emitted
# through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(level=logging.INFO,
                    handlers=[LoggingHandler()],
                    format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

In [None]:
# torch is only needed here to detect the available device; it is imported
# locally because the notebook's other torch import lives in a later cell.
import torch

# Encoder: bert-base-uncased (inputs capped at 90 tokens) followed by mean
# pooling over the token embeddings.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=90)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = ShuffledSentenceTransformer(modules=[word_embedding_model, pooling_model])


# Both readers read datasets/Antique/*.tsv with the sentence pair in columns 4/5
# and the 1..4 relevance score in column 3, normalised by the reader.
sts_reader_pos = STSDataReaderBinaryPositives('datasets/Antique', 
                                              s1_col_idx=4, s2_col_idx=5, score_col_idx=3,
                                              min_score=1, max_score=4, thr=0.5, normalize_scores=True,
                                             get_positives=False)
sts_reader = STSDataReader('datasets/Antique', 
                          s1_col_idx=4, s2_col_idx=5, score_col_idx=3,
                          min_score=1, max_score=4, normalize_scores=True)


train_batch_size = 50
num_epochs = 5


# Objective that is actually trained below (BSC loss on the binary reader).
train_data_bsc = ShuffledSentencesDataset(sts_reader_pos.get_examples('train.tsv'), model)
train_loss_bsc = BSCLoss(model=model, tau=0.1, norm_dim=1)
train_dataloader_bsc = DataLoader(train_data_bsc, shuffle=False, batch_size=train_batch_size)

# Cosine-similarity objective. It is NOT part of train_objectives below; it is
# kept so that other cells referencing these names keep working.
train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
train_loss = losses.CosineSimilarityLoss(model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('test.tsv'), name='test')
# Fall back to CPU instead of hard-coding 'cuda', so the cell also runs on
# machines without a GPU.
evaluator.device = 'cuda' if torch.cuda.is_available() else 'cpu'
evaluator.main_similarity = SimilarityFunction.COSINE

# Warm up over 10% of the optimisation steps of the dataloader that is really
# trained (previously derived from the unused cosine-similarity dataset).
warmup_steps = math.ceil(len(train_dataloader_bsc) * num_epochs * 0.1)
model_save_path = 'checkpoints/bsc_antique'

shuffler = ModelExampleBasedShuffler(group_size=7, allow_same=True)

model.fit(train_objectives=[(train_dataloader_bsc, train_loss_bsc)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          # optimizer_params={'lr': 3e-5, 'eps': 1e-6, 'correct_bias': False},
          shuffler=shuffler,
          shuffle_idxs=[0],
          output_path=model_save_path)

In [None]:
from sentence_transformers.util import batch_to_device
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import paired_cosine_distances
import pandas as pd
from metrics import calculate_metrics
import numpy as np


class Predictor:
    """Score every sentence pair of a dataloader by cosine similarity.

    Args:
        dataloader: Loader over the sentence pairs to score. Its ``collate_fn``
            is overwritten with the model's ``smart_batching_collate`` on call.
        show_progress_bar: Whether to wrap the loader in a tqdm bar. ``None``
            (the default) enables it when the root logger's effective level is
            INFO or DEBUG.
    """

    def __init__(self, dataloader, show_progress_bar=None):
        self.dataloader = dataloader
        if show_progress_bar is None:
            show_progress_bar = logging.getLogger().getEffectiveLevel() in (logging.INFO, logging.DEBUG)
        self.show_progress_bar = show_progress_bar

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, model):
        """Embed both sides of every pair with ``model`` and compare them.

        Returns:
            ``(cosine_scores, labels)``: an array with one cosine similarity per
            pair and the list of labels collected from the batches, in loader
            order. An empty loader yields an empty array and an empty list.
        """
        model.eval()
        # Batches are moved to self.device below; keep the model on the same
        # device so the forward pass cannot hit a device mismatch.
        model.to(self.device)

        embeddings1 = []
        embeddings2 = []
        labels = []

        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        for batch in iterator:
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                # Exactly two feature sets per batch: first and second sentence.
                emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy()
                              for sent_features in features]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)

        if not embeddings1:
            # Nothing was scored; skip paired_cosine_distances, which the
            # previous version crashed in on an empty loader.
            return np.array([]), labels

        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        return cosine_scores, labels
    
# Evaluate the checkpoint written by the training cell (model_save_path).
# NOTE(review): this used to load 'checkpoints/fps_antique', which the training
# cell never writes -- confirm that the BSC checkpoint is the intended one.
model = SentenceTransformer(model_save_path)

dev_data = SentencesDataset(sts_reader.get_examples('test.tsv'), model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)

predictor = Predictor(dev_dataloader, show_progress_bar=True)
preds, labels = predictor(model)

# Ids from columns 0 and 2 of the test file, one pair per scored row. The file
# is parsed once instead of twice.
test_table = pd.read_csv('datasets/Antique/test.tsv', sep='\t', header=None)
pairs = list(zip(test_table[0].values, test_table[2].values))
assert len(pairs) == len(preds), (
    f"test.tsv has {len(pairs)} rows but {len(preds)} pairs were scored; "
    "ids and predictions would be misaligned")

# Map normalised scores back onto the 1..4 scale the readers were configured
# with (min_score=1, max_score=4); labels above the 0.5 threshold (the readers'
# thr) are treated as the positive class.
MIN_SCORE, MAX_SCORE = 1, 4
POSITIVE_THRESHOLD = 0.5
score_range = MAX_SCORE - MIN_SCORE
pred_scores = np.array(preds) * score_range + MIN_SCORE
gold_scores = np.array(labels) * score_range + MIN_SCORE
gold_binary = (np.array(labels) > POSITIVE_THRESHOLD).astype(int)

calculate_metrics(pairs, pred_scores, gold_scores, gold_binary)