In [None]:
# Shell escape: runs download_glue_data.py (resolved relative to the working directory),
# requesting the QQP task with `datasets` as the data directory.
# The next cell expects datasets/QQP/dev.tsv to exist once this has finished.
!python download_glue_data.py --data_dir datasets --tasks QQP

In [None]:
import pandas as pd
import csv
import sys
csv.field_size_limit(sys.maxsize)

import os

# Build a fixed-size random subset of the QQP dev split and write it next to the
# original file so that later cells can load it as 'dev_sample.tsv'.
DEV_PATH = 'datasets/QQP/dev.tsv'
DEV_SAMPLE_PATH = 'datasets/QQP/dev_sample.tsv'
SAMPLE_SIZE = 3000
SAMPLE_SEED = 42  # fixed seed: re-running the cell writes the same sample

with open(DEV_PATH, encoding="utf-8") as fIn:
    # QUOTE_NONE: quote characters inside a question are ordinary text, not field quoting.
    reader = csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    next(reader, None)  # drop the first row (presumably the column header of the GLUE file)
    # Keep only rows that have at least six columns; shorter rows are discarded.
    rows = [row for row in reader if len(row) > 5]

data = pd.DataFrame(rows)

# Never request more rows than are available (sample() raises otherwise), and write
# with the same QUOTE_NONE convention used for reading: with the default quoting,
# pandas would wrap and double any field that contains a '"' character, and a
# QUOTE_NONE reader would then return text that differs from the source.
# The header row (integer column labels) is still written, as before.
data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED).to_csv(
    DEV_SAMPLE_PATH, sep='\t', index=False, quoting=csv.QUOTE_NONE)
data.head()

In [None]:
import sys
sys.path.insert(0, "../lib")

from STSDataReaderBinary import STSDataReaderBinary
from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from BSCLoss import BSCLoss
from BSCShuffler import BSCShuffler, ModelBSCShuffler, ShuffledSentencesDataset, ShuffledSentenceTransformer

from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator, SimilarityFunction
from sentence_transformers.readers import *
import pandas as pd
import logging
import csv
import sys
csv.field_size_limit(sys.maxsize)

# Configure the root logger: records at INFO level and above are emitted through
# LoggingHandler, each message prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

In [None]:
import torch  # only needed here to detect whether a GPU is available

# Use the GPU when one is visible, otherwise fall back to the CPU instead of
# failing on a hard-coded 'cuda' device.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Model: BERT encoder (inputs capped at 80 tokens) followed by mean pooling ---
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=80)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = ShuffledSentenceTransformer(modules=[word_embedding_model, pooling_model], device=DEVICE)


# --- Readers ---
# Both readers are configured identically: sentence pair in columns 3 and 4,
# score/label in column 5, files parsed without quote handling.
# NOTE(review): the exact meaning of `thr` and `get_positives` is defined by
# STSDataReaderBinaryPositives in ../lib -- confirm there before changing them.
reader_kwargs = dict(quoting=csv.QUOTE_NONE,
                     s1_col_idx=3, s2_col_idx=4, score_col_idx=5,
                     normalize_scores=False, thr=0.6, get_positives=False)
sts_reader_bin = STSDataReaderBinaryPositives('datasets/QQP', **reader_kwargs)
sts_reader = STSDataReaderBinaryPositives('datasets/QQP', **reader_kwargs)


train_batch_size = 50
num_epochs = 6


# --- Training data for the BSC objective (the only objective passed to fit below) ---
train_data_bsc = ShuffledSentencesDataset(sts_reader_bin.get_examples('train_sample.tsv'), model)
train_dataloader_bsc = DataLoader(train_data_bsc, shuffle=True, batch_size=train_batch_size)
train_loss_bsc = BSCLoss(model=model, tau=0.1)

# --- Cosine-similarity objective: built here but NOT passed to model.fit() below ---
train_data = SentencesDataset(sts_reader.get_examples('train_sample.tsv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# --- Dev data: `dev_dataloader` is consumed by the prediction cell further down ---
dev_data = SentencesDataset(sts_reader.get_examples('dev_sample.tsv'), model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)


# Evaluator run during training on the same dev sample, compared by cosine similarity.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('dev_sample.tsv'), name='sts-dev')
evaluator.device = DEVICE
evaluator.main_similarity = SimilarityFunction.COSINE


# Warm up over the first 10% of all optimisation steps (examples * epochs / batch size).
warmup_steps = math.ceil(len(train_data_bsc)*num_epochs/train_batch_size*0.1)
model_save_path = 'checkpoints_/bsc_qqp'

# NOTE(review): the shuffler arguments are interpreted by ModelBSCShuffler in ../lib;
# from the names it appears to form groups of 15 using 300 clusters -- confirm.
shuffler = ModelBSCShuffler(group_size=15, by_clusters=True, num_clusters=300,
                            file_name=None, output_file_name=None, column_name=None, max_ind=None)
model.fit(train_objectives=[(train_dataloader_bsc, train_loss_bsc)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          optimizer_params={'lr': 1e-5},
          shuffler=shuffler,
          shuffle_idxs=[0])

In [None]:
from sentence_transformers.util import batch_to_device
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics import f1_score, accuracy_score


class Predictor:
    """Score every sentence pair served by a dataloader with cosine similarity.

    Calling the instance with a model embeds both sentences of each pair and
    returns the per-pair cosine similarities together with the labels read
    from the dataloader.

    Args:
        dataloader: iterable of batches; its ``collate_fn`` attribute is
            overwritten with ``model.smart_batching_collate`` on every call.
        show_progress_bar: whether to wrap the batch loop in tqdm. ``None``
            (the default) enables the bar only when the root logger's
            effective level is INFO or DEBUG.
    """

    def __init__(self, dataloader, show_progress_bar=None):
        self.dataloader = dataloader
        if show_progress_bar is None:
            # Follow the logging verbosity when the caller expressed no preference.
            show_progress_bar = logging.getLogger().getEffectiveLevel() in (logging.INFO, logging.DEBUG)
        self.show_progress_bar = show_progress_bar

        # Batches are moved to the GPU when one is available, otherwise to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __call__(self, model):
        """Embed all pairs with ``model`` and return ``(cosine_scores, labels)``.

        The model is switched to eval mode and is left in that mode afterwards.

        Raises:
            Exception: whatever ``paired_cosine_distances`` raises is re-raised
                unchanged after a one-line summary has been printed.
        """
        model.eval()
        embeddings1 = []
        embeddings2 = []
        labels = []

        # Side effect: replaces the collate function on the caller's dataloader.
        self.dataloader.collate_fn = model.smart_batching_collate

        iterator = self.dataloader
        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert Evaluating")

        for batch in iterator:
            features, label_ids = batch_to_device(batch, self.device)
            with torch.no_grad():
                # Exactly two feature sets per batch: one per sentence of the pair.
                emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features]

            labels.extend(label_ids.to("cpu").numpy())
            embeddings1.extend(emb1)
            embeddings2.extend(emb2)

        try:
            # Cosine similarity = 1 - cosine distance, computed pair by pair.
            cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
        except Exception:
            # Report sizes only; printing every embedding vector would bury the traceback.
            print("Cosine scoring failed for {} / {} collected embeddings".format(
                len(embeddings1), len(embeddings2)))
            raise
        return cosine_scores, labels
    
# Score the dev pairs with the trained model.
predictor = Predictor(dev_dataloader, show_progress_bar=True)
preds, labels = predictor(model)

# A pair is predicted positive when its cosine similarity exceeds this cut-off.
thr = 0.79
binary_preds = (preds > thr).astype(float)

# Cell output: (F1, accuracy) of the thresholded predictions against the labels.
f1_score(labels, binary_preds), accuracy_score(labels, binary_preds)