In [None]:
import pandas as pd

# Build the Snopes train/test TSV files from the claim -> verified-fact pairs.
TRAIN_SIZE = 800       # the first TRAIN_SIZE pairs go to train.tsv, the rest are held out
NEGATIVE_ROUNDS = 5    # random negative facts drawn for every held-out claim
NEGATIVE_SEED = 0      # base seed, so the generated test.tsv is the same on every run

pairs = pd.read_csv('datasets/Snopes/tweet-veclaim-pairs.tsv', sep='\t')
pairs["score"] = 1  # every pair read from the source file is labelled 1
pairs[:TRAIN_SIZE].to_csv('datasets/Snopes/train.tsv', sep='\t', header=False, index=False)

test = pairs[TRAIN_SIZE:]
veclaims = pd.read_csv('datasets/Snopes/verified-claims.tsv', sep='\t')

# Each round pairs every held-out claim with a randomly drawn fact, labelled 0.
# `.values` makes the pairing explicitly positional instead of index-aligned.
# NOTE(review): a sampled fact can coincide with the claim's true fact, which
# would then be written with label 0 -- confirm whether that needs filtering.
# NOTE(review): the concat below assumes `pairs` has exactly the columns
# claim, fact, score (in that order) -- verify against the source file.
negative_rounds = [
    pd.DataFrame({
        'claim': test['claim'].values,
        'fact': veclaims['fact'].sample(len(test), random_state=NEGATIVE_SEED + round_idx).values,
        'score': 0,
    })
    for round_idx in range(NEGATIVE_ROUNDS)
]

# Positives come first, followed by the negative rounds; one concat instead of
# growing the frame inside the loop.
new_test = pd.concat([test] + negative_rounds)
new_test.to_csv('datasets/Snopes/test.tsv', sep='\t', header=False, index=False)

In [None]:
import sys
sys.path.insert(0, "../lib")

from STSDataReaderBinary import STSDataReaderBinary
from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from BSCLoss import BSCLoss, ComboBSCLoss
from BSCShuffler import ShuffledSentencesDataset, ShuffledSentenceTransformer
from BSCShuffler import BSCShuffler, ModelBSCShuffler, ModelExampleBasedShuffler

from torch.utils.data import DataLoader
import math
import os
import shutil
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator, SimilarityFunction
from sentence_transformers.readers import *
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import paired_cosine_distances
from tqdm import tqdm
import logging
import csv
from unidecode import unidecode

# Send INFO-level (and above) log records, timestamped, through the
# LoggingHandler imported from sentence_transformers so that debug
# information is printed while the notebook runs.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# How many times the train-then-evaluate experiment is repeated.
num_runs = 5


def clear_text(text):
    """Normalise a sentence so superficially different copies compare equal.

    The text is transliterated with ``unidecode``, every apostrophe is
    replaced by a double quote, and whitespace is canonicalised: leading and
    trailing whitespace is dropped and each internal run of whitespace
    collapses to a single space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = unidecode(text)
    text = text.replace("'", '"')
    # split() + join collapses whitespace. Joining the stripped *string*
    # would iterate its characters and insert a space between every one of
    # them, leaving whitespace differences intact.
    return ' '.join(text.split())


def has_positive(preds, k, el):
    """Tell whether ``el`` appears among the first ``k`` predictions.

    Both the expected element and every candidate are passed through
    ``clear_text`` before being compared, so the match is made on the
    normalised text rather than on the raw strings.

    Args:
        preds: Sequence of predicted strings, best prediction first.
        k: Number of leading predictions to inspect.
        el: The expected string.

    Returns:
        1 if a normalised match is found in ``preds[:k]``, otherwise 0.
    """
    # The normalised target does not depend on the candidate, so compute it
    # once instead of once per prediction.
    target = clear_text(el)
    return int(any(clear_text(pred) == target for pred in preds[:k]))

def has_positive_old(preds, k, el):
    """Exact-match variant of the top-k hit test.

    Args:
        preds: Sequence of predictions, best prediction first.
        k: Number of leading predictions to inspect.
        el: The expected element, compared without any normalisation.

    Returns:
        1 if ``el`` is contained in ``preds[:k]``, otherwise 0.
    """
    top_k = preds[:k]
    return 1 if el in top_k else 0

In [None]:
# Experiment: train a bert-base-uncased sentence encoder with the BSC loss on
# the Snopes pairs `num_runs` times and append HasPositive@k retrieval metrics
# for every run to the metrics file.
metrics_path = 'intervals_estimates/new_metrics-Snopes.txt'
model_save_path = 'checkpoints_/bsc_snopes_shuffled'
ks = [1, 3, 5, 10, 20, 50]   # cut-offs for HasPositive@k
train_batch_size = 30
num_epochs = 6

# Fail early (before any training) if the metrics directory is missing.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'a+') as f:
    f.write('bsc, ex-based, 7-same, 2e-5, bias False, norm 0, tau 1.2\n')

# The evaluation data does not change between runs, so it is read once.
# pandas is used for test.tsv so that quoted fields are parsed the same way
# they were written; a plain split('\t') would keep the quoting characters.
test = pd.read_csv('datasets/Snopes/test.tsv', sep='\t', header=None)
veclaims = pd.read_csv('datasets/Snopes/verified-claims.tsv', sep='\t')
verified_facts = veclaims['fact'].values

dev_sentences1 = test[0].tolist()
dev_sentences2 = test[1].tolist()
dev_labels = test[2].astype(int).tolist()

# Retrieval is scored on the positive (label 1) rows only. They are expected
# to form the leading block of test.tsv; derive their count from the labels
# instead of hard-coding it.
num_positives = int((test[2] == 1).sum())
assert num_positives > 0, "test.tsv contains no positive pairs"
assert (test[2].values[:num_positives] == 1).all(), \
    "positive pairs must be the first rows of test.tsv"
query_claims = test[0].values[:num_positives]
gold_facts = test[1].values[:num_positives]

for _ in range(num_runs):
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=90)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = ShuffledSentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

    sts_reader_pos = STSDataReaderBinaryPositives('datasets/Snopes',
                                                  s1_col_idx=0, s2_col_idx=1, score_col_idx=2,
                                                  normalize_scores=False, thr=0.6,
                                                  get_positives=False)
    sts_reader = STSDataReader('datasets/Snopes',
                               s1_col_idx=0, s2_col_idx=1, score_col_idx=2, normalize_scores=False)

    # Triplet objective. NOTE: built here but not passed to model.fit below;
    # only the BSC objective is trained in this configuration.
    with open(os.path.join('datasets/Snopes/', 'train_triplets.tsv'), encoding="utf-8") as fIn:
        reader = csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        train_examples = [InputExample(texts=[row[0], row[1], row[2]], label=1) for row in reader]

    train_data_triplet = SentencesDataset(examples=train_examples, model=model)
    train_dataloader_triplet = DataLoader(train_data_triplet, shuffle=True, batch_size=train_batch_size)
    train_loss_triplet = losses.TripletLoss(model=model, triplet_margin=0.5)

    # BSC objective -- the one that is actually trained. The DataLoader does
    # not shuffle; a shuffler object is handed to model.fit instead.
    train_data_bsc = ShuffledSentencesDataset(sts_reader_pos.get_examples('train.tsv'), model)
    train_dataloader_bsc = DataLoader(train_data_bsc, shuffle=False, batch_size=train_batch_size)
    train_loss_bsc = BSCLoss(model=model, norm_dim=0, tau=1.2)

    # Cosine-similarity objective. NOTE: also not passed to model.fit below.
    train_data = SentencesDataset(sts_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)

    shuffler = ModelExampleBasedShuffler(group_size=7, allow_same=True)
    # Warm up over 10% of the total number of training steps.
    warmup_steps = math.ceil(len(train_data_bsc)*num_epochs/train_batch_size*0.1)

    # Start every run from an empty checkpoint directory.
    shutil.rmtree(model_save_path, ignore_errors=True)

    model.fit(train_objectives=[(train_dataloader_bsc, train_loss_bsc)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path,
              optimizer_params={'alpha_lr':0.4, 'lr': 2e-5, 'correct_bias': False},
              shuffler=shuffler,
              shuffle_idxs=[0]
             )

    # Score the checkpoint written to model_save_path during fit.
    embedder = SentenceTransformer(model_save_path) # device='cpu'

    # Only the positive claims are used as queries, so only those are encoded.
    gr_test = embedder.encode(query_claims)
    one_vs_all_right = embedder.encode(verified_facts)

    sc = np.zeros(len(ks), dtype=int)
    sc_old = np.zeros(len(ks), dtype=int)
    for i in tqdm(range(num_positives)):
        # Cosine similarity between query i and every verified fact.
        one_vs_all_left = np.tile(gr_test[i], (len(verified_facts), 1))
        one_vs_all_sim = 1 - paired_cosine_distances(one_vs_all_left, one_vs_all_right)
        # Stable descending sort; only the top max(ks) facts are ever inspected.
        ranking = np.argsort(-one_vs_all_sim, kind='stable')
        preds = verified_facts[ranking[:max(ks)]].tolist()
        sc += np.array([has_positive(preds, k, gold_facts[i]) for k in ks])
        sc_old += np.array([has_positive_old(preds, k, gold_facts[i]) for k in ks])

    with open(metrics_path, 'a+') as f:
        for k, rate in zip(ks, sc / num_positives):
            f.write('HasPositive@{} {}\n'.format(k, round(rate, 3)))
        f.write('OLD\n')
        for k, rate in zip(ks, sc_old / num_positives):
            f.write('HasPositive@{} {}\n'.format(k, round(rate, 3)))
        f.write('\n')