# Data

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np


def get_triplets(df):
    """Build (anchor, positive, negative) triplets from one group of pairs.

    Column 0 of ``df`` is read as the anchor, column 1 as the candidate and
    column 2 as a 0/1 label. Every candidate labelled 1 is combined with
    every candidate labelled 0; the anchor is taken from the first row
    labelled 1.

    Returns a ``pd.DataFrame`` with one triplet per row, or an empty frame
    when the group has no positives or no negatives.
    """
    positives = df[df[2] == 1]
    negatives = df[df[2] == 0]

    # A triplet needs at least one example of each kind.
    if len(positives) == 0 or len(negatives) == 0:
        return pd.DataFrame([])

    anchor = positives[0].values[0]
    positive_texts = positives[1].values
    negative_texts = negatives[1].values

    # Cartesian product, positives in the outer loop.
    rows = [[anchor, pos_text, neg_text]
            for pos_text in positive_texts
            for neg_text in negative_texts]
    return pd.DataFrame(rows)


# Convert the train/dev XML files into tab-separated pair files.
for spl in ['train', 'dev']:
    # Only the train file name carries the "-part1" suffix.
    if spl == 'train':
        path_template = 'datasets/traindevA/{0}/SemEval2016-Task3-CQA-QL-{0}-part1.xml'
    else:
        path_template = 'datasets/traindevA/{0}/SemEval2016-Task3-CQA-QL-{0}.xml'
    path = path_template.format(spl)
    xmlTree = ET.parse(path)
    root = xmlTree.getroot()

    # Collapse the three relevance labels into a binary target.
    label_map = {'PerfectMatch': 1, 'Relevant': 1, 'Irrelevant': 0}

    data = []           # rows of [original question id, related question id, 0/1 label]
    questions = dict()  # question id -> [subject, body]
    for orgq in root:
        orgq_id = orgq.attrib['ORGQ_ID']
        # Missing element text (None) is stored as an empty string.
        questions[orgq_id] = [orgq.find('OrgQSubject').text or '',
                              orgq.find('OrgQBody').text or '']

        # Only the first RelQuestion under the Thread element is read.
        q = orgq.find('Thread').find('RelQuestion')
        rel_subject = q.find('RelQSubject').text or ''
        rel_body = q.find('RelQBody').text or ''
        q_id = q.attrib['RELQ_ID']
        questions[q_id] = [rel_subject, rel_body]

        data.append([orgq_id, q_id, label_map[q.attrib['RELQ_RELEVANCE2ORGQ']]])

    # BSC: positive pairs only, each written with label 1.
    train_pos = [
        [' [SEP] '.join(questions[src_id]), ' [SEP] '.join(questions[rel_id]), 1]
        for src_id, rel_id, target in data
        if target == 1
    ]
    pd.DataFrame(train_pos).to_csv('datasets/semevalB/{}.tsv'.format(spl),
                                   index=False, sep='\t', header=False)

    # MSE: every pair together with its binary label.
    train_all = [
        [' [SEP] '.join(questions[src_id]), ' [SEP] '.join(questions[rel_id]), target]
        for src_id, rel_id, target in data
    ]
    pd.DataFrame(train_all).to_csv('datasets/semevalB/{}_all.tsv'.format(spl),
                                   index=False, sep='\t', header=False)
    
    

# Expand each split's labelled pairs into triplet files.
for spl in ['train', 'dev']:
    all_pairs = pd.read_csv('datasets/semevalB/{}_all.tsv'.format(spl),
                            sep='\t', header=None)
    # One get_triplets call per distinct value of column 0.
    data = all_pairs.groupby(0).apply(get_triplets)
    # Replace the index produced by groupby/apply with a flat 0..n-1 index.
    data.index = np.arange(len(data))
    data.to_csv('datasets/semevalB/{}_triplets.tsv'.format(spl),
                index=False, sep='\t', header=False)

# Convert the test XML file into a tab-separated pair file.
for spl in ['test']:
    # The path has no placeholder, so it is used as-is (the former
    # `.format(spl)` call on it was a no-op).
    path = 'datasets/traindevA/SemEval2017-task3-English-test.xml'
    xmlTree = ET.parse(path)
    root = xmlTree.getroot()

    # Collapse the three relevance labels into a binary target.
    label_map = {'PerfectMatch': 1, 'Relevant': 1, 'Irrelevant': 0}

    data = []           # rows of [original question id, related question id, 0/1 label]
    questions = dict()  # question id -> [subject, body]
    for orgq in root:
        orgq_id = orgq.attrib['ORGQ_ID']
        # Missing element text (None) is stored as an empty string.
        orgq_subj = orgq.find('OrgQSubject').text or ''
        orgq_body = orgq.find('OrgQBody').text or ''
        questions[orgq_id] = [orgq_subj, orgq_body]

        # Only the first RelQuestion under the Thread element is read.
        q = orgq.find('Thread').find('RelQuestion')
        q_subj = q.find('RelQSubject').text or ''
        q_body = q.find('RelQBody').text or ''
        q_id = q.attrib['RELQ_ID']
        questions[q_id] = [q_subj, q_body]
        label = q.attrib['RELQ_RELEVANCE2ORGQ']

        data.append([orgq_id, q_id, label_map[label]])

    # MSE: every pair together with its binary label.
    train_all = [
        [' [SEP] '.join(questions[el[0]]), ' [SEP] '.join(questions[el[1]]), el[2]]
        for el in data
    ]
    # Name the output after the split, as the train/dev loop does
    # (resolves to the same 'datasets/semevalB/test.tsv').
    pd.DataFrame(train_all).to_csv('datasets/semevalB/{}.tsv'.format(spl),
                                   index=False, sep='\t', header=False)

# Train

In [1]:
import sys
sys.path.insert(0, "../lib")

from STSDataReaderBinary import STSDataReaderBinary
from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from BSCLoss import BSCLoss, ComboBSCLoss
from BSCShuffler import ShuffledSentencesDataset, ShuffledSentenceTransformer
from BSCShuffler import BSCShuffler, ModelBSCShuffler, ModelExampleBasedShuffler

from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator, SimilarityFunction
from sentence_transformers.readers import *
import pandas as pd
import logging
import csv

from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics import pairwise_distances
from metrics import calculate_metrics
import numpy as np

# How many times the training/evaluation loop is repeated.
num_runs = 5

#### Emit timestamped INFO-level (and above) log records through LoggingHandler
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

ModuleNotFoundError: No module named 'STSDataReaderBinary'

In [None]:
import shutil  # needed only for the checkpoint cleanup in this cell

# Locations used by this cell, defined once.
# The directory casing matches the path the data-preparation code in this file
# writes to ('datasets/semevalB'); the previous 'datasets/SemevalB' only
# resolved on case-insensitive filesystems.
data_dir = 'datasets/semevalB'
model_save_path = 'checkpoints_/BSC_semb'
metrics_path = 'test-metrics-CQA-B.txt'

train_batch_size = 30 # 50
num_epochs = 7  # 4

# Header line identifying this configuration in the shared metrics log.
with open(metrics_path, 'a+') as f:
    f.write('BSC, source, 3e-5, tau 0.07\n')

for i in range(num_runs):
    # Fresh bert-base-uncased encoder with mean pooling for every run.
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=70)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = ShuffledSentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

    sts_reader_pos = STSDataReaderBinaryPositives(data_dir,
                               s1_col_idx=0, s2_col_idx=1, score_col_idx=2,normalize_scores=False, thr=0.6,
                               get_positives=False)

    sts_reader = STSDataReader(data_dir,
                               s1_col_idx=0, s2_col_idx=1, score_col_idx=2,normalize_scores=False)

    # Triplet objective. Built here but not passed to model.fit below.
    train_examples = []
    with open(os.path.join(data_dir, 'train_triplets.tsv'), encoding="utf-8") as fIn:
        reader = csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            train_examples.append(InputExample(texts=[row[0], row[1], row[2]], label=1))

    train_data_triplet = SentencesDataset(examples=train_examples, model=model)
    train_dataloader_triplet = DataLoader(train_data_triplet, shuffle=True, batch_size=train_batch_size)
    train_loss_triplet = losses.TripletLoss(model=model, triplet_margin=0.5)

    # BSC objective: the only one handed to model.fit below.
    train_data_BSC = ShuffledSentencesDataset(sts_reader_pos.get_examples('train_all.tsv'), model)
    train_dataloader_BSC = DataLoader(train_data_BSC, shuffle=False, batch_size=train_batch_size)
    train_loss_BSC = BSCLoss(model=model, norm_dim=1, tau=0.07)

    # Cosine-similarity objective. Built here but not passed to model.fit below.
    train_data = SentencesDataset(sts_reader.get_examples('train_all.tsv'), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    dev_data = SentencesDataset(sts_reader.get_examples('dev_all.tsv'), model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)

    # Dev pairs for the in-training evaluator. The file is produced by
    # DataFrame.to_csv, which quotes fields when needed, so it is parsed with
    # csv.reader (as the triplet file above is) rather than a raw split on
    # tabs, which would leave quoting characters inside the sentences.
    dev_sentences1 = []
    dev_sentences2 = []
    dev_labels = []
    with open(os.path.join(data_dir, "dev_all.tsv"), encoding='utf8', newline='') as fIn:
        for row in csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL):
            dev_sentences1.append(row[0])
            dev_sentences2.append(row[1])
            dev_labels.append(int(row[2]))

    binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)
    binary_acc_evaluator.main_similarity = SimilarityFunction.COSINE

    # Warm up over 10% of the total number of BSC training steps.
    warmup_steps = math.ceil(len(train_data_BSC)*num_epochs/train_batch_size*0.1)

    # Not passed to model.fit below (the corresponding arguments are commented out).
    shuffler = ModelExampleBasedShuffler(group_size=4, allow_same=True)

    # Remove the previous run's checkpoint. Like `rm -rf`, a missing directory
    # is not an error, and this also works outside IPython.
    shutil.rmtree(model_save_path, ignore_errors=True)

    model.fit(train_objectives=[(train_dataloader_BSC, train_loss_BSC)],
              evaluator=binary_acc_evaluator,
              epochs=num_epochs,
              evaluation_steps=3000,
              warmup_steps=warmup_steps,
              output_path=model_save_path,
              #shuffler=shuffler,
              #shuffle_idxs=[0],
              optimizer_params={'alpha_lr': 0.1, 'lr': 3e-5},
              )

    # Reload the checkpoint written to output_path and score it on dev, then test.
    model = ShuffledSentenceTransformer(model_save_path)
    for split_tag, split_file in (('dev', 'dev_all.tsv'), ('test', 'test.tsv')):
        data = pd.read_csv(os.path.join(data_dir, split_file), sep='\t', header=None)
        labels = data[2].values
        # NOTE(review): the second element of each pair is the label column (2),
        # not the second sentence (column 1). Left unchanged because the contract
        # of calculate_metrics is not visible here - confirm this is intended.
        pairs = list(zip(list(data[0].values), list(data[2].values)))
        # Cosine similarity between the two sentence embeddings of each row.
        preds = 1 - paired_cosine_distances(model.encode(data[0].values), model.encode(data[1].values))

        met = calculate_metrics(pairs, preds, labels, labels)
        with open(metrics_path, 'a+') as f:
            f.write(split_tag + ' ' + str(met['value'].values[0]) + ' ' + str(met['value'].values[1]) + '\n')

    # Drop the reference to the model before the next run builds a new one.
    model = None