# Data

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np


def get_triplets(df):
    """Build (anchor, positive, negative) rows from one group of labelled pairs.

    ``df`` is indexed by integer column labels: column 0 supplies the anchor,
    column 1 the candidate text and column 2 the label (1 = positive,
    0 = negative). The anchor is taken from column 0 of the first positive
    row, and one output row is produced for every (positive, negative)
    combination of column-1 values.

    Returns an empty DataFrame when the group has no positive or no
    negative row.
    """
    positives = df[df[2] == 1]
    negatives = df[df[2] == 0]

    # Without at least one row of each kind no triplet can be formed.
    if len(positives) == 0 or len(negatives) == 0:
        return pd.DataFrame([])

    anchor = positives[0].values[0]
    negative_texts = negatives[1].values
    rows = [[anchor, positive_text, negative_text]
            for positive_text in positives[1].values
            for negative_text in negative_texts]
    return pd.DataFrame(rows)


# Convert the SemEval-2016 Task 3 subtask A XML threads into TSV files under
# datasets/semevalA/:
#   <split>.tsv      - only the (question, comment) rows whose label maps to 1
#   <split>_all.tsv  - every (question, comment, label) row
# 'dev' is processed alongside 'train' because later code in this file reads
# dev_all.tsv and no other visible code produces it.
for spl in ['train', 'dev']:
    if spl == 'train':
        # Only the "part1" training file is read here.
        path = 'datasets/traindevA/{0}/SemEval2016-Task3-CQA-QL-{0}-part1-subtaskA.xml'.format(spl)
    else:
        path = 'datasets/traindevA/{0}/SemEval2016-Task3-CQA-QL-{0}-subtaskA.xml'.format(spl)
    xmlTree = ET.parse(path)
    root = xmlTree.getroot()

    # Binary relevance: only 'Good' comments count as positives.
    label_map = {'Good': 1, 'PotentiallyUseful': 0, 'Bad': 0}

    data = []           # rows of [question id, comment text, binary label]
    questions = dict()  # question id -> [subject, body]
    for thread in root:
        for el in thread:
            if el.tag == 'RelQuestion':
                # An element without text yields None; fall back to '' so the
                # ' [SEP] '.join below always receives strings.
                q_subj = el.find('RelQSubject').text or ''
                q_body = el.find('RelQBody').text or ''
                q_id = el.attrib['RELQ_ID']
                questions[q_id] = [q_subj, q_body]
            elif el.tag == 'RelComment':
                # q_id still holds the id of the most recent RelQuestion, so
                # the comment is attached to that question.
                label = el.attrib['RELC_RELEVANCE2RELQ']
                c_text = el.find('RelCText').text
                data.append([q_id, c_text, label_map[label]])

    # One row per comment: "<subject> [SEP] <body>", comment text, label.
    train_all = [[' [SEP] '.join(questions[qid]), comment, relevance]
                 for qid, comment, relevance in data]

    # BSC: positives only.
    train_pos = [row for row in train_all if row[-1] == 1]
    pd.DataFrame(train_pos).to_csv('datasets/semevalA/{}.tsv'.format(spl),
                                   index=False, sep='\t', header=False)

    # MSE: all rows with their labels.
    pd.DataFrame(train_all).to_csv('datasets/semevalA/{}_all.tsv'.format(spl),
                                   index=False, sep='\t', header=False)

# For each split, group the labelled pairs by their first column, expand every
# group into triplets with get_triplets, and write the result next to the
# source file as <split>_triplets.tsv.
for spl in ['train', 'dev']:
    all_pairs = pd.read_csv('datasets/semevalA/{}_all.tsv'.format(spl),
                            sep='\t', header=None)
    grouped = all_pairs.groupby(0)
    data = grouped.apply(get_triplets)
    # Replace the index produced by groupby/apply with a plain 0..n-1 range.
    data.index = np.arange(len(data))
    triplets_path = 'datasets/semevalA/{}_triplets.tsv'.format(spl)
    data.to_csv(triplets_path, index=False, sep='\t', header=False)

# NOTE(review): `labeled` is not referenced again in the code visible here;
# the labels written below come from the XML attributes - confirm whether the
# relevancy file is still needed.
labeled = pd.read_csv(
    'datasets/traindevA/SemEval2017-Task3-CQA-QL-test-subtaskA.xml.subtaskA.relevancy',
    sep='\t',
    header=None,
)

# Convert the SemEval-2017 test XML into datasets/semevalA/test.tsv with one
# "<subject> [SEP] <body>", comment text, binary label row per comment.
for spl in ['test']:
    path = 'datasets/traindevA/SemEval2017-task3-English-test-subtaskA.xml'
    xmlTree = ET.parse(path)
    root = xmlTree.getroot()

    # Only 'Good' maps to the positive class.
    label_map = {'Good': 1, 'PotentiallyUseful': 0, 'Bad': 0}

    data = []
    questions = dict()
    for thread in root:
        for node in thread:
            tag = node.tag
            if tag == 'RelQuestion':
                subject = node.find('RelQSubject').text
                body = node.find('RelQBody').text
                q_id = node.attrib['RELQ_ID']
                # Missing text comes back as None; store '' instead.
                questions[q_id] = [subject if subject else '',
                                   body if body else '']
            elif tag == 'RelComment':
                # Attached to the most recently seen question id.
                relevance = node.attrib['RELC_RELEVANCE2RELQ']
                comment = node.find('RelCText').text
                data.append([q_id, comment, label_map[relevance]])

    train_all = [[' [SEP] '.join(questions[qid]), comment, target]
                 for qid, comment, target in data]
    out_path = 'datasets/semevalA/{}.tsv'.format(spl)
    pd.DataFrame(train_all).to_csv(out_path, index=False, sep='\t', header=False)

# Train

In [None]:
import sys
sys.path.insert(0, "../lib")

from STSDataReaderBinary import STSDataReaderBinary
from STSDataReaderBinaryPositives import STSDataReaderBinaryPositives
from BSCLoss import BSCLoss, ComboBSCLoss
from BSCShuffler import ShuffledSentencesDataset, ShuffledSentenceTransformer
from BSCShuffler import BSCShuffler, ModelBSCShuffler, ModelExampleBasedShuffler


from torch.utils.data import DataLoader
import math
import os
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, evaluation
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator, SimilarityFunction
from sentence_transformers.readers import *
import pandas as pd
import logging
import csv

# Configure root logging at INFO level with timestamped messages, emitted
# through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# How many times the train/evaluate loop below is repeated.
num_runs = 5

In [None]:
# Imports used by the evaluation code; executed once here instead of on every
# run of the loop below.
import shutil

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import paired_cosine_distances

from metrics import calculate_metrics

# NOTE(review): the data-preparation code in this file writes to
# 'datasets/semevalA' (lower-case "s"); on a case-sensitive filesystem that is
# a different directory from the one read here - confirm which is intended.
data_dir = 'datasets/SemevalA'
metrics_path = 'intervals_estimates/test-metrics-CQA-A.txt'
model_save_path = 'checkpoints_/bsc_sema'

# Header line describing the configuration whose results follow.
with open(metrics_path, 'a+') as f:
    f.write('combo, shuffled ex-based gs 7 with same, mu=0.1, 2e-5, tau 0.055\n')

# The dev pairs are identical for every run, so read them once. csv.reader is
# used (as for the triplets below) because the file is written by pandas with
# CSV quoting, which a plain split('\t') would not undo.
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join(data_dir, 'dev_all.tsv'), encoding='utf8', newline='') as fIn:
    for row in csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL):
        dev_sentences1.append(row[0])
        dev_sentences2.append(row[1])
        dev_labels.append(int(float(row[2])))

for i in range(num_runs):
    # Fresh BERT encoder with mean pooling for every run.
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=100)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = ShuffledSentenceTransformer(modules=[word_embedding_model, pooling_model], device='cuda')

    sts_reader_bin = STSDataReaderBinaryPositives(data_dir,
                                                  s1_col_idx=0, s2_col_idx=1, score_col_idx=2,
                                                  normalize_scores=False, thr=0.6, get_positives=False)
    sts_reader = STSDataReader(data_dir,
                               s1_col_idx=0, s2_col_idx=1, score_col_idx=2, normalize_scores=False)

    train_batch_size = 30 # 50
    num_epochs = 5  # 4

    train_examples = []
    with open(os.path.join(data_dir, 'train_triplets.tsv'), encoding="utf-8") as fIn:
        reader = csv.reader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        for row in reader:
            train_examples.append(InputExample(texts=[row[0], row[1], row[2]], label=1))

    # Triplet objective. It is not passed to model.fit below; only the dataset
    # length is used, for the warm-up computation.
    train_data_triplet = SentencesDataset(examples=train_examples, model=model)
    train_dataloader_triplet = DataLoader(train_data_triplet, shuffle=True, batch_size=train_batch_size)
    train_loss_triplet = losses.TripletLoss(model=model, triplet_margin=0.5)

    # BSC objective: the one actually trained. shuffle=False because batch
    # ordering is delegated to the shuffler handed to model.fit.
    train_data_bsc = ShuffledSentencesDataset(sts_reader_bin.get_examples('train_all.tsv'), model)
    train_dataloader_bsc = DataLoader(train_data_bsc, shuffle=False, batch_size=train_batch_size)
    train_loss_bsc = ComboBSCLoss(model=model, norm_dim=1, tau=0.055, mu=0.1)

    # Cosine-similarity objective. Also not passed to model.fit below.
    train_data = SentencesDataset(sts_reader.get_examples('train_all.tsv'), model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    binary_acc_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)
    binary_acc_evaluator.main_similarity = SimilarityFunction.COSINE

    # 10% of the step count derived from the triplet dataset size.
    # NOTE(review): training uses the BSC dataloader, not the triplet one -
    # confirm the warm-up length is meant to come from the triplet set.
    warmup_steps = math.ceil(len(train_data_triplet)*num_epochs/train_batch_size*0.1)

    # Start every run from an empty checkpoint directory. shutil keeps this
    # working outside IPython, unlike a get_ipython() shell call.
    shutil.rmtree(model_save_path, ignore_errors=True)

    shuffler = ModelExampleBasedShuffler(group_size=7, allow_same=True)

    model.fit(train_objectives=[(train_dataloader_bsc, train_loss_bsc)],
              evaluator=binary_acc_evaluator,
              epochs=num_epochs,
              evaluation_steps=10000,
              warmup_steps=warmup_steps,
              output_path=model_save_path,
              optimizer_params={'alpha_lr': 0.4, 'lr': 2e-5},
              shuffler=shuffler,
              shuffle_idxs=[0]
             )

    # Reload whatever model.fit saved under output_path and score it.
    model = SentenceTransformer(model_save_path)

    # Same evaluation for dev and test; each appends one line to the metrics
    # file with the first two entries of the 'value' column.
    for split_name, tsv_name in (('dev', 'dev_all.tsv'), ('test', 'test.tsv')):
        data = pd.read_csv(os.path.join(data_dir, tsv_name), sep='\t', header=None)
        labels = data[2].values
        # NOTE(review): this zips the first column with the label column
        # (data[2]), not with the second text column (data[1]) - confirm
        # against what calculate_metrics expects.
        pairs = list(zip(list(data[0].values), list(data[2].values)))

        # Cosine similarity between the paired embeddings; astype(str) turns
        # missing second-column values into strings before encoding.
        preds = 1 - paired_cosine_distances(model.encode(data[0].values),
                                            model.encode(data[1].values.astype(str)))

        met = calculate_metrics(pairs, preds, labels, labels)
        with open(metrics_path, 'a+') as f:
            f.write(split_name + ' ' + str(met['value'].values[0]) + ' ' + str(met['value'].values[1]) + '\n')
