# Library Import

In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange
from transformers import BertTokenizer, BertForSequenceClassification

# Dataframe load and Treatment

In [None]:
# Load the dataset into a dataframe.
# NOTE(review): the path is an empty string, which is not a readable file — set it to the
# location of the dataset CSV before running. Later cells read the columns 'topicText',
# 'topicTarget', 'topicSentiment', 'claims.claimCorrectedText', 'claims.claimSentiment',
# 'claims.targetsRelation', 'claims.stance' and 'split'.
df = pd.read_csv('')

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

In [None]:
# Renumber the remaining rows from 0 and throw the old index away
df = df.reset_index(drop = True)

In [None]:
# Recode the labels as integers the model can consume, in every column of the dataframe:
# 'PRO' -> 1, 'CON' -> 0 and -1 -> 0
df = (
    df.replace('PRO', 1)
      .replace('CON', 0)
      .replace(-1, 0)
)

In [None]:
# Create a new column, 'text', joining the topic and the claim with the literal ' [SEP] [CLS] ' marker.
# NOTE(review): the tokenizer is later called with add_special_tokens = True, so this marker
# presumably leaves an extra [CLS] in the middle of the sequence — confirm this is intended.
df['text'] = df['topicText'] + ' [SEP] [CLS] ' + df['claims.claimCorrectedText']

# DataFrame Separation

In this section, I select the relevant columns of the pandas dataframe: the topic and claim texts (str), the topic and claim sentiment labels (int, [-1,1]), the claim-target relation label (int, [-1,1]), the stance (str, ['PRO', 'CON']) and the split (str, ['train','test']).

The texts are the inputs of the evaluated models, the sentiment and relation labels are used to reproduce the formula of the referenced article, the stance is used to evaluate those models, and the split column is used to separate the model inputs into training and test sets.

In [None]:
# Ask interactively which train/validation split to use (1 = random split, 2 = the dataset's
# own 'split' column) and which method to run (1 = a single BERT on topic + claim,
# 2 = the step-by-step IBM method). The answers are parsed with int(), so a non-numeric
# answer raises ValueError.
split = int(input('Wich split: Random[1] or IBM[2]? '))
method = int(input('Wich method: Pure BERT[1] or IBM[2]? '))

In [None]:
# Choose the model inputs (`text`) and targets (`labels`) for the selected method / step.
# Each entry maps a step number (`etapa`) to (text column, label column):
#   0 - Pure BERT:        topic + claim text -> stance
#   1 - IBM, topic step:  topic text         -> topic sentiment
#   2 - IBM, claim step:  claim text         -> claim sentiment
#   3 - IBM, relation:    topic + claim text -> claim/target relation
columns_by_step = {
    0: ('text', 'claims.stance'),
    1: ('topicText', 'topicSentiment'),
    2: ('claims.claimCorrectedText', 'claims.claimSentiment'),
    3: ('text', 'claims.targetsRelation'),
}

if method == 1:
    etapa = 0
elif method == 2:
    etapa = int(input('Wich step are you in: topic[1], claim[2] or relation[3]? '))
    # Fail here instead of silently keeping `text` / `labels` from a previous run
    if etapa not in (1, 2, 3):
        raise ValueError('Unknown step {!r}: expected 1 (topic), 2 (claim) or 3 (relation)'.format(etapa))
else:
    raise ValueError('Unknown method {!r}: expected 1 (Pure BERT) or 2 (IBM)'.format(method))

text_column, label_column = columns_by_step[etapa]
text = df[text_column].values
labels = df[label_column].values

# Functions Definition

In [None]:
# Start with empty containers for the token ids and the attention masks of the encoded samples
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encode one text into fixed-length model inputs.

  Parameters:
    - input_text: the text to encode.
    - tokenizer: object exposing `encode_plus` (e.g. a transformers BertTokenizer).
    - max_length: number of tokens of every encoded sequence (default 32).

  Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).

  Shorter texts are padded up to `max_length` and longer ones are truncated down to it, so all
  encodings have the same length and can be concatenated into one tensor.
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `pad_to_max_length` is deprecated; this is its documented replacement
                        padding = 'max_length',
                        # must be requested explicitly, otherwise over-long texts are not guaranteed to be cut
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )

In [None]:
def b_tp(preds, labels):
  '''Returns True Positives (TP): number of samples predicted as class 1 whose label is also 1'''
  hits = 0
  for predicted, actual in zip(preds, labels):
    hits += (predicted == actual and predicted == 1)
  return hits

def b_fp(preds, labels):
  '''Returns False Positives (FP): number of samples predicted as class 1 whose label differs from the prediction'''
  misses = 0
  for predicted, actual in zip(preds, labels):
    misses += (predicted != actual and predicted == 1)
  return misses

def b_tn(preds, labels):
  '''Returns True Negatives (TN): number of samples predicted as class 0 whose label is also 0'''
  hits = 0
  for predicted, actual in zip(preds, labels):
    hits += (predicted == actual and predicted == 0)
  return hits

def b_fn(preds, labels):
  '''Returns False Negatives (FN): number of samples predicted as class 0 whose label differs from the prediction'''
  misses = 0
  for predicted, actual in zip(preds, labels):
    misses += (predicted != actual and predicted == 0)
  return misses

def b_metrics(preds, labels):
  '''
  Computes the binary classification metrics of one batch.

  `preds` holds one row of per-class scores per sample (the predicted class is the argmax
  over axis 1) and `labels` holds the true classes.

  Returns the tuple (accuracy, precision, recall, specificity), where:
    - accuracy    = (TP + TN) / N
    - precision   = TP / (TP + FP)
    - recall      = TP / (TP + FN)
    - specificity = TN / (TN + FP)
  A ratio whose denominator is zero is returned as the string 'nan'.
  '''
  predicted = np.argmax(preds, axis = 1).flatten()
  actual = labels.flatten()

  def ratio(numerator, denominator):
    # The string 'nan' marks an undefined ratio for the caller
    if denominator > 0:
      return numerator / denominator
    return 'nan'

  tp = b_tp(predicted, actual)
  tn = b_tn(predicted, actual)
  fp = b_fp(predicted, actual)
  fn = b_fn(predicted, actual)

  b_accuracy = (tp + tn) / len(actual)
  return b_accuracy, ratio(tp, tp + fp), ratio(tp, tp + fn), ratio(tn, tn + fp)

# Preparing Data for Training

In [None]:
# Load the pre-trained tokenizer; use 'bert-base-uncased' here for the smaller model
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case = True)

In [None]:
# Encode every text and concatenate the per-sample tensors along dim 0.
# The encodings are collected in fresh local lists, so this cell can be re-run: appending to
# `token_id` / `attention_masks` directly fails the second time, because both names are
# rebound to tensors below.
encoded_ids = []
encoded_masks = []
for sample in text:
  encoding_dict = preprocessing(sample, tokenizer)
  encoded_ids.append(encoding_dict['input_ids'])
  encoded_masks.append(encoding_dict['attention_mask'])

token_id = torch.cat(encoded_ids, dim = 0)
attention_masks = torch.cat(encoded_masks, dim = 0)
# Force int64 class indices: a float or object label column (possible after the dropna /
# replace steps) would otherwise give a tensor the classification loss cannot use.
labels = torch.as_tensor(np.asarray(labels).astype(np.int64))

In [None]:
val_ratio = 0.2
batch_size = 16

# Indices of the train and validation splits
if split == 1:
    # Random split, stratified by labels. Steps 2 and 3 of the IBM method reuse the split
    # drawn in an earlier pass so all steps are scored on the same examples; a new split is
    # drawn only for Pure BERT / step 1, or when this kernel holds no earlier split
    # (previously a NameError on a fresh kernel).
    if etapa in (0, 1) or 'train_idx' not in globals() or 'val_idx' not in globals():
        train_idx, val_idx = train_test_split(
            np.arange(len(labels)),
            test_size = val_ratio,
            shuffle = True,
            stratify = labels)
elif split == 2:
    # Split shipped with the dataset: train on the 'train' rows and validate on the 'test'
    # rows (validation used to be built from the 'train' rows as well).
    train_idx = np.array(df.index[df['split'] == 'train'].tolist())
    val_idx = np.array(df.index[df['split'] == 'test'].tolist())
else:
    raise ValueError('Unknown split {!r}: expected 1 (Random) or 2 (IBM)'.format(split))

# Train and validation sets: (token ids, attention mask, label) per example
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Prepare DataLoader: training batches are drawn in random order, validation batches in order
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

In [None]:
# Pre-trained BERT with a 2-class sequence-classification head; attention maps and hidden
# states are not returned. Use 'bert-base-uncased' here for the smaller model.
model = BertForSequenceClassification.from_pretrained(
    'bert-large-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# AdamW over all model parameters; other learning rates considered: 3e-5, 2e-5
optimizer = torch.optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-08)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The training, validation and inference code moves every input tensor to `device`, so the
# model's weights must be there too; without this the forward pass fails on a GPU machine
# because inputs and weights sit on different devices. Module.to() moves the parameters in
# place, so the optimizer created earlier keeps referring to them.
model.to(device)

# Number of passes over the training set
epochs = 4

# Model Running and Evaluation

In [None]:
for _ in trange(epochs, desc = 'Epoch'):

    # ---------- Training phase ----------

    # Put the model in training mode
    model.train()

    # Running totals for this epoch
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        # A batch is (input ids, attention mask, labels); move all three to the target device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass (labels are passed so that the output carries the loss)
        train_output = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels)

        # Backward pass followed by the parameter update
        train_output.loss.backward()
        optimizer.step()

        # Update the running totals
        tr_loss += train_output.loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # ---------- Validation phase ----------

    # Put the model in evaluation mode
    model.eval()

    # Per-batch metric values collected over the whole validation set
    val_accuracy, val_precision, val_recall, val_specificity = [], [], [], []

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass without gradient tracking
        with torch.no_grad():
          eval_output = model(b_input_ids,
                              token_type_ids = None,
                              attention_mask = b_input_mask)

        logits = eval_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Metrics of this batch
        b_accuracy, b_precision, b_recall, b_specificity = b_metrics(logits, label_ids)
        val_accuracy.append(b_accuracy)

        # b_metrics reports an undefined ratio (zero denominator) as 'nan'; such batches are skipped
        for batch_value, epoch_values in ((b_precision, val_precision),
                                          (b_recall, val_recall),
                                          (b_specificity, val_specificity)):
            if batch_value != 'nan':
                epoch_values.append(batch_value)

    # Epoch summary: mean training loss and mean of the per-batch validation metrics
    print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
    print('\t - Validation Accuracy: {:.4f}'.format(sum(val_accuracy)/len(val_accuracy)))

    if len(val_precision) > 0:
        print('\t - Validation Precision: {:.4f}'.format(sum(val_precision)/len(val_precision)))
    else:
        print('\t - Validation Precision: NaN')

    if len(val_recall) > 0:
        print('\t - Validation Recall: {:.4f}'.format(sum(val_recall)/len(val_recall)))
    else:
        print('\t - Validation Recall: NaN')

    if len(val_specificity) > 0:
        print('\t - Validation Specificity: {:.4f}\n'.format(sum(val_specificity)/len(val_specificity)))
    else:
        print('\t - Validation Specificity: NaN')

# Model Evaluation

In [None]:
# Select which text column of the validation rows the predictions are made on
text_column_by_step = {
    0: 'text',
    3: 'text',
    1: 'topicText',
    2: 'claims.claimCorrectedText',
}
if etapa in text_column_by_step:
    frases = df[text_column_by_step[etapa]][val_idx].values.tolist()

In [None]:
# Predict a class (0 or 1) for every validation sentence, one sentence at a time.

# Set evaluation mode explicitly instead of relying on the state left behind by the
# training cell, so the predictions do not depend on which cell ran last.
model.eval()

prediction_list = []
for new_sentence in frases:
    # Token IDs and Attention Mask of the sentence (the encoding already carries a batch dimension)
    encoding = preprocessing(new_sentence, tokenizer)
    test_ids = encoding['input_ids']
    test_attention_mask = encoding['attention_mask']

    # Forward pass without gradient tracking, calculate logit predictions
    with torch.no_grad():
      output = model(test_ids.to(device), token_type_ids = None, attention_mask = test_attention_mask.to(device))

    # Index of the highest logit = predicted class
    prediction = int(np.argmax(output.logits.cpu().numpy()))

    prediction_list.append(prediction)

In [None]:
# Slots: [0] topic-step predictions, [1] claim-step predictions, [2] relation-step (or Pure
# BERT) predictions. The IBM formula needs all three slots, each filled by a separate pass
# through the notebook in the same kernel, so the container is created only once;
# re-running this cell no longer wipes earlier steps. Run `del predictions_list` to start over.
if 'predictions_list' not in globals():
    predictions_list = [[],[],[]]

In [None]:
# Store this run's predictions in the slot of the current step:
# topic step -> slot 0, claim step -> slot 1, relation step and Pure BERT -> slot 2
slot_by_step = {0: 2, 3: 2, 1: 0, 2: 1}
if etapa in slot_by_step:
    predictions_list[slot_by_step[etapa]] = prediction_list

In [None]:
def print_stance_metrics(predictions):
    '''
    Print accuracy, precision, recall and specificity of `predictions` against the
    'claims.stance' values of the validation rows.

    Specificity is computed as the recall of class 0.
    '''
    stance_true = df['claims.stance'][val_idx]
    print('Accuracy: ', accuracy_score(stance_true, predictions))
    print('Precision: ', precision_score(stance_true, predictions))
    print('Recall: ', recall_score(stance_true, predictions))
    print('Specificity: ', recall_score(stance_true, predictions, pos_label = 0))


# Pure BERT results live in slot 2; IBM step k (1 = topic, 2 = claim, 3 = relation) in slot k - 1.
# NOTE(review): the IBM steps predict sentiment / relation but are scored against the stance
# column here, exactly as before — confirm that this is the intended reference.
if method == 1:
    print_stance_metrics(predictions_list[2])
elif method == 2 and etapa in (1, 2, 3):
    print_stance_metrics(predictions_list[etapa - 1])

In [None]:
# Combine the three step predictions into a stance with the product formula of the IBM method:
#   stance = topic sentiment * claim sentiment * claim/target relation
# The formula is defined on the original -1 / +1 labels, but the models predict classes 0 / 1
# (-1 was recoded to 0 during data preparation). Multiplying 0/1 values directly would yield 1
# only when all three are 1, so each prediction is mapped back to -1 / +1 (2 * p - 1), the
# signs are multiplied, and the product is converted to the 0 / 1 stance encoding.
# NOTE(review): confirm the formula against the referenced article.
topic_preds, claim_preds, relation_preds = predictions_list
if not (len(topic_preds) == len(claim_preds) == len(relation_preds)) or len(topic_preds) == 0:
    raise ValueError('The topic, claim and relation steps must all be run (on the same validation set) before combining them')

resul_ibm_method = []
for topic_pred, claim_pred, relation_pred in zip(topic_preds, claim_preds, relation_preds):
    signed_product = (2 * topic_pred - 1) * (2 * claim_pred - 1) * (2 * relation_pred - 1)
    resul_ibm_method.append(1 if signed_product > 0 else 0)

print('Accuracy: ', accuracy_score(df['claims.stance'][val_idx], resul_ibm_method))
print('Precision: ', precision_score(df['claims.stance'][val_idx], resul_ibm_method))
print('Recall: ', recall_score(df['claims.stance'][val_idx], resul_ibm_method))
print('Specificity: ',recall_score(df['claims.stance'][val_idx], resul_ibm_method, pos_label = 0))

# Saving Model

In [None]:
# save_pretrained() requires the target directory as its first argument. The method and step
# are part of the directory name so the models of the different steps do not overwrite each other.
# NOTE(review): adjust the location as needed.
output_dir = './bert_stance_method{}_step{}'.format(method, etapa)
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

# Benchmark - IBM API

In [None]:
# Read the IBM Debater API key from the DEBATER_API_KEY environment variable so the secret is
# never stored in the notebook; falls back to the empty string when the variable is not set.
api_key = os.environ.get('DEBATER_API_KEY', '')

In [None]:
# Announce which validation split the benchmark runs on
if split == 1:
    print('Benchmark for random spit')
elif split == 2:
    print('Benchmark for original spit')

# Validation rows only; topic and claim are renumbered from 0
df_ibm_topic = df['topicTarget'][val_idx].reset_index(drop = True)
df_ibm_claim = df['claims.claimCorrectedText'][val_idx].reset_index(drop = True)
df_ibm_target = df['claims.stance'][val_idx]

# One {'topic': ..., 'sentence': ...} record per validation example, in validation order
sentence_topic_dicts = [
    {'topic': topic, 'sentence': sentence}
    for topic, sentence in zip(df_ibm_topic, df_ibm_claim)
]

In [None]:
from debater_python_api.api.debater_api import DebaterApi

# Score every (topic, sentence) pair with the pro/con service of the IBM Debater API
debater_api = DebaterApi(api_key)
pro_con_client = debater_api.get_pro_con_client()

scores = pro_con_client.run(sentence_topic_dicts)

# Positive score -> PRO (1), anything else -> CON (0). Exactly one label is produced per
# example: a score of exactly 0 used to be dropped, which left resp_ibm shorter than the
# validation labels and broke the metric computation.
# NOTE(review): counting a zero score as CON is an arbitrary tie-break — confirm.
resp_ibm = [1 if scores[j] > 0 else 0 for j in range(len(sentence_topic_dicts))]

In [None]:
# Score the IBM API answers against the validation stances; specificity is the recall of class 0
stance_true = df['claims.stance'][val_idx]
print('Accuracy: ', accuracy_score(stance_true, resp_ibm))
print('Precision: ', precision_score(stance_true, resp_ibm))
print('Recall: ', recall_score(stance_true, resp_ibm))
print('Specificity: ', recall_score(stance_true, resp_ibm, pos_label = 0))