# Performing Zero-shot Evaluation of a Statement Tuned Model

Import the necessary libraries

In [60]:
from datasets import load_dataset, get_dataset_config_names, Dataset
import random
import numpy as np
import torch
import torch.nn.functional as F
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset
from sklearn.utils import resample
import evaluate
from sklearn.utils import resample
from copy import copy
from torch.utils.data import DataLoader

Setting the random seed for reproducibility

In [49]:
# Reproducibility and parallelism settings for the whole notebook.
SEED = 42      # single seed shared by every random number generator below
NUM_PROC = 5   # worker-process count handed to Dataset.map via num_proc

# Seed Python's `random`, NumPy, and PyTorch (default generator and all CUDA
# devices) with the same value, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

Helper function to assist with creating prompts

In [50]:
def fill_template(templates, values):
    """Pick one template at random and fill in its ``${N}`` placeholders.

    Args:
        templates: Sequence of template strings. Placeholders are written
            ``${1}``, ``${2}``, ... (1-based).
        values: Sequence of strings; ``values[i]`` replaces every occurrence
            of ``${i+1}`` in the chosen template.

    Returns:
        The chosen template with each placeholder that has a corresponding
        entry in ``values`` substituted. Placeholders with no corresponding
        value are left untouched.

    Raises:
        ValueError: If ``templates`` is empty (raised by ``random.sample``).
    """
    import re  # local import: the notebook's import cell does not import ``re``

    # Same random draw as before, so seeded runs pick the same templates.
    chosen = random.sample(templates, 1)[0]

    def _substitute(match):
        index = int(match.group(1)) - 1
        if index < len(values):
            return values[index]
        # No value supplied for this placeholder: keep it verbatim.
        return match.group(0)

    # Substitute in ONE pass over the template. Replacing placeholders one after
    # another re-scans text that was just inserted, so a value that happens to
    # contain "${2}" would itself be overwritten by the second value.
    return re.sub(r"\$\{([1-9][0-9]*)\}", _substitute, chosen)

Create two statements for each data example: one built from `choice1` and the other from `choice2`. The idea is that the statement-tuned model is expected to give the higher score to the statement containing the correct choice.

In [51]:
# --- XCOPA configuration ---
dataset = "xcopa"

# templates[0] is used for 'cause' questions, templates[1] for all others.
# ${1} and ${2} are the placeholders filled in by fill_template.
templates = [
    [
        'The cause of "${1}" is that "${2}"',
        '"${1}" because "${2}"',
        '"${1}" due to "${2}"',
    ],
    [
        'The effect of "${1}" is that "${2}"',
        '"${1}" therefore "${2}"',
        '"${1}", so "${2}"',
    ],
]
split = ["test"]
label_column = "label"
question = "premise"
choices = ["choice1", "choice2"]

# Keep only the first nine configurations listed for the dataset.
langs = get_dataset_config_names(dataset)[:9]

# `split` is a list, so every data[lang] must be indexed (e.g. [0]) to reach a dataset.
data = {lang: load_dataset(dataset, lang, split=split) for lang in langs}

# Every column except the label; these are dropped once the statements are built.
col_names = list(data[langs[0]][0].column_names)
col_names.remove(label_column)
def create_statements_labels_copa(example):
    """Add ``statement1`` and ``statement2`` to a single example.

    The template group is selected from the example's ``'question'`` field
    (``templates[0]`` when it equals ``'cause'``, ``templates[1]`` otherwise).
    One template is drawn at random and used for both statements, so the two
    differ only in which choice is inserted.

    Args:
        example: Mapping holding the ``'question'`` field plus the fields
            named by the module-level ``question`` and ``choices``.

    Returns:
        The same ``example`` object, updated in place with the two new keys.
    """
    is_cause = example['question'] == 'cause'
    candidates = templates[0] if is_cause else templates[1]
    # Draw once so both statements share identical wording.
    chosen = random.choice(candidates)
    premise = example[question]
    first_key, second_key = choices[0], choices[1]
    example['statement1'] = fill_template([chosen], [premise, example[first_key]])
    example['statement2'] = fill_template([chosen], [premise, example[second_key]])
    return example

# Convert every language's loaded split into statement form.
# data[lang] is a list with one dataset per requested split; only the first is
# used, so map that one directly. Mapping every split and then discarding all
# but the first did redundant work and reused the name `split` as a loop variable.
xcopa_statements = {}
for lang in langs:
    xcopa_statements[lang] = data[lang][0].map(
        create_statements_labels_copa,
        remove_columns=col_names,  # drop the original columns, keeping the label
        num_proc=NUM_PROC,
    )

You can view what happens when we convert the problem into statements here:

In [52]:
# Preview the first five converted rows of the 'et' configuration.
xcopa_statements['et'][:5]

{'label': [0, 0, 1, 0, 0],
 'statement1': ['The cause of "Ese oli mullikilesse mässitud." is that "See oli õrn."',
  'The effect of "Ma tühjendasin oma taskud." is that "Ma leidsin pileti tüki."',
  '"Termiidid tungisid majja sisse." therefore "Termiidid kadusid majast."',
  '"Reisidjad jõudsid piirini." therefore "Piirikontroll kontrollis nende passe."',
  '"Kontor oli kinni." because "Oli puhkus."'],
 'statement2': ['The cause of "Ese oli mullikilesse mässitud." is that "See oli väike."',
  'The effect of "Ma tühjendasin oma taskud." is that "Ma leidsin relva."',
  '"Termiidid tungisid majja sisse." therefore "Termiidid sõid läbi majas oleva puidu."',
  '"Reisidjad jõudsid piirini." therefore "Piirikontroll süüdistas neid smuugeldamises."',
  '"Kontor oli kinni." because "Oli suvi."']}

In [53]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In this case we use the same tokenizer as roberta-base

In [54]:
# Load the fast RoBERTa tokenizer from the 'roberta-base' checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

English-only statement-tuned model

In [55]:
# Checkpoint identifier passed to from_pretrained when the model is loaded.
model_name = "ashabrawy/ST-roberta-base"

In [56]:
# Where to cache the downloaded checkpoint. None selects the library's default
# cache location, so the notebook no longer depends on a user-specific absolute
# path that does not exist on other machines. Set this to a directory to override.
CACHE_DIR = None

# Load the checkpoint, put it in evaluation mode, and move it to `device`.
model = RobertaForSequenceClassification.from_pretrained(model_name, cache_dir=CACHE_DIR).eval().to(device)

We create dataloaders for each language to be able to load the data in batches for evaluation

In [58]:
# One DataLoader per language: batches of 32, no shuffling so the original
# example order is kept.
xcopa_dataloaders = {
    lang: DataLoader(xcopa_statements[lang], batch_size=32, shuffle=False)
    for lang in langs
}

In [61]:
# Bundle four classification metrics into one object; only 'accuracy' is read
# from its result in the evaluation loop.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

The following cell runs both statements (one per choice) through the statement-tuned model. For each statement we take the softmax probability of class 1 from the logits, stack the two probabilities, and use argmax to find the choice whose statement gets the higher probability. This is our prediction. We then compare these predictions with the correct labels to calculate the prediction accuracy.

In [64]:
def _class1_probability(statements):
    """Return the model's softmax probability of class index 1 for each statement.

    Args:
        statements: Batch (list) of statement strings.

    Returns:
        1-D tensor on `device` with one probability per statement.
    """
    encoded = tokenizer(statements, return_tensors='pt', padding=True).to(device)
    logits = model(input_ids=encoded['input_ids'], attention_mask=encoded['attention_mask']).logits
    # NOTE(review): class index 1 is treated as "statement is true" - confirm
    # against the checkpoint's label mapping.
    return F.softmax(logits, dim=-1)[:, 1]


lang_accuracies = {}
for lang in langs:
    predictions = []
    actual_labels = []
    # Evaluation only: without no_grad every forward pass would record an
    # autograd graph that is never used, wasting memory and time.
    with torch.no_grad():
        for batch in xcopa_dataloaders[lang]:
            prob1 = _class1_probability(batch['statement1'])
            prob2 = _class1_probability(batch['statement2'])
            # Stack to shape (batch, 2); the argmax index (0 or 1) says which
            # statement scored higher and is compared against the label.
            preds = torch.argmax(torch.stack([prob1, prob2], dim=-1), dim=-1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(batch['label'].cpu().tolist())
    lang_accuracies[lang] = clf_metrics.compute(predictions=predictions, references=actual_labels)['accuracy']

Low scores in `lang_accuracies` are expected because our model was trained only on English! The accuracies below are all close to 0.5, which is chance level for a two-choice task.

In [66]:
# Display the accuracy obtained for each language.
lang_accuracies

{'et': 0.484,
 'ht': 0.526,
 'id': 0.498,
 'it': 0.524,
 'qu': 0.53,
 'sw': 0.468,
 'ta': 0.47,
 'th': 0.478,
 'tr': 0.51}