# Performing Zero-shot Evaluation of a Statement Tuned Model

Import the necessary libraries

In [1]:
!pip install --q datasets evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/542.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K    

In [2]:
from datasets import load_dataset, get_dataset_config_names, Dataset
import random
import numpy as np
import torch
import torch.nn.functional as F
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast
from torch.utils.data import DataLoader
import pandas as pd
from datasets import Dataset
from sklearn.utils import resample
import evaluate
from sklearn.utils import resample
from copy import copy
from torch.utils.data import DataLoader
from itertools import chain

Setting the random seed for reproducibility

In [3]:
SEED = 42
NUM_PROC=5
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Helper function to assist with creating prompts

In [4]:
def fill_template(templates, values):
    temp = random.sample(templates,1)[0]
    for i in range(len(values)):
        #print(f"i: {i}, values: {values[i]}")
        temp = temp.replace("${"+str(i+1)+"}", values[i])
    return temp

Creating two statements to choose from for each data example. One has choice1 the other has choice2. The idea is that the statement tuned model is expected to give the correct choice the higher score.

In [7]:
def generate_eval(dataset_name, templates, split, label_column, question, choices):
  """
    - dataset_name: Path name to HuggingFace repo
    - templates: 2D list of templates
    - split: list of dataset split we are going to use, i.e. ['split']
    - question: column label
    - choices: optional choices (choice1 vs choice2 that we embed)
  """
  langs = get_dataset_config_names(dataset_name)
  # sanity check: remove any langs that have more than 2 letters (should only be 2 letter code)
  langs = [lang for lang in langs if len(lang) == 2]

  data = {}
  for lang in langs:
    data[lang] = load_dataset(dataset_name, lang, split=split)

  col_names = copy(data[langs[0]][0].column_names)
  col_names.remove(label_column)

  def create_statements_labels(example):
    template = templates

    # choose from the templates given
    temp = random.choice(template)
    # for the chosen template fill
    for i in range(len(choices)):
      # example['statement1'] = fill_template([temp], [example[question], example[choices[0]]])
      example[f'statement{i+1}'] = fill_template([temp], [example[question], example[choices[i]]])
    return example

  resulting_statements = {}
  for lang in langs:
    resulting_statements[lang] = [split.map(create_statements_labels, remove_columns=col_names, num_proc=NUM_PROC) for split in data[lang]][0]

  return resulting_statements

# XCOPA

https://huggingface.co/datasets/cambridgeltl/xcopa

In [10]:
# XCOPA
dataset = "xcopa"
templates = [["The cause of \"${1}\" is that \"${2}\"", "\"${1}\" because \"${2}\"", "\"${1}\" due to \"${2}\""], ["The effect of \"${1}\" is that \"${2}\"", "\"${1}\" therefore \"${2}\"", "\"${1}\", so \"${2}\""]]
split = ['test']
label_column = 'label'
question = 'premise'
choices = ['choice1', 'choice2']
category = ['cause', 'effect']
qtype='question'

xcopa_statements = generate_eval(dataset, templates, split, label_column, question, choices, category, qtype)

  self.pid = os.fork()


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
print(xcopa_statements['et'][0])

{'label': 0, 'statement1': 'The cause of "Ese oli mullikilesse mässitud." is that "See oli õrn."', 'statement2': 'The cause of "Ese oli mullikilesse mässitud." is that "See oli väike."'}


In [12]:
# XNLI
dataset = "facebook/xnli"

# templates: 'Entailment', 'Neutral', 'Contradiction'
templates = [["\"${1}\" entails \"${2}\"", "\"${1}\"? yes, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Entailment"],
             ["\"${1}\" is neutral with regards to \"${2}\"", "\${1}\? maybe, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Neutral"],
            ["\"${1}\" contradicts \"${2}\"", "\"${1}\"? no, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Contradiction"]]

split = ['test']
label_column = 'label'
question = 'premise'
choices = ['hypothesis']
# 0: Entailment, 1: Neutral, 2: Contradiction
category = [0, 1, 2]

xnli_statements = generate_eval(dataset, templates, split, label_column, question, choices, category)

Downloading readme:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/392k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/194k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/447k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/223k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/356k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/73.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/490k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/50.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/53.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/342k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/173k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/360k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/183k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/70.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/70.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/477k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/76.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/503k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/252k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/48.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/338k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/172k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/428k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/216k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/57.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/364k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/186k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/47.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5010 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

In [None]:
print(xnli_statements['es'][0])

In [8]:
# XWinograd
dataset = "Muennighoff/xwinograd"
templates = ["In \"${1}\", _ is: \"${2}\"", "Q:\"${1}\", A: \"${2}\"", "The missing word in \"${1}\" is \"${2}\"", "_ in: \"${1}\" is \"${2}\"", "\"${1}\", _ is: \"${2}\""]
split = ['test']
label_column = 'answer'
question = 'sentence'
choices = ['option1', 'option2']

#xwinograd_statements = generate_eval(dataset, templates, split, label_column, question, choices)



  self.pid = os.fork()


Map (num_proc=5):   0%|          | 0/2325 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/83 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/959 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/263 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/315 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/504 [00:00<?, ? examples/s]

In [9]:
print(xwinograd_statements['en'][0])

{'answer': '2', 'statement1': '_ in: "The city councilmen refused the demonstrators a permit because _ feared violence." is "the demonstrators"', 'statement2': '_ in: "The city councilmen refused the demonstrators a permit because _ feared violence." is "The city councilmen"'}


In [None]:
# XStoryCloze

dataset = "juletxara/xstory_cloze"
templates = ["\"${1}\" \"${2}\" \"${3}\" \"${4}\" \"${5}\""]
split = ['eval']
label_column = 'answer_right_ending'
input_sentences=['input_sentence_1', 'input_sentence_2', 'input_sentence_3', 'input_sentence_4']
choices=['sentence_quiz1', 'sentence_quiz2']

xstorycloze = generate_eval(dataset, templates, split, label_column, question, choices)

In [None]:
dataset = "xcopa"
templates = [["The cause of \"${1}\" is that \"${2}\"", "\"${1}\" because \"${2}\"", "\"${1}\" due to \"${2}\""], ["The effect of \"${1}\" is that \"${2}\"", "\"${1}\" therefore \"${2}\"", "\"${1}\", so \"${2}\""]]
split = ['test']
label_column = 'label'
question = 'premise'
choices = ['choice1', 'choice2']
langs = get_dataset_config_names(dataset)[:9]


data = {}
for lang in langs:
    data[lang] = load_dataset(dataset, lang, split=split)
col_names = copy(data[langs[0]][0].column_names)
col_names.remove(label_column)

def create_statements_labels_copa(example):
    # choose the cause template (set of 3), or the effect template (set of 3)
    template = templates[0] if example['question'] == 'cause' else templates[1]
    # choose from the 3 templates given
    temp = random.choice(template)
    # for the chosen template fill
    example['statement1'] = fill_template([temp], [example[question], example[choices[0]]])
    example['statement2'] = fill_template([temp], [example[question], example[choices[1]]])
    return example

xcopa_statements = {}
for lang in langs:
    xcopa_statements[lang] = [split.map(create_statements_labels_copa, remove_columns=col_names, num_proc=NUM_PROC) for split in data[lang]][0]

  self.pid = os.fork()


Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/500 [00:00<?, ? examples/s]

You can view what happens when we convert the problem into statements here:

In [None]:
xcopa_statements['et'][0]

{'label': [0, 0, 1, 0, 0],
 'statement1': ['"Ese oli mullikilesse mässitud." because "See oli õrn."',
  '"Ma tühjendasin oma taskud.", so "Ma leidsin pileti tüki."',
  '"Termiidid tungisid majja sisse." therefore "Termiidid kadusid majast."',
  '"Reisidjad jõudsid piirini.", so "Piirikontroll kontrollis nende passe."',
  'The cause of "Kontor oli kinni." is that "Oli puhkus."'],
 'statement2': ['"Ese oli mullikilesse mässitud." because "See oli väike."',
  '"Ma tühjendasin oma taskud.", so "Ma leidsin relva."',
  '"Termiidid tungisid majja sisse." therefore "Termiidid sõid läbi majas oleva puidu."',
  '"Reisidjad jõudsid piirini.", so "Piirikontroll süüdistas neid smuugeldamises."',
  'The cause of "Kontor oli kinni." is that "Oli suvi."']}

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In this case we use the same tokenizer as roberta-base

In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

English only statement tuned model

In [None]:
model_name = "ashabrawy/ST-roberta-base"

In [None]:
model = RobertaForSequenceClassification.from_pretrained(model_name, cache_dir="/scratch/afz225/.cache").eval().to(device)

config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

We create dataloaders for each language to be able to load the data in batches for evaluation

In [None]:
xcopa_dataloaders = {}
for lang in langs:
    xcopa_dataloaders[lang] = DataLoader(xcopa_statements[lang], batch_size=32, shuffle=False)

In [None]:
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

The following cell runs both statements/choices through the statement tuned model. We get the logits for each statement, combine them then use argmax to find the choice that gives the higher probability. This is our prediction. We use these predictions with the correct labels to calculate the prediction accuracy.

In [None]:
from tqdm import tqdm

lang_accuracies = {}

for lang in langs:
    print(f"Processing {lang}...")
    predictions = []
    actual_labels = []
    for batch in tqdm(xcopa_dataloaders[lang]):
        tok1 = tokenizer(batch['statement1'], return_tensors='pt', padding=True).to(device)
        tok2 = tokenizer(batch['statement2'], return_tensors='pt', padding=True).to(device)
        labels = batch['label']
        prob1 = F.softmax(model(input_ids=tok1['input_ids'], attention_mask=tok1['attention_mask']).logits, dim=-1)[:,1]
        prob2 = F.softmax(model(input_ids=tok2['input_ids'], attention_mask=tok2['attention_mask']).logits, dim=-1)[:,1]
        preds = torch.argmax(torch.stack([prob1, prob2],dim=-1),dim=-1)
        predictions.extend(preds.cpu().tolist())
        actual_labels.extend(labels.cpu().tolist())

    lang_accuracies[lang] = clf_metrics.compute(predictions=predictions, references=actual_labels)['accuracy']

Processing et...


100%|██████████| 16/16 [02:50<00:00, 10.63s/it]


Processing ht...


100%|██████████| 16/16 [02:43<00:00, 10.23s/it]


Processing id...


100%|██████████| 16/16 [03:38<00:00, 13.67s/it]


Processing it...


100%|██████████| 16/16 [02:54<00:00, 10.89s/it]


Processing qu...


100%|██████████| 16/16 [04:37<00:00, 17.36s/it]


Processing sw...


100%|██████████| 16/16 [03:19<00:00, 12.46s/it]


Processing ta...


  0%|          | 0/16 [00:00<?, ?it/s]

Low scores for lang_accuracies expected because our model is trained only on English!

In [None]:
lang_accuracies

# XNLI

https://huggingface.co/datasets/facebook/xnli

In [12]:
dataset = "facebook/xnli"

# templates: 'Entailment', 'Neutral', 'Contradiction'
templates = [["\"${1}\" entails \"${2}\"", "\"${1}\"? yes, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Entailment"],
             ["\"${1}\" is neutral with regards to \"${2}\"", "\${1}\? maybe, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Neutral"],
            ["\"${1}\" contradicts \"${2}\"", "\"${1}\"? no, \"${2}\"", "Premise: \"${1}\", Hypothesis: \"${2}\", label: Contradiction"]]

split = ['test']
label_column = 'label'
question = 'premise'
choices = ['hypothesis']
langs = get_dataset_config_names(dataset)[:9]
langs.remove('all_languages')
print(langs)

data = {}
for lang in langs:
    data[lang] = load_dataset(dataset, lang, split=split)

col_names = copy(data[langs[0]][0].column_names)
col_names.remove(label_column)

def create_statements_labels_xnli(example):
    # right answer
    # 0 if entailment, 1 if netural, 2 if contradiction
    template = templates[example['label']]
    # choose a random template from the 3 options we have
    temp = random.choice(template)
    statements = []
    right_answer = fill_template([temp], [example['premise'], example['hypothesis']])
    statements.append(right_answer)

    # wrong answers
    other_indices = [i for i in range(3) if i != example['label']]
    for other_idx in other_indices:
      temp = random.choice(templates[other_idx])
      statements.append(fill_template([temp], [example['premise'], example['hypothesis']]))

    random.shuffle(statements)

    right_label = 0
    for i in range(len(statements)):
      example[f'statement{i+1}'] = statements[i]
      if statements[i] == right_answer:
        right_label = i

    example['label'] = right_label

    return example

xnli_statements = {}
for lang in langs:
    # for each row in dataset, map the xnli function
    xnli_statements[lang] = [split.map(create_statements_labels_xnli, remove_columns=col_names, num_proc=NUM_PROC) for split in data[lang]][0]

['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi']


  self.pid = os.fork()


Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/5010 [00:00<?, ? examples/s]

In [13]:
print(xnli_statements['en'][0])

{'label': 1, 'statement1': '\\Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again.\\? maybe, "I havent spoken to him again."', 'statement2': 'Premise: "Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again.", Hypothesis: "I havent spoken to him again.", label: Contradiction', 'statement3': '"Well, I wasn\'t even thinking about that, but I was so frustrated, and, I ended up talking to him again." entails "I havent spoken to him again."'}


In [None]:
xnli_dataloaders = {}
for lang in langs:
    xnli_dataloaders[lang] = DataLoader(xnli_statements[lang], batch_size=32, shuffle=False)

In [None]:
print(xnli_statements['es'][0])

{'label': 2, 'statement': 'Premise: "Bien, ni estaba pensando en eso, pero estaba tan frustrada y empecé a hablar con él de nuevo.", Hypothesis: "No he vuelto a hablar con él.", label: Contradiction'}


In [None]:
from tqdm import tqdm

lang_accuracies = {}
for lang in langs:
    print(f"Processing {lang}...")
    predictions = []
    actual_labels = []
    for batch in tqdm(xcopa_dataloaders[lang]):
        tok1 = tokenizer(batch['statement'], return_tensors='pt', padding=True).to(device)
        #tok2 = tokenizer(batch['statement2'], return_tensors='pt', padding=True).to(device)
        labels = batch['label']
        prob1 = F.softmax(model(input_ids=tok1['input_ids'], attention_mask=tok1['attention_mask']).logits, dim=-1)[:,1]
        #prob2 = F.softmax(model(input_ids=tok2['input_ids'], attention_mask=tok2['attention_mask']).logits, dim=-1)[:,1]
        preds = torch.argmax(torch.stack([prob1],dim=-1),dim=-1)
        predictions.extend(preds.cpu().tolist())
        actual_labels.extend(labels.cpu().tolist())
    lang_accuracies[lang] = clf_metrics.compute(predictions=predictions, references=actual_labels)['accuracy']

# XWinograd

https://huggingface.co/datasets/Muennighoff/xwinograd

In [None]:
dataset = "Muennighoff/xwinograd"

templates = ["In \"${1}\", _ is: \"${2}\"", "Q:\"${1}\", A: \"${2}\"", "The missing word in \"${1}\" is \"${2}\"", "_ in: \"${1}\" is \"${2}\"", "\"${1}\", _ is: \"${2}\""]

split = ['test']
label_column = 'answer'
question = 'sentence'
choices = ['option1', 'option2']
langs = get_dataset_config_names(dataset)[:9]

data = {}
for lang in langs:
    data[lang] = load_dataset(dataset, lang, split=split)

col_names = copy(data[langs[0]][0].column_names)
col_names.remove(label_column)

def create_statements_labels_xwinograd(example):
    # 0 if entailment, 1 if netural, 2 if contradiction
    #template = templates[example['label']]
    # choose a random template from the 3 options we have
    temp = random.choice(templates) # 1d flat list for XWinograd
    # fill template
    example['statement1'] = fill_template([temp], [example[question], example[choices[0]]])
    example['statement2'] = fill_template([temp], [example[question], example[choices[1]]])
    return example

xwinograd_statements = {}
for lang in langs:
    # for each row in dataset, map the xnli function
    xwinograd_statements[lang] = [split.map(create_statements_labels_xwinograd, remove_columns=col_names, num_proc=NUM_PROC) for split in data[lang]][0]

Map (num_proc=5):   0%|          | 0/315 [00:00<?, ? examples/s]

In [None]:
print(xwinograd_statements['fr'][0])

{'answer': '1', 'statement1': '"La coupe n\'entre pas dans la valise marron, car _ est trop grande.", _ is: "La coupe"', 'statement2': '"La coupe n\'entre pas dans la valise marron, car _ est trop grande.", _ is: "la valise"'}


# XStoryCloze

In [None]:
dataset = "juletxara/xstory_cloze"

templates = ["\"${1}\" \"${2}\" \"${3}\" \"${4}\" \"${5}\""]

split = ['eval']
label_column = 'answer_right_ending'
input_sentences=['input_sentence_1', 'input_sentence_2', 'input_sentence_3', 'input_sentence_4']
choices=['sentence_quiz1', 'sentence_quiz2']
langs = get_dataset_config_names(dataset)[:9]

data = {}
for lang in langs:
    data[lang] = load_dataset(dataset, lang, split=split)

col_names = copy(data[langs[0]][0].column_names)
col_names.remove(label_column)

def create_statements_labels_xstorycloze(example):
    # choose a random template from the 3 options we have
    temp = random.choice(templates) # 1d flat list for XWinograd
    # fill template, in xstory_cloze, we're choosing which sentence quiz we should put in, and 4 input sentences must all go in
    example['statement1'] = fill_template([temp], [example[input_sentences[0]], example[input_sentences[1]], example[input_sentences[2]], example[input_sentences[3]], example[choices[0]]])
    example['statement2'] = fill_template([temp], [example[input_sentences[0]], example[input_sentences[1]], example[input_sentences[2]], example[input_sentences[3]], example[choices[1]]])
    return example

xstorycloze_statements = {}
for lang in langs:
    # for each row in dataset, map the xnli function
    xstorycloze_statements[lang] = [split.map(create_statements_labels_xstorycloze, remove_columns=col_names, num_proc=NUM_PROC) for split in data[lang]][0]

In [None]:
print(xstorycloze_statements['es'][0])

{'answer_right_ending': 2, 'statement1': '"Me volví fan de Ley y Orden en 2011." "Me estaba recuperando de un ataque cerebral." "Cuando volví a casa, intenté ver todos los episodios." "Me costó ver del tirón una serie que lleva 20 años." "Creo que Ley y Orden es una de las peores series que se han hecho."', 'statement2': '"Me volví fan de Ley y Orden en 2011." "Me estaba recuperando de un ataque cerebral." "Cuando volví a casa, intenté ver todos los episodios." "Me costó ver del tirón una serie que lleva 20 años." "Al final, los vi todos."'}
