In [1]:
from torch.nn.functional import softmax
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch
import pickle
import tqdm

In [3]:
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "alan-turing-institute/mt5-large-finetuned-mnli-xtreme-xnli"

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(
    model_name,
    load_in_4bit=True,  # 4-bit quantised weights
    device_map=0,       # presumably places the whole model on GPU 0 — TODO confirm
)
# Switch to inference mode (disables dropout); as the last expression this
# also displays the module tree.
model.eval()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MT5Tokenizer'. 
The class this function is called from is 'T5Tokenizer'.
Some weights of MT5ForConditionalGeneration were not initialized from the model checkpoint at alan-turing-institute/mt5-large-finetuned-mnli-xtreme-xnli and are newly initialized: ['decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MT5ForConditionalGeneration(
  (shared): Embedding(250112, 1024)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 1024)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (k): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (v): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (o): Linear4bit(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear4bit(in_features=1024, out_

In [12]:
# Read the raw sentence/property pair lines.
# A context manager guarantees the handle is closed, and the explicit UTF-8
# encoding matches how the results file is written later in the notebook.
with open('/home2/aditya_hari/gsoc/rdf-to-text/scraping/notebooks/sent_prop_pairs/sent_prop_pairs_0.txt', encoding='utf-8') as pairs_file:
  pairs = pairs_file.readlines()

In [13]:
# Split every raw line on tabs; the first field is used as the premise and
# the second as the hypothesis, each stripped of surrounding whitespace.
pairs_ = [line.split('\t') for line in pairs]
premises = []
hypos = []
for fields in pairs_:
  premises.append(fields[0].strip())
  hypos.append(fields[1].strip())

In [14]:
# Label tokens for the three NLI classes.
ENTAILS_LABEL, NEUTRAL_LABEL, CONTRADICTS_LABEL = "▁0", "▁1", "▁2"

# Vocabulary ids of the label tokens, in the order entails, neutral, contradicts.
label_tokens = [ENTAILS_LABEL, NEUTRAL_LABEL, CONTRADICTS_LABEL]
label_inds = tokenizer.convert_tokens_to_ids(label_tokens)

In [15]:
def process_nli(premise: str, hypothesis: str):
    """Build the model input string for one premise/hypothesis pair.

    The result is the 'xnli: ' task prefix followed by the premise and the
    hypothesis, each introduced by its own field marker.
    """
    return f'xnli: premise: {premise} hypothesis: {hypothesis}'

In [16]:
# Re-bind `pairs` to (premise, hypothesis) tuples, then format each tuple
# into the input string expected by the model.
pairs = [(premise, hypothesis) for premise, hypothesis in zip(premises, hypos)]
seqs = [process_nli(premise=pair[0], hypothesis=pair[1]) for pair in pairs]

In [17]:
# Chunk the input strings into consecutive batches of at most 32 items.
batch_starts = range(0, len(seqs), 32)
batched_seqs = [seqs[start:start + 32] for start in batch_starts]

In [18]:
# Positions of the entailment and contradiction entries within `label_inds`.
entailment_ind, contradiction_ind = 0, 2

In [11]:
# Classify every batch as entailment (0) or contradiction (1).
# For each input, the scores of the first generated token are restricted to
# the label tokens, the neutral label is dropped, and the larger of the
# entailment / contradiction scores decides the prediction.
all_outputs = []
# Let tqdm drive the iteration so the bar advances (and closes) by itself.
pb = tqdm.tqdm(batched_seqs)
for batch in pb:
  inputs = tokenizer.batch_encode_plus(batch, return_tensors="pt", padding=True).to('cuda')
  # Only the first decoding step is read below, so generating a single token
  # is enough and avoids decoding tokens that are never used.
  out = model.generate(**inputs, output_scores=True, return_dict_in_generate=True, num_beams=1, max_new_tokens=1)
  scores = out.scores[0]
  scores = scores[:, label_inds]
  entail_vs_contra_scores = scores[:, [entailment_ind, contradiction_ind]]
  entail_vs_contra_probas = softmax(entail_vs_contra_scores, dim=1)
  # argmax over [entailment, contradiction]: 0 means entailment scored higher.
  batch_scores = torch.argmax(entail_vs_contra_probas, dim=1).cpu().numpy().tolist()
  all_outputs.extend(batch_scores)

  0%|          | 94/37908 [00:36<3:45:42,  2.79it/s]

KeyboardInterrupt: 

In [21]:
# Collect an (index, pair) tuple for every input predicted as entailment
# (prediction value 0). list.append accepts exactly one argument, so the index
# and the pair are wrapped in a single tuple.
entailments = []
for i, val in enumerate(all_outputs):
  if(val == 0):
    entailments.append((i, pairs[i]))

672

In [None]:
# Write one tab-separated line per collected entailment.
# An entry may nest a (premise, hypothesis) pair inside it, e.g.
# (index, (premise, hypothesis)); nested tuples/lists are flattened so that
# every value becomes its own TSV column instead of a Python tuple repr.
# Flat two-field entries are written exactly as before.
with open('entailments/entailments_0.tsv', 'w', encoding='utf-8') as f:
  for entailment in entailments:
    fields = []
    for item in entailment:
      if isinstance(item, (tuple, list)):
        fields.extend(item)
      else:
        fields.append(item)
    f.write('\t'.join(str(field) for field in fields) + '\n')