In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5ForConditionalGeneration, T5Tokenizer, GPT2Tokenizer, GPT2LMHeadModel
import torch
import random
from belief.evaluation import load_facts
from belief.lmbb import get_scores
import json

In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [70]:
    
# Single source of truth for the NLI checkpoint, so the tokenizer and the
# model are always loaded from the same identifier.
nli_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device)

In [176]:
# Score a (premise, hypothesis) pair with the NLI classifier and print the
# class probabilities as percentages.
premise = ""
hypothesis = "Albatross is a vertebrate."

input_ids = nli_tokenizer.encode(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    output = nli_model(input_ids)

# Probabilities over the classes for the single encoded pair (row 0 of the logits).
probs = torch.softmax(output.logits[0], -1).tolist()
# NOTE(review): this order is assumed to match the model's logit order
# (entailment, neutral, contradiction) -- confirm against nli_model.config.id2label.
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(prob) * 100, 1) for prob, name in zip(probs, label_names)}

print(prediction)

{'entailment': 2.9, 'neutral': 18.7, 'contradiction': 78.4}


In [3]:
def qa_output(question, qa_model, qa_tok):
    """Generate an answer to ``question`` with a QA model and decode it.

    Args:
        question: Prompt string handed to ``qa_tok.encode``.
        qa_model: Model object exposing ``generate``. If it has a ``device``
            attribute, the encoded input is moved there; otherwise the
            notebook-level ``device`` is used.
        qa_tok: Tokenizer exposing ``encode`` and ``batch_decode``.

    Returns:
        Whatever ``qa_tok.batch_decode`` returns for the generated ids, with
        special tokens skipped.
    """
    # Prefer the model's own device so the function does not silently depend
    # on which notebook cell last defined the global `device`.
    target_device = getattr(qa_model, "device", None)
    if target_device is None:
        target_device = device
    input_ids = qa_tok.encode(question, return_tensors="pt").to(target_device)
    generated = qa_model.generate(input_ids)
    return qa_tok.batch_decode(generated, skip_special_tokens=True)

In [4]:
# Load the QA tokenizer and model from one checkpoint id, then move the
# model onto the selected device.
qa_model_name = "allenai/unifiedqa-v2-t5-base-1251000"
qa_tok = T5Tokenizer.from_pretrained(qa_model_name)
qa_model = T5ForConditionalGeneration.from_pretrained(qa_model_name)
qa_model = qa_model.to(device)

In [10]:
# Display the model's repr (its nested module tree) to inspect the architecture.
qa_model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [5]:
# Request a single batch from load_facts and keep the first element of its result.
fact_batches = load_facts("./data/silver_facts.json", num_batches=1)
facts = fact_batches[0]

In [67]:
# Distinct subjects across all loaded facts. A comprehension keeps its loop
# variable local, so re-running this cell no longer overwrites the
# notebook-level `fact` that another cell assigns.
fact_subjects = {f.subject for f in facts}

In [75]:
# Pick one fact uniformly at random and show it with its (subject, relation, object) triple.
# random.randint includes both endpoints, so randint(0, len(facts)) could
# return len(facts) and raise IndexError; randrange excludes the stop value.
idx = random.randrange(len(facts))
fact = facts[idx]
print(fact)
print((fact.subject, fact.predicate.relation, fact.predicate.object))

(peacock,IsA,water, False, -99999.0)
('peacock', 'IsA', 'water')


In [92]:
# Ask the QA model a yes/no question with supporting context and answer options.
# The `\\n` sequences are literal backslash + "n" characters (not newlines)
# separating question, context and options -- presumably the delimiter
# format this QA checkpoint expects; TODO confirm.
# NOTE(review): the recorded output is ['yes'] even though the supplied
# context states that birds are not liquids.
qa_output("Is a peacock water? \\n A peacock is a bird. Water is a liquid. Birds are not liquids. \\n (a) yes (b) no", qa_model, qa_tok)

['yes']

In [4]:
# Load the GPT-2 tokenizer and language model, then move the model onto the
# selected device.
gpt_tok = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt_model = gpt_model.to(device)

In [11]:
# Beam-search continuation of `question` with GPT-2, decoded back to text.
# NOTE(review): the recorded output for this cell starts with
# "An essay on cats:", so the prompt was presumably non-empty when the cell
# last ran; an empty prompt gives the model no tokens to condition on.
question = ""
input_ids = gpt_tok.encode(question, return_tensors="pt").to(device)
output = gpt_model.generate(
    input_ids,
    max_length=20,
    num_beams=5,
    early_stopping=True,
    # Make explicit what generate() otherwise does implicitly, with a warning:
    # use the EOS token id as the padding id.
    pad_token_id=gpt_model.config.eos_token_id,
)
gpt_tok.batch_decode(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['An essay on cats:\n\nhttp://www.nature.com/nature/journal/v']

In [148]:
# Show the raw generated token ids.
# NOTE(review): the recorded tensor decodes to text beginning "What is an ape?",
# so it comes from a different prompt than the current `question`.
output

tensor([[ 2061,   318,   281, 43835,    30,   198,   198,  2025, 43835,   318,
           257,  7185,   326,   318,   407,  1692,    13,   632,   318,   257]],
       device='cuda:0')

In [149]:
# Decode the generated token ids back into text (special tokens are kept).
gpt_tok.batch_decode(output)

['What is an ape?\n\nAn ape is a creature that is not human. It is a']

In [34]:
# Read the JSONL file as a list of raw lines (one JSON document per line).
# The encoding is set explicitly so decoding does not depend on the
# platform's locale default.
with open('./data/cqa_train_rand_split.jsonl', 'r', encoding='utf-8') as json_file:
    json_list = list(json_file)

In [41]:
# Parse every raw JSONL line into a Python object, preserving file order.
ql = list(map(json.loads, json_list))

In [46]:
# Inspect one parsed record (index 4) to see the structure of an entry.
ql[4]

{'answerKey': 'C',
 'id': '23505889b94e880c3e89cff4ba119860',
 'question': {'question_concept': 'fox',
  'choices': [{'label': 'A', 'text': 'pretty flowers.'},
   {'label': 'B', 'text': 'hen house'},
   {'label': 'C', 'text': 'natural habitat'},
   {'label': 'D', 'text': 'storybook'},
   {'label': 'E', 'text': 'dense forest'}],
  'stem': 'The fox walked from the city into the forest, what was it looking for?'}}

In [48]:
# Distinct question concepts across all parsed records.
concept_set = {entry['question']['question_concept'] for entry in ql}

In [70]:
# Number of fact subjects that never appear as a question concept.
len(fact_subjects.difference(concept_set))

74

In [71]:
# Number of distinct question concepts.
len(concept_set)

2151