In [2]:
import os
import re
from sklearn.utils import shuffle
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW

from transformers import get_scheduler
from transformers import T5Tokenizer, T5ForSequenceClassification
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
# Instruction text shared verbatim by the "obqa", "arc_easy" and "arc_challenge" tasks.
_MULTIPLE_CHOICE_INST = "Given four answer candidates, A, B, C and D, choose the best answer choice."

# Task name -> instruction that format_prompt() places in front of the question.
TASK_INST = dict(
    wow="Given a chat history separated by new lines, generates an informative, knowledgeable and engaging response. ",
    pubqa="Is the following statement correct or not? Say true if it's correct; otherwise say false.",
    eli5="Provide a paragraph-length response using simple words to answer the following question.",
    obqa=_MULTIPLE_CHOICE_INST,
    arc_easy=_MULTIPLE_CHOICE_INST,
    arc_challenge=_MULTIPLE_CHOICE_INST,
    trex="Given the input format 'Subject Entity [SEP] Relationship Type,' predict the target entity.",
    asqa="Answer the following question. The question may be ambiguous and have multiple correct answers, and in that case, you have to provide a long-form answer including all correct answers.",
)

# Marker strings that postprocess_answer_option_conditioned() strips from answers.
# The list order is the order in which they are removed.
control_tokens = (
    ["[Fully supported]", "[Partially supported]", "[No support / Contradictory]"]
    + ["[No Retrieval]", "[Retrieval]"]
    + ["[Irrelevant]", "[Relevant]"]
    + ["<paragraph>", "</paragraph>"]
    + ["[Utility:%d]" % level for level in range(1, 6)]
)

In [4]:
# MODEL is the path handed to from_pretrained(); DATA_FILE is the text file read by
# data_preprocess() and get_evaluator_data().
# Both can be overridden with an environment variable so the notebook is not tied
# to one machine's directory layout; the defaults are the original paths.
MODEL = os.environ.get('CRAG_EVAL_MODEL', '/home/cdsw/models/crag-eval-t5')
DATA_FILE = os.environ.get('CRAG_DATA_FILE', '/home/cdsw/data/crag-poc-data.txt')

In [5]:
# Load the tokenizer and the T5 sequence-classification model from MODEL.
# Prefer the GPU but fall back to CPU, so the cell does not crash on a host
# without CUDA (the original called .cuda() unconditionally).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained(MODEL)
model = T5ForSequenceClassification.from_pretrained(MODEL).to(DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def format_prompt(i, task, question, paragraph=None):
    """Assemble the "### Instruction / ### Response" prompt for one example.

    Args:
        i: Zero-based line index into "../data/arc_challenge/choices"; only
            read when ``task == "arc_challenge"``.
        task: Task name. If TASK_INST has an instruction for it, that
            instruction is placed before the question under a "## Input:" header.
        question: The question text.
        paragraph: Optional passage. When not None it is appended after the
            response header as "[Retrieval]<paragraph>...</paragraph>".

    Returns:
        The formatted prompt string.
    """
    task_instruction = TASK_INST.get(task)
    if task_instruction is None:
        instruction = question
    else:
        instruction = task_instruction + "\n\n## Input:\n\n" + question

    if task == "arc_challenge":
        with open("../data/arc_challenge/choices", 'r') as choices_file:
            choices = choices_file.readlines()[i].strip()
        # Put every answer option ("A: " .. "E: ") on its own line.
        for letter in "ABCDE":
            choices = choices.replace(letter + ": ", "\n" + letter + ": ")
        instruction += choices

    prompt = "### Instruction:\n{0}\n\n### Response:\n".format(instruction)
    if paragraph is None:
        return prompt
    return prompt + "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)

In [7]:
def postprocess_answer_option_conditioned(answer):
    """Return ``answer`` with control tokens, end markers and newlines removed.

    Removal order: every entry of ``control_tokens`` (in list order), then
    "</s>", then newline characters, then "<|endoftext|>".
    """
    # str.replace is a no-op when the marker is absent, so no membership test is needed.
    for marker in (*control_tokens, "</s>", "\n", "<|endoftext|>"):
        answer = answer.replace(marker, "")
    return answer

In [8]:
def data_preprocess(file, with_label=False):
    """Group the passages of a "query [SEP] passage" file by query.

    Consecutive lines that share the same query are merged: their passages are
    joined with ' [sep] ' into a single string, so ``queries[k]`` and
    ``passages[k]`` always describe the same group.

    Args:
        file: Path to a UTF-8 text file with one "query [SEP] passage" record
            per line. A line ending in "[SEP]" is treated as having an empty
            passage. Blank lines are skipped.
        with_label: When True, each line is expected to end with a
            tab-separated label, which is discarded here. Defaults to False,
            matching the previous hard-coded behaviour.

    Returns:
        A ``(queries, passages)`` tuple of two equally long lists. Both are
        empty for an empty file.

    Raises:
        ValueError: If a line has no ' [SEP] ' separator, or (with
            ``with_label=True``) no tab-separated label.
    """
    queries = []
    passages = []
    current_passages = []
    with open(file, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            record = line.strip()
            if not record:
                continue
            if with_label:
                record, tab, _ = record.rpartition("\t")
                if not tab:
                    raise ValueError(
                        "line %d of %s has no tab-separated label" % (line_no, file))
            # strip() removes the blank after a trailing "[SEP]"; restore it so
            # the record still splits into (query, empty passage).
            if record.endswith('[SEP]'):
                record += ' '
            # Split on the first separator only, so a passage that itself
            # contains ' [SEP] ' is kept whole instead of breaking the unpacking.
            query, separator, passage = record.partition(' [SEP] ')
            if not separator:
                raise ValueError(
                    "line %d of %s has no ' [SEP] ' separator" % (line_no, file))
            if queries and query == queries[-1]:
                current_passages.append(passage)
                continue
            if queries:
                # A new query starts: close the previous group.
                passages.append(' [sep] '.join(current_passages))
            queries.append(query)
            current_passages = [passage]
    # Close the last group, but only if there was one; an empty file must not
    # yield a passage without a matching query.
    if queries:
        passages.append(' [sep] '.join(current_passages))
    return queries, passages

In [9]:
def get_evaluator_data(file, with_label=False):
    """Read the evaluator inputs (and optionally their labels) from ``file``.

    Args:
        file: Path to a UTF-8 text file with one record per line.
        with_label: When False (the default, matching the previous hard-coded
            behaviour) every line is returned stripped of surrounding
            whitespace and no labels are produced. When True each line must be
            "content<TAB>label" with an integer label.

    Returns:
        ``(content, label)``. ``label`` is None when ``with_label`` is False;
        otherwise it is a list of floats computed as ``(int(label) - 0.5) * 2``,
        which maps 0 to -1.0 and 1 to 1.0.

    Raises:
        ValueError: In labeled mode, if a line has no tab or a non-integer label.
    """
    with open(file, "r", encoding="utf-8") as f:
        if not with_label:
            # Blank lines are kept (as '') so the output stays aligned
            # line-for-line with the file.
            return [line.strip() for line in f], None

        content = []
        label = []
        for line_no, line in enumerate(f, start=1):
            text, tab, raw_label = line.rpartition("\t")
            if not tab:
                raise ValueError(
                    "line %d of %s has no tab-separated label" % (line_no, file))
            content.append(text)
            label.append((int(raw_label.strip()) - 0.5) * 2)
        return content, label

In [10]:
def inference(tokenizer, model, file, device=torch.device("cpu"), n_docs=10):
    """Score every line of ``file`` with the evaluator model.

    Args:
        tokenizer: Callable that turns a string into "input_ids" and
            "attention_mask" tensors.
        model: Model whose output exposes a "logits" entry; it is switched to
            eval mode here.
        file: Path forwarded to get_evaluator_data().
        device: Device the encoded inputs are moved to before the forward pass.
            NOTE(review): the model is not moved here, so it presumably has to
            live on this device already -- confirm at the call site.
        n_docs: Unused; kept so existing callers keep working.

    Returns:
        A list with one float per input line. Lines that end in "[SEP]" (no
        passage text) are not sent to the model and get the score -1.0.
    """
    # `tqdm` was referenced without being imported in the notebook, which
    # raises NameError on a fresh kernel. Import it here and degrade to plain
    # iteration if the package is unavailable.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    model.eval()
    content, _ = get_evaluator_data(file)

    scores = []
    for c in tqdm(content):
        if c.strip().endswith('[SEP]'):
            scores.append(-1.0)
            continue
        encoded = tokenizer(c, return_tensors="pt", padding="max_length", max_length=512)
        with torch.no_grad():
            outputs = model(encoded["input_ids"].to(device),
                            attention_mask=encoded["attention_mask"].to(device))
        # float() requires the logits tensor to hold exactly one element.
        scores.append(float(outputs["logits"].cpu()))
    return scores

In [11]:
def process_flag(scores, n_docs, threshold1, threshold2):
    """Reduce per-document scores to one flag per group of ``n_docs`` scores.

    Each score is graded 2 if ``score >= threshold1``, else 1 if
    ``score >= threshold2``, else 0. Scores are then consumed in consecutive
    groups of ``n_docs`` and each complete group contributes its highest
    grade. A trailing group with fewer than ``n_docs`` scores contributes
    nothing.

    Returns:
        A list of ints (0, 1 or 2), one per complete group.
    """
    def grade(score):
        if score >= threshold1:
            return 2
        if score >= threshold2:
            return 1
        return 0

    grades = [grade(score) for score in scores]

    identification_flag = []
    window = []
    for position, value in enumerate(grades):
        window.append(value)
        # The last slot of a group closes it.
        if position % n_docs == n_docs - 1:
            identification_flag.append(max(window))
            window = []
    return identification_flag

In [12]:
# Group the records of DATA_FILE by query, then preview the first query together
# with its ' [sep] '-joined passages.
queries, passages = data_preprocess(DATA_FILE)
queries[0], passages[0]

("What is Henry Feilden's occupation?",
 ' Henry Master Feilden (21 February 1818 – 5 September 1875) was an English Conservative Party politician. [sep] Bernard Feilden (1919–2008), British conservation architect ; Bob Feilden (1917–2004), British mechanical engineer ; Gerry Feilden (1904–1981), British general and horse racing identity ; Henry Feilden (disambiguation) ; Joseph Feilden (1824–1895), British politician ; Richard Feilden (1950–2005), British architect ; William Feilden (1772–1850), British politician [sep]  Henry Wemyss Feilden, second son of the second Baronet, was an Arctic explorer. [sep]  Feilden was born in Hampstead, London. He was educated at Bedford School and The Bartlett School of Architecture, University College, London, completing his training at the Architectural Association after the second world war. His love of architecture was inherited from his grandfather, Brightwen Binyon (1846-1905), an Ipswich architect and former pupil of Alfred Waterhouse. He join

In [13]:
# Load the stripped lines of DATA_FILE and display the first one as a sample of
# the "query [SEP] passage" text that the model scores.
content, label = get_evaluator_data(DATA_FILE)
text = content[0]
text

"What is Henry Feilden's occupation? [SEP]  Henry Master Feilden (21 February 1818 – 5 September 1875) was an English Conservative Party politician."

In [14]:
# Example pair: a question about llamas vs. alpacas and a passage describing alpacas.
# `query` must be a plain string: a trailing comma after the literal would make it
# a 1-tuple, and the '%s [SEP] %s' formatting would then embed "('...',)".
query = "Can you tell me the difference between llamas and alpacas?"
paragraph = "The alpaca (Lama pacos) is a species of South American camelid mammal. It is similar to, and often confused with, the llama. Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber."

In [15]:
# Example pair: a question about REPL support in CDP Base 7.1.7 and a passage
# stating the 7.1.8 requirement.
# `query` must be a plain string: a trailing comma after the literal would make it
# a 1-tuple, and the '%s [SEP] %s' formatting would then embed "('...',)".
query = 'Does Cloudera CDP Base 7.1.7 support the REPL command?'
paragraph = 'If you want to use REPL commands to replicate Hive ACID tables between CDP Private Cloud Base clusters, ensure that your source cluster is on CDP Private Cloud Base 7.1.8 or a higher version.'

In [19]:
# Example pair: the question asks about Russell Stokes, while the passage is about
# the actress Kathleen Cody (it only mentions characters named Stokes).
query = "What is Russell Stokes's occupation?"
paragraph = '''Kathleen Cody (actress)  Kathleen Cody (born October 30, 1954), often credited as Kathy Cody, is an American actress. She is best known for her role as the characters Hallie Stokes and Carrie Stokes, on the television series "Dark Shadows", appearing from June 1970 through April 1971. Her career in film and television lasted over 30 years.'''

In [17]:
# Example pair: a question comparing CDP Base 7.1.7 with 7.1.9 and a multi-line
# passage listing what is new in 7.1.9.
query = 'What is the difference between Cloudera CDP Base 7.1.7 and 7.1.9?'
paragraph = '''What's new in CDP Base 7.1.9 in comparison of 7.1.7
Understand the functionalities and improvements to features of components in Cloudera Runtime 7.1.9.
Open Data Lakehouse, powered by Apache Iceberg
CDP Private Cloud Base 7.1.9 delivers the hybrid Open Data Lakehouse providing the following benefits:
Open architecture
Cloudera’s Open Data Lakehouse, powered by Apache Iceberg is 100% open—open source, open standards based, and with wide community adoption. It can store multiple data formats and enables multiple engines to work on the same data.
Ease of adoption
By integrating Iceberg right into the Shared Data Experience (SDX) and Apache Ozone, Cloudera offers the easiest path to deploying a lakehouse. Additional capabilities like schema evolution, hidden partition, and more simplify data management for large datasets.'''

In [21]:
# Example pair: the question asks about "Jim Brown"; the passage describes the
# country singer Jim Ed Brown.
query = "What is Jim Brown's occupation?"
paragraph = '''Jim Ed Brown  James Edward Brown (April 1, 1934 – June 11, 2015) was an American country singer-songwriter who achieved fame in the 1950s with his two sisters as a member of the Browns. He later had a successful solo career from 1965 to 1974, followed by a string of major duet hits with fellow country music vocalist Helen Cornelius, through 1981. Brown was also the host of the "Country Music Greats Radio Show", a syndicated country music program from Nashville, Tennessee.'''

In [22]:
# Score the current `query` / `paragraph` pair and display the raw logits.
# The inputs follow the model's own device instead of a hard-coded 'cuda', so the
# cell also runs when the model has been loaded on CPU.
text = '%s [SEP] %s' % (query, paragraph)
test = tokenizer(text, return_tensors="pt", padding="max_length", max_length=512)
model_device = next(model.parameters()).device
with torch.no_grad():
    outputs = model(test["input_ids"].to(model_device),
                    attention_mask=test["attention_mask"].to(model_device))
outputs["logits"]

tensor([[-1.0112]], device='cuda:0')