In [1]:

import os
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_dataset
import torch

import transformers
from transformers import AutoTokenizer, SwitchTransformersForConditionalGeneration

from langchain.prompts import PromptTemplate

from IPython.display import Markdown, display

  from .autonotebook import tqdm as notebook_tqdm


### Load the dataset

In [5]:
from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset("cais/mmlu", 'anatomy')
dataset

DatasetDict({
    auxiliary_train: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 99842
    })
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 135
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 14
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

In [14]:
template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: {prompt}\n
A) {a}\n
B) {b}\n
C) {c}\n
D) {d}\n

Answer:"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'a', 'b', 'c', 'd'])
prompt

PromptTemplate(input_variables=['a', 'b', 'c', 'd', 'prompt'], template='Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]\n\nQuestion: {prompt}\n\nA) {a}\n\nB) {b}\n\nC) {c}\n\nD) {d}\n\n\nAnswer:')

In [15]:
dataset['auxiliary_train'][0]

{'question': "Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were",
 'subject': '',
 'choices': ['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks'],
 'answer': 1}

In [16]:
sample = dataset['auxiliary_train'][0]
display(Markdown(prompt.format(prompt=sample['question'], 
                               a=sample['choices'][0], 
                               b=sample['choices'][1], 
                               c=sample['choices'][2], 
                               d=sample['choices'][3], )))

Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were

A) Adams only.

B) Brooks only.

C) Case only.

D) Adams and Brooks


Answer:

In [17]:
def format_text(example):
    text = prompt.format(prompt=sample['question'], 
                               a=sample['choices'][0], 
                               b=sample['choices'][1], 
                               c=sample['choices'][2], 
                               d=sample['choices'][3], )
    return {"text": text}

In [18]:
dataset = dataset.map(format_text)
dataset

Map: 100%|██████████| 99842/99842 [00:05<00:00, 17815.46 examples/s]
Map: 100%|██████████| 135/135 [00:00<00:00, 7569.53 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1792.05 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 672.42 examples/s]


DatasetDict({
    auxiliary_train: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 99842
    })
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 135
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 14
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 5
    })
})

### Load Model

In [36]:

tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
model = SwitchTransformersForConditionalGeneration.from_pretrained("google/switch-base-8").cuda()
model

SwitchTransformersForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): SwitchTransformersStack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): SwitchTransformersBlock(
        (layer): ModuleList(
          (0): SwitchTransformersLayerSelfAttention(
            (SelfAttention): SwitchTransformersAttention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): SwitchTransformersLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SwitchTransformersLayerFF(
            (mlp): SwitchTransformersDenseActDense(
              (wi): Linear(in_features=768, out_features=3072

### Evaluation

In [63]:
input_ids = tokenizer(
    ("summarize: studies have shown that owning a dog is good for you", "translate English to German: Who is tushar krishna?"), return_tensors="pt", padding=True ).input_ids  # Batch size 1

model.generate(input_ids.cuda(), return_dict_in_generate=True, encoder_router_logits=False, decoder_router_logits=True, output_logits=True)

GenerateEncoderDecoderOutput(sequences=tensor([[    0, 32099,     5,   304, 32098,     6,   752,    22,     7,   497,
            25,    43,     3,     9,  1782,     5,   304, 21603,    10, 32097],
        [    0, 32099,     5,     3,     2,     3,     2,     3,     2,     3,
             2,     3,     2,     3,     2,     3,     2,     3,     2,     3]],
       device='cuda:0'), scores=None, logits=(tensor([[-39.1651,   1.8586, -22.7070,  ..., -41.4877, -42.4018, -42.8918],
        [-36.9136,   2.7749, -22.2301,  ..., -40.1252, -41.0470, -41.5256]],
       device='cuda:0'), tensor([[-55.7076, -11.9879,  -9.5616,  ..., -40.0240, -40.1328, -40.0045],
        [-50.5417,  -5.5957,  -8.0926,  ..., -35.4770, -35.4533, -35.2755]],
       device='cuda:0'), tensor([[-55.0019, -15.7814,  -8.9572,  ..., -39.2135, -39.4048, -39.3479],
        [-51.0003,  -5.8418,  -9.1384,  ..., -35.8548, -35.8421, -35.8112]],
       device='cuda:0'), tensor([[-92.7492, -27.5711, -13.9345,  ..., -52.3770, -52.605

In [72]:
def get_ans(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True)
    # print(text)
    logits = model.generate(input_ids=inputs['input_ids'].to('cuda'), attention_mask=inputs['attention_mask'].cuda(), 
    return_dict_in_generate=True, encoder_router_logits=False, decoder_router_logits=True, output_logits=True).logits
                   
    # print(logits)
    # Create a list of tuples having (logit, 'option') format
    options_list = [(logits[tokenizer(' A').input_ids[-1]], 'A'), (logits[tokenizer(' B').input_ids[-1]], 'B'), (logits[tokenizer(' C').input_ids[-1]], 'C'), (logits[tokenizer(' D').input_ids[-1]], 'D')] 
    options_list = sorted(options_list, reverse=True)
    print(options_list)
    ans_list = []
    for i in range(3):
        ans_list.append(options_list[i][1])
        
    return ans_list

In [73]:
bar = tqdm(enumerate(dataset['dev']), total=len(dataset['dev']))
for i, data in bar:
    # print(i, data)
    # print(data['text'])
    # print(format_text(data))
    ans_list = get_ans(data['text'])
    print('Expected answer:', data['answer'], ' Got ans_list:', ans_list)

 20%|██        | 1/5 [00:04<00:18,  4.67s/it]

[(tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'D'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'C'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'B'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'A')]
Expected answer: 3  Got ans_list: ['D', 'C', 'B']


 40%|████      | 2/5 [00:06<00:09,  3.18s/it]

[(tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'D'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'C'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'B'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'A')]
Expected answer: 3  Got ans_list: ['D', 'C', 'B']


 60%|██████    | 3/5 [00:07<00:03,  1.95s/it]

[(tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'D'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'C'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'B'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'A')]
Expected answer: 2  Got ans_list: ['D', 'C', 'B']


 80%|████████  | 4/5 [00:11<00:03,  3.01s/it]

[(tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'D'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'C'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'B'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'A')]
Expected answer: 2  Got ans_list: ['D', 'C', 'B']


100%|██████████| 5/5 [00:16<00:00,  3.31s/it]

[(tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'D'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'C'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'B'), (tensor([[-55.6612, -19.7385,  -9.2792,  ..., -38.5495, -38.6548, -38.4849]],
       device='cuda:0'), 'A')]
Expected answer: 1  Got ans_list: ['D', 'C', 'B']





### Get Precision

In [21]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)