In [1]:

import os
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from tqdm import tqdm
from datasets import load_dataset
import torch

import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, SwitchTransformersForConditionalGeneration

from langchain.prompts import PromptTemplate

from IPython.display import Markdown, display

### Load the dataset

In [6]:
from datasets import load_dataset
from tqdm import tqdm

dataset = load_dataset("cais/mmlu",'astronomy')
dataset

Generating auxiliary_train split:   0%|          | 0/99842 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    auxiliary_train: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 99842
    })
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 152
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 16
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

In [7]:
template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: {prompt}\n
A) {a}\n
B) {b}\n
C) {c}\n
D) {d}\n

Answer:"""

prompt = PromptTemplate(template=template, input_variables=['prompt', 'a', 'b', 'c', 'd'])
prompt

PromptTemplate(input_variables=['a', 'b', 'c', 'd', 'prompt'], template='Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]\n\nQuestion: {prompt}\n\nA) {a}\n\nB) {b}\n\nC) {c}\n\nD) {d}\n\n\nAnswer:')

In [8]:
dataset['auxiliary_train'][0]

{'question': "Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were",
 'subject': '',
 'choices': ['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks'],
 'answer': 1}

In [9]:
sample = dataset['auxiliary_train'][0]
display(Markdown(prompt.format(prompt=sample['question'], 
                               a=sample['choices'][0], 
                               b=sample['choices'][1], 
                               c=sample['choices'][2], 
                               d=sample['choices'][3], )))

Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Davis decided to kill Adams. He set out for Adams's house. Before he got there he saw Brooks, who resembled Adams. Thinking that Brooks was Adams, Davis shot at Brooks. The shot missed Brooks but wounded Case, who was some distance away. Davis had not seen Case. In a prosecution under a statute that proscribes any attempt to commit murder, the district attorney should indicate that the intended victim(s) was/were

A) Adams only.

B) Brooks only.

C) Case only.

D) Adams and Brooks


Answer:

In [10]:
def format_text(sample):
    text = prompt.format(prompt=sample['question'], 
                               a=sample['choices'][0], 
                               b=sample['choices'][1], 
                               c=sample['choices'][2], 
                               d=sample['choices'][3], )
    return {"text": text}

In [11]:
dataset = dataset.map(format_text)
dataset

Map:   0%|          | 0/99842 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

DatasetDict({
    auxiliary_train: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 99842
    })
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 152
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 16
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer', 'text'],
        num_rows: 5
    })
})

### Load Model

In [13]:

tokenizer = AutoTokenizer.from_pretrained("google/switch-base-32")
model = SwitchTransformersForConditionalGeneration.from_pretrained("google/switch-base-32")
model

SwitchTransformersForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): SwitchTransformersStack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): SwitchTransformersBlock(
        (layer): ModuleList(
          (0): SwitchTransformersLayerSelfAttention(
            (SelfAttention): SwitchTransformersAttention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): SwitchTransformersLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SwitchTransformersLayerFF(
            (mlp): SwitchTransformersDenseActDense(
              (wi): Linear(in_features=768, out_features=3072

### Evaluation

In [15]:
input_ids = tokenizer(
    ("summarize: studies have shown that owning a dog is good for you", "translate English to German: Who is tushar krishna?"), return_tensors="pt", padding=True ).input_ids  # Batch size 1

model.generate(input_ids, return_dict_in_generate=True, encoder_router_logits=False, decoder_router_logits=True, output_logits=True)



GenerateEncoderDecoderOutput(sequences=tensor([[    0, 32099,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [    0, 32099,  2645,    19,     3,    17,   302,  3272,     3, 10648,
             7,   107,    29,     9,    58, 32098, 32097,    58,     3, 32096]]), scores=None, logits=(tensor([[ -56.5279,  -53.8467,  -59.1355,  ..., -111.1666, -108.6548,
         -111.6747],
        [ -57.9992,  -54.5511,  -59.7791,  ..., -112.5967, -110.0983,
         -113.1005]]), tensor([[-31.2476,  -6.7846, -12.8065,  ..., -44.1658, -43.9011, -44.0584],
        [-24.0210,  -9.3919,  -9.8018,  ..., -38.5579, -38.3455, -38.6004]]), tensor([[-24.9932,  -2.2900, -10.1144,  ..., -41.0515, -41.1595, -41.2115],
        [-28.9260, -22.5011, -10.8445,  ..., -46.1690, -46.1313, -46.2785]]), tensor([[ -54.9878,  -52.0071,  -58.3813,  ..., -109.4966, -106.9689,
         -110.0171],
        [ -21.9197,  -18.4452,  -12.0

In [16]:
def get_ans(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True)
    # print(text)
    logits = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], 
    return_dict_in_generate=True, encoder_router_logits=False, decoder_router_logits=True, output_logits=True).logits
                   
    # print(logits)
    # Create a list of tuples having (logit, 'option') format
    options_list = [(logits[tokenizer(' A').input_ids[-1]], 'A'), (logits[tokenizer(' B').input_ids[-1]], 'B'), (logits[tokenizer(' C').input_ids[-1]], 'C'), (logits[tokenizer(' D').input_ids[-1]], 'D')] 
    options_list = sorted(options_list, reverse=True)
    print(options_list)
    ans_list = []
    for i in range(3):
        ans_list.append(options_list[i][1])
        
    return ans_list

In [17]:
bar = tqdm(enumerate(dataset['dev']), total=len(dataset['dev']))
for i, data in bar:
    # print(i, data)
    print(data['text'])
    # print(format_text(data))
    ans_list = get_ans(data['text'])
    print('Expected answer:', data['answer'], ' Got ans_list:', ans_list)

  0%|          | 0/5 [00:00<?, ?it/s]

Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: You are pushing a truck along a road. Would it be easier to accelerate this truck on Mars? Why? (Assume there is no friction)

A) It would be harder since the truck is heavier on Mars.

B) It would be easier since the truck is lighter on Mars.

C) It would be harder since the truck is lighter on Mars.

D) It would be the same no matter where you are.


Answer:


 20%|██        | 1/5 [00:01<00:06,  1.62s/it]

[(tensor([[-24.3918, -17.9444, -13.2590,  ..., -43.0448, -43.0073, -43.3355]]), 'D'), (tensor([[-24.3918, -17.9444, -13.2590,  ..., -43.0448, -43.0073, -43.3355]]), 'C'), (tensor([[-24.3918, -17.9444, -13.2590,  ..., -43.0448, -43.0073, -43.3355]]), 'B'), (tensor([[-24.3918, -17.9444, -13.2590,  ..., -43.0448, -43.0073, -43.3355]]), 'A')]
Expected answer: 3  Got ans_list: ['D', 'C', 'B']
Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Where do most short-period comets come from and how do we know?

A) The Kuiper belt; short period comets tend to be in the plane of the solar system just like the Kuiper belt.

B) The Kuiper belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the Kuiper belt.

C) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just 

 40%|████      | 2/5 [00:02<00:04,  1.35s/it]

[(tensor([[-23.1798, -12.2505, -11.1941,  ..., -37.8646, -38.0053, -38.0962]]), 'D'), (tensor([[-23.1798, -12.2505, -11.1941,  ..., -37.8646, -38.0053, -38.0962]]), 'C'), (tensor([[-23.1798, -12.2505, -11.1941,  ..., -37.8646, -38.0053, -38.0962]]), 'B'), (tensor([[-23.1798, -12.2505, -11.1941,  ..., -37.8646, -38.0053, -38.0962]]), 'A')]
Expected answer: 0  Got ans_list: ['D', 'C', 'B']
Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye?

A) 10000 times more

B) 100 times more

C) 1000 times more

D) 10 times more


Answer:


 60%|██████    | 3/5 [00:03<00:02,  1.16s/it]

[(tensor([[-24.5894, -13.0967, -11.5781,  ..., -40.4615, -40.4859, -40.5098]]), 'D'), (tensor([[-24.5894, -13.0967, -11.5781,  ..., -40.4615, -40.4859, -40.5098]]), 'C'), (tensor([[-24.5894, -13.0967, -11.5781,  ..., -40.4615, -40.4859, -40.5098]]), 'B'), (tensor([[-24.5894, -13.0967, -11.5781,  ..., -40.4615, -40.4859, -40.5098]]), 'A')]
Expected answer: 0  Got ans_list: ['D', 'C', 'B']
Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Why isn't there a planet where the asteroid belt is located?

A) A planet once formed here but it was broken apart by a catastrophic collision.

B) There was not enough material in this part of the solar nebula to form a planet.

C) There was too much rocky material to form a terrestrial planet but not enough gaseous material to form a jovian planet.

D) Resonance with Jupiter prevented material from collecting together to form a planet.


Answer:


 80%|████████  | 4/5 [00:04<00:01,  1.07s/it]

[(tensor([[-22.1916, -13.3220, -11.0034,  ..., -37.8818, -37.9929, -38.1149]]), 'D'), (tensor([[-22.1916, -13.3220, -11.0034,  ..., -37.8818, -37.9929, -38.1149]]), 'C'), (tensor([[-22.1916, -13.3220, -11.0034,  ..., -37.8818, -37.9929, -38.1149]]), 'B'), (tensor([[-22.1916, -13.3220, -11.0034,  ..., -37.8818, -37.9929, -38.1149]]), 'A')]
Expected answer: 3  Got ans_list: ['D', 'C', 'B']
Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D]

Question: Why is Mars red?

A) Because the surface is covered with heavily oxidized ("rusted") minerals.

B) Because the atmosphere scatters more light at bluer wavelengths transmitting mostly red light.

C) Because Mars is covered with ancient lava flows which are red in color.

D) Because flowing water on Mars's surface altered the surface minerals several billion years ago.


Answer:


100%|██████████| 5/5 [00:05<00:00,  1.11s/it]

[(tensor([[-22.8309, -12.3914, -10.7021,  ..., -38.8976, -39.1221, -39.2612]]), 'D'), (tensor([[-22.8309, -12.3914, -10.7021,  ..., -38.8976, -39.1221, -39.2612]]), 'C'), (tensor([[-22.8309, -12.3914, -10.7021,  ..., -38.8976, -39.1221, -39.2612]]), 'B'), (tensor([[-22.8309, -12.3914, -10.7021,  ..., -38.8976, -39.1221, -39.2612]]), 'A')]
Expected answer: 0  Got ans_list: ['D', 'C', 'B']





### Get Precision

In [21]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)