# SWAG

Will start with the SWAG dataset to get a baseline before moving on to the HellaSwag dataset.

In [11]:
from unittest.mock import patch
from datasets import load_dataset

with patch("datasets.builder.is_remote_filesystem", return_value=False):
    dataset = load_dataset("swag")

In [92]:
from dataclasses import dataclass

NUM_ANSWERS = 4
CHOICES = ["A", "B", "C", "D"]


@dataclass
class Sample:
    """A sample multiple choice question."""

    question: str
    answers: list[str]
    label: int

    @property
    def choices(self) -> list[str]:
        """Return the multiple choice answers."""
        return [ f"{choice}) {option}" for choice, option in zip(CHOICES, self.answers) ]

    @property
    def label_text(self) -> str:
        """Returns the expected correct answer."""
        return self.answers[self.label]

    @property
    def expected(self) -> str:
        """Returns the expected correct answer."""
        return f"{CHOICES[self.label]}) {self.label_text}"


def make_sample(record: dict[str, str]) -> Sample:
    return Sample(record["startphrase"], [record[f"ending{i}"] for i in range(0, NUM_ANSWERS)], label=record["label"])

ds = dataset['train']
it = iter(ds)
make_sample(next(it)), make_sample(next(it)), make_sample(next(it))


(Sample(question='Members of the procession walk down the street holding small horn brass instruments. A drum line', answers=['passes by walking down the street playing their instruments.', 'has heard approaching them.', "arrives and they're outside dancing and asleep.", 'turns the lead singer watches the performance.'], label=0),
 Sample(question='A drum line passes by walking down the street playing their instruments. Members of the procession', answers=['are playing ping pong and celebrating one left each in quick.', 'wait slowly towards the cadets.', 'continues to play as well along the crowd along with the band being interviewed.', 'continue to play marching, interspersed.'], label=3),
 Sample(question='A group of members in green uniforms walks waving flags. Members of the procession', answers=['pay the other coaches to cheer as people this chatter dips in lawn sheets.', 'walk down the street holding small horn brass instruments.', 'is seen in the background.', 'are talking a cou

# Open AI Client

This will open a client library to talk to a local model server hosting the model we wish to eval.

In [10]:
from openai import OpenAI

BASE_URL = "http://llama-cpp-server.llama:8000/v1"
API_KEY = "sk-xxx"
MODEL_ID = "/data/models/mistral-7b-inst.gguf"

client = OpenAI(base_url=BASE_URL, api_key=API_KEY)
response = client.chat.completions.create(
    model=MODEL_ID,
    messages=[
        {
            "content": "You are a helpful assistant.",
            "role": "system"
        },
        {
            "content": "What is the capital of France?",
            "role": "user"
        }
    ],
)
for choice in response.choices:
    print(choice.message.content)

 The capital city of France is Paris.


In [76]:
INSTRUCTIONS = "Choose the most plausible continuation for the story."
HEADER = "Please answer with the letter of the correct answer."


def multiple_choice_prompt(sample: Sample) -> str:
    """Make a prompt from a sample."""
    return "\n".join([
        HEADER,
        "",
        sample.question,
        "",
        *sample.choices
    ])


def answer(client: OpenAI, sample: Sample) -> str:
    prompt = multiple_choice_prompt(sample)
    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=[
            {
                "content": INSTRUCTIONS,
                "role": "system"
            },
            {
                "content": prompt,
                "role": "user"
            }
        ],
    )
    choice = response.choices[0]
    return choice.message.content.strip()


sample = make_sample(next(it))
response = answer(client, sample)
sample, response

(Sample(question='A drum line passes by walking down the street playing their instruments. Members of the procession', answers=['are playing ping pong and celebrating one left each in quick.', 'wait slowly towards the cadets.', 'makes a square call and ends by jumping down into snowy streets where fans begin to take their positions.', 'play and go back and forth hitting the drums while the audience claps for them.'], label=3),
 'The most plausible continuation for the story would be D) Play and go back and forth hitting the drums while the audience claps for them.')

In [94]:
from tqdm import tqdm

correct = 0
total = 0
it = iter(ds)

In [95]:
pbar = tqdm(enumerate(it))
for i, record in pbar:
    sample = make_sample(next(it))
    response = answer(client, sample)
    total += 1
    # Sometimes the model may answer something like 'The best answer is A) <answer>'
    # so we'll give it credit.
    if response == sample.expected or sample.label_text in response:
        correct += 1
    else:
        print(response)
        print(sample.expected)

    accuracy = (correct / total) * 100
    pbar.set_description(f"{accuracy:2.2f}% - {correct} of {total}")


0.00% - 0 of 1: : 1it [00:16, 16.36s/it]

C) continues to play as well along the crowd along with the band being interviewed.
D) continue to play marching, interspersed.


60.00% - 3 of 5: : 5it [01:07, 14.29s/it]

Without additional context or information about the story, it is difficult to determine the most plausible continuation. Therefore, all of the options (A, B, C, and D) could be possible continuations depending on the context. Could you please provide more information or details about the story so that I can better assist you?
D) turns on a monitor.


50.00% - 3 of 6: : 6it [01:22, 14.37s/it]

B) rush for the unconscious mother.
D) turned to look at someone as he approaches.


50.00% - 4 of 8: : 8it [01:47, 13.58s/it]

The most plausible continuation for the story is D) rolls up his fast run from the water and tosses in the sky.
C) falls to the ground.


44.44% - 4 of 9: : 9it [02:00, 13.23s/it]

D) lights his walking stick as he aims his pistol at someone's wand.
B) leans back to her surroundings, his eyes rimmed with tears.


50.00% - 6 of 12: : 12it [02:30, 10.93s/it]

D) stops the little boy inside.
C) looks up at him.
