# Hands-on: Setting a Closed-Book Baseline

## Installation

In [None]:
!pip install datasets evaluate transformers accelerate bitsandbytes sentencepiece

## Imports

In [None]:
import json
import os
import random

import evaluate
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Authenticate with the Hugging Face Hub without writing the token into the
# notebook: read it from the HF_TOKEN environment variable, and when that is
# unset or empty pass None so that login() prompts for it interactively.
login(token=os.environ.get("HF_TOKEN") or None)

## Load Data

In [None]:
# Load the first 200 training examples of the HotpotQA "distractor" configuration.
ds = load_dataset("hotpot_qa", "distractor", split="train[:200]")

# Number of examples used for the closed-book evaluation. Kept in one place so
# questions and gold answers can never be sliced to different lengths.
N_EVAL = 25
questions = ds["question"][:N_EVAL]
gold_answers = ds["answer"][:N_EVAL]

In [None]:
# Display the loaded dataset object.
ds

In [None]:
# First question of the evaluation subset.
questions[0]

In [None]:
# Gold answer that belongs to the first question.
gold_answers[0]

In [None]:
# "supporting_facts" entry of the first example.
ds["supporting_facts"][0]

In [None]:
# "context" entry of the first example (not given to the model in the closed-book run below).
ds["context"][0]

## Load Model

In [None]:
# Checkpoint to evaluate. Swap in the commented-out TinyLlama line to try a
# much smaller model.
#model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" lets accelerate place the weights; float16 halves the
# memory needed compared with float32.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.float16)

# Not every tokenizer defines a pad token; in that case pad_token_id is None
# and copying it over would leave generation without a pad id. Fall back to
# the EOS id so generate() has an explicit padding token.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id
model.generation_config.pad_token_id = pad_token_id

# temperature only has an effect when sampling is enabled, so do_sample is set
# explicitly instead of relying on the checkpoint's default generation config.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer,
                     do_sample=True,
                     temperature=0.1,
                     max_new_tokens=128)

## Closed-book LLM Output

In [None]:
# Re-seed the torch RNG so that re-running this cell reproduces the same
# sampled answers.
torch.manual_seed(42)

predictions = []

for q in questions:
    # Closed-book prompt: the model sees only the question, no supporting context.
    prompt = ( "You are an expert question-answering system.\n"
               f"Question: {q}\n"
               "Answer briefly:\n" )
    # return_full_text=False makes the pipeline return only the continuation.
    # Splitting the full text on the "Answer briefly:" marker and taking the
    # last piece would discard the real answer whenever the model repeats the
    # marker in its own output.
    ans = generator(prompt, return_full_text=False)[0]["generated_text"]
    print(f"{q} -> {ans}\n")
    predictions.append(ans.strip())


In [None]:
# Print every question next to the generated answer and the gold answer,
# followed by a separator line.
separator = "-"*25
for idx, (question, generated) in enumerate(zip(questions, predictions)):
    report_lines = [
        f"{idx}. Question: {question}",
        f"Generated Answer: {generated}",
        f"Actual Answer: {gold_answers[idx]}",
        separator,
    ]
    print("\n".join(report_lines))

In [None]:
# Pair each prediction with its gold answer; zip stops at the shorter list.
scored_pairs = list(zip(predictions, gold_answers))

# Build the prediction / reference records passed to the "squad" metric.
# Ids are the stringified position; every reference carries a single answer
# text with answer_start fixed to 0.
predictions_formatted = [
    {"id": str(idx), "prediction_text": predicted}
    for idx, (predicted, _) in enumerate(scored_pairs)
]
references_formatted = [
    {"id": str(idx), "answers": {"text": [gold], "answer_start": [0]}}
    for idx, (_, gold) in enumerate(scored_pairs)
]

# Compute the metric and pretty-print the resulting scores as JSON.
squad = evaluate.load("squad")
results = squad.compute(
    predictions=predictions_formatted,
    references=references_formatted,
)
print(json.dumps(results, indent=2))

**Result Interpretation**

    {
      "exact_match": 4.0,
      "f1": 9.655108219663825
    }

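- **EM ≈ 4%** means the model's answer exactly matched the gold answer for only 1 of the 25 questions.

- **F1 ≈ 9.7%** means that token-level overlap between the generated and gold answers is low as well.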

- There is plenty of room for improvement, which is the motivation for adding retrieval.

### Summary
- Closed-book LLMs are powerful pattern recognisers but brittle knowledge bases.
- Retrieval-Augmented Generation separates knowledge storage (the index) from reasoning (the generator).
- Even this tiny empirical test shows how much headroom a closed-book model leaves for retrieval to close.