In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install hf_xet
!pip install trl
!pip install --upgrade transformers
!pip install tensorboard
!pip install -U bitsandbytes
!pip install transformers_stream_generator
!pip install rouge_score
!pip install sacrebleu



### Import Packages

In [3]:
import os
from datasets import load_dataset, concatenate_datasets, Dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU

### Load Datasets

In [4]:
# load tuning sets for <Sunrise on the Reaping>
sunrise_train_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_train_data.json'
sunrise_val_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_val_data.json'
sunrise_test_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/sunrise_test_data.json'

sunrise_train_datasets = Dataset.from_json(sunrise_train_path)
sunrise_val_datasets = Dataset.from_json(sunrise_val_path)
sunrise_test_datasets = Dataset.from_json(sunrise_test_path)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
# load tuning sets for <All Fours>
allfours_train_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_train_data.json'
allfours_val_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_val_data.json'
allfours_test_path = r'/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning2/reasoned_qa_output/allfours_test_data.json'

allfours_train_datasets = Dataset.from_json(allfours_train_path)
allfours_val_datasets = Dataset.from_json(allfours_val_path)
allfours_test_datasets = Dataset.from_json(allfours_test_path)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Combine two books together
train_set = concatenate_datasets([sunrise_train_datasets, allfours_train_datasets])
val_set = concatenate_datasets([sunrise_val_datasets, allfours_val_datasets])
test_set = concatenate_datasets([sunrise_test_datasets, allfours_test_datasets])

In [None]:
train_set

Dataset({
    features: ['Question', 'Answer', 'Reasoning'],
    num_rows: 958
})

### Model Tuning: Qwen3-8B

Load a Reasoning LLM: Qwen 3 - 8B and its specified tokeniser.

In [None]:
# Load Reasoning LLM: Qwen3-8B
qwen_model = 'Qwen/Qwen3-8B'
# Load the specified tokenizer for the model to ensure consistency
tokenizer = AutoTokenizer.from_pretrained(qwen_model)

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Quantisation Config: This is an important for QLoRA, and the key difference from LoRA.

In [None]:
# defines the quantisation settings to reduce the memory footpoint and computational cost of large models and often with minimal impact on performance
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # a key parameter which instructs the system to load the model's weights using 4-bit precision thus reducing the memory usage
    bnb_4bit_quant_type="nf4", # specifies the exact type of 4-bit quantisation to use, which is NF4. This is a specific floating-point format designed for neural networks.
    bnb_4bit_compute_dtype=torch.bfloat16, # specifies the computations involving these weights shouold be using bfloat16
    bnb_4bit_use_double_quant=False, # do not use double quantisation which quantise the quantisation constants
)

In [None]:
# load the quantised model
model = AutoModelForCausalLM.from_pretrained(
    qwen_model,
    quantization_config=bnb_config,
    device_map="auto"  # instructs the transformers library to automatically distribute the model's layer cross available decives to optimise memory usage
)
model.config.use_cache = False

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Ensure the quantised model can be properly fine tuned.

In [None]:
# this step handles the underlying complexities to make sure that even though the model's parameters have been compressed to a lower precision, gradients can be still computed and propagated to the LoRA adapters, thus enabling effective fine-tuning.
# without this step, attempting to directly fine-tune a quantised model with LoRA might lead to compatibility issues or an inability to train correctly
model = prepare_model_for_kbit_training(model)

LoRA Configuration

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
model.print_trainable_parameters()

trainable params: 43,646,976 || all params: 8,234,382,336 || trainable%: 0.5301


In [None]:
def formatting_prompts_func(examples):
    texts = []
    for question, answer, reasoning_list in zip(examples["Question"], examples["Answer"], examples["Reasoning"]):
        reasoning_str = "\n".join([f"Step {i+1}: {step}" for i, step in enumerate(reasoning_list)])
        formatted_text = (
            f"### Question:\n{question}\n\n"
            f"### Thought Process:\n"
            f"{reasoning_str}\n\n"
            f"### Answer:\n{answer}"
        )
        texts.append(formatted_text)
    return {"text": texts}

In [None]:
train_dataset_formatted = train_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=train_set.column_names
)
validation_dataset_formatted = val_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=val_set.column_names
)
test_dataset_formatted = test_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=test_set.column_names
)

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    eval_strategy="steps",
    report_to="tensorboard",
    dataloader_num_workers=0,
    max_steps=-1,
)


trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_formatted,
    eval_dataset=validation_dataset_formatted,
    peft_config=lora_config,
    args=training_arguments,
)

print("\nTrainer Configuration Completed。")

Converting train dataset to ChatML:   0%|          | 0/958 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/958 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/120 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



Trainer Configuration Completed。


In [None]:
print("Model Tuning Start...")
trainer.train()
print("Model Tuning Completed")

Model Tuning Start...


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,2.7712,2.907349
20,2.7839,2.871433
30,2.6589,2.81197
40,2.6803,2.730389
50,2.7796,2.635788
60,2.5444,2.540861
70,2.5494,2.431639
80,2.2831,2.318348
90,2.3163,2.212548
100,2.1564,2.081156


  return fn(*args, **kwargs)


Model Tuning Completed


In [None]:
# Save LoAR Adapter
output_model_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/QLoRA_Qwen"
trainer.save_model(output_model_path)
print(f"Fine-tuned LoRA LoRA Adapter saved to: {output_model_path}")

### Evaluation

In [9]:
base_model = "Qwen/Qwen3-8B"
adapter_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/fine_tuning/QLoRA_Qwen"

# quantisation
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(base_model)

from peft import PeftModel

model = PeftModel.from_pretrained(model, adapter_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/32.9k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU

def evaluate_model(model, tokenizer, dataset):
    model.eval()
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bleu = BLEU()

    all_rouge_scores = {'rouge1': {'fmeasure': 0, 'precision': 0, 'recall': 0},
                        'rouge2': {'fmeasure': 0, 'precision': 0, 'recall': 0},
                        'rougeL': {'fmeasure': 0, 'precision': 0, 'recall': 0}}
    all_bleu_scores = 0
    count = 0

    for example in dataset:
        prompt = example["text"].split("### Answer:")[0] + "### Answer:"
        reference = example["text"].split("### Answer:")[1].strip()

        input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = input_data.input_ids.to(model.device)
        attention_mask = input_data.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=100,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                attention_mask = attention_mask
            )

        generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True).strip()

        rouge_scores = scorer.score(reference, generated_text)
        for key in all_rouge_scores:
            all_rouge_scores[key]['fmeasure'] += rouge_scores[key].fmeasure
            all_rouge_scores[key]['precision'] += rouge_scores[key].precision
            all_rouge_scores[key]['recall'] += rouge_scores[key].recall

        all_bleu_scores += bleu.sentence_score(generated_text, [reference]).score

        count += 1

    avg_rouge_scores = {}
    for key in all_rouge_scores:
        avg_rouge_scores[key] = {metric: score / count for metric, score in all_rouge_scores[key].items()}
    avg_bleu_score = all_bleu_scores / count

    return avg_rouge_scores, avg_bleu_score

print("\nEvaluating on Test Set...")

test_dataset_formatted = test_set.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=test_set.column_names
)
test_rouge, test_bleu = evaluate_model(model, tokenizer, test_dataset_formatted)
print("Test Set Metrics:")
print("ROUGE Scores:", test_rouge)
print("BLEU Score:", test_bleu)


Evaluating on Test Set...




Test Set Metrics:
ROUGE Scores: {'rouge1': {'fmeasure': 0.37235693681367865, 'precision': 0.38859348254507936, 'recall': 0.37136418312583697}, 'rouge2': {'fmeasure': 0.20766283783381595, 'precision': 0.21929105212020264, 'recall': 0.2041120533113388}, 'rougeL': {'fmeasure': 0.3162744816674813, 'precision': 0.3314280068067702, 'recall': 0.31397758915064305}}
BLEU Score: 14.505540775232165


In [10]:
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def chat_with_model(model, tokenizer):
    print("Start chatting with the model. Type 'quit' to exit.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'quit':
            break

        # Format the user input as a prompt
        prompt = f"### Question:\n{user_input}\n\n### Thought Process:\n"

        # Ensure input_ids are on the correct device
        input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = input_data.input_ids.to(model.device)
        attention_mask = input_data.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=200, # Adjust based on desired response length
                num_return_sequences=1,
                attention_mask=attention_mask,
                pad_token_id=tokenizer.eos_token_id,
                temperature=0.7, # Adjust for creativity (lower is less creative)
                top_p=0.9,       # Adjust for sampling
                do_sample=True   # Enable sampling
            )

        generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True).strip()

        # Find the start of the Answer section to only display the answer
        answer_start_index = generated_text.find("### Answer:")
        if answer_start_index != -1:
            # Find the start of the next section (if any)
            next_section_index = generated_text.find("###", answer_start_index + len("### Answer:"))
            if next_section_index != -1:
                generated_answer = generated_text[answer_start_index + len("### Answer:"):next_section_index].strip()
            else:
                generated_answer = generated_text[answer_start_index + len("### Answer:"):].strip()
            print(f"Model: {generated_answer}")
        else:
            # If no explicit Answer section, print the generated text after the Thought Process
            thought_process_end_index = generated_text.find("\n\n### Answer:")
            if thought_process_end_index != -1:
                print(f"Model: {generated_text[:thought_process_end_index].strip()}")
            else:
                 print(f"Model: {generated_text.strip()}")


# Start the chat interface
chat_with_model(model, tokenizer)

Start chatting with the model. Type 'quit' to exit.
You: What items does Haymitch give his mother and brother before leaving for the Games?
Model: Haymitch gives his mother a 'pocketknife' and his brother a 'tape measure'.
You: What is the name of the plant at Plutarch’s conservatory with a faint sweet rotten smell?
Model: The plant at Plutarch’s conservatory with a 'faint sweet rotten smell' is the 'marijuana'.
You: What item does Maysilee use to remove a spike from her cheek?
Model: Maysilee uses a 'bladed knife' to remove a spike from her cheek.
You: What is the item Maysilee knuckle-rolls on the train, and what is it made of?
Model: Maysilee knuckle-rolls a 'clear rubber tube, like a drainpipe' on the train.
You: What are the squirrel mutts’ primary movement method and coat color?
Model: The squirrel mutts primarily 'spray'  and have a 'squirrel color' coat with 'white undersides'.
You: quit


In [12]:
import json

mcq_data_path = "/content/drive/MyDrive/06 PG/01 SMU/01 Course/03 Track Elective/02 CS614 Generative AI with Large Language Models/12 Group Project/CS614-Project/mcq_evaluation/qa_data.json"

# Load the MCQ data
with open(mcq_data_path, 'r') as f:
    mcq_data = json.load(f)

In [13]:
mcq_data

[{'question': "In the novel 'All Fours', what reason does the narrator's neighbor, Brian, give in a note for his concern?",
  'options': {'A': 'He saw someone trying to break into her car.',
   'B': 'He witnessed a stranger using a telephoto lens to photograph her house.',
   'C': 'He found a suspicious package on her doorstep.',
   'D': 'He noticed a coyote lingering in her front yard.'},
  'answer': 'B'},
 {'question': "According to Harris's theory in the book, what is the primary characteristic of a 'Parker'?",
  'options': {'A': 'They enjoy long, uneventful drives and simple pleasures.',
   'B': 'They are cautious and always plan ahead.',
   'C': 'They need to perform impossible tasks and receive applause.',
   'D': 'They are introverted and prefer to stay home.'},
  'answer': 'C'},
 {'question': "What is the narrator's stated motivation for deciding to drive across the country to New York?",
  'options': {'A': "To become a more grounded and 'chill' person, like a 'Driver'.",
   'B

In [16]:
def evaluate_mcq(model, tokenizer, mcq_data):
    model.eval()
    correct_count = 0
    total_count = 0

    for item in mcq_data:
        question = item["question"]
        options = "\n".join([f"{key}. {value}" for key, value in item["options"].items()])
        correct_answer_key = item["answer"]

        prompt = f"### Question:\n{question}\n{options}\n\n### Thought Process:\n"

        input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
        input_ids = input_data.input_ids.to(model.device)
        attention_mask = input_data.attention_mask.to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                max_new_tokens=200,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                attention_mask=attention_mask,
                temperature=0.1, # Keep temperature low for deterministic answers,
                do_sample=False  # Disable sampling for deterministic answers
            )

        generated_text = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True).strip()

        # Extract the predicted answer key from the generated text
        # This part might need adjustment based on the exact output format of your model
        # Look for the part after "### Answer:" and try to match it to the option keys (A, B, C, D)
        predicted_answer_key = None
        answer_start_index = generated_text.find("### Answer:")
        if answer_start_index != -1:
            answer_content = generated_text[answer_start_index + len("### Answer:"):].strip()
            # Simple heuristic: look for a single character (like A, B, C, D) at the start
            if len(answer_content) > 0 and answer_content[0] in item["options"].keys():
                predicted_answer_key = answer_content[0]
            elif len(answer_content) > 1 and answer_content[1] == '.' and answer_content[0] in item["options"].keys():
                 predicted_answer_key = answer_content[0]


        print(f"Question: {question}")
        print(f"Predicted Answer Key: {predicted_answer_key}")
        print(f"Correct Answer Key: {correct_answer_key}")


        if predicted_answer_key is not None and predicted_answer_key == correct_answer_key:
            correct_count += 1
            print("Result: Correct")
        else:
            print("Result: Incorrect")

        total_count += 1
        print("-" * 20)


    accuracy = (correct_count / total_count) * 100 if total_count > 0 else 0
    return accuracy

print("\nEvaluating on MCQ data...")
mcq_accuracy = evaluate_mcq(model, tokenizer, mcq_data)
print(f"\nMCQ Accuracy: {mcq_accuracy:.2f}%")

torch.cuda.empty_cache()

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Evaluating on MCQ data...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: In the novel 'All Fours', what reason does the narrator's neighbor, Brian, give in a note for his concern?
Predicted Answer Key: B
Correct Answer Key: B
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: According to Harris's theory in the book, what is the primary characteristic of a 'Parker'?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the narrator's stated motivation for deciding to drive across the country to New York?
Predicted Answer Key: A
Correct Answer Key: A
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Where does the narrator first meet Davey?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does the narrator do with the $20,000 she received from a whiskey company?
Predicted Answer Key: A
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the real reason the pop star Arkanda wanted to meet with the narrator?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Davey reveal to be his secret artistic passion?
Predicted Answer Key: D
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who is Audra in relation to Davey?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the final agreement that the narrator and Harris come to about their marriage?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What role does Harris play in order to have 'make-up sex' with the narrator after her trip?
Predicted Answer Key: A
Correct Answer Key: D
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who is Kris, the person the narrator begins to date?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What event causes the final breakup between the narrator and Kris?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: According to the narrator's father, why did his mother, Esther, commit suicide?
Predicted Answer Key: C
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What object with 'CALL ME' painted on it does the narrator leave as a message for Davey?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What surprising thing does the narrator learn about her mother's entry into menopause?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who does the 'telephotographer' from the beginning of the novel turn out to be?
Predicted Answer Key: C
Correct Answer Key: D
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the significance of the saluting gesture between the narrator and Harris?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: When the narrator experiences severe vertigo in her New York hotel, what does Jordi suggest is the cause?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the name of Sam's beloved but neglectful wonder-nanny who later worked for Arkanda?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: During the narrator's stay at the Excelsior, what intimate act does Davey perform for her in the bathroom?
Predicted Answer Key: D
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: At the end of the novel, what does the narrator realize during Davey's dance performance in New York?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: After her fight with Harris, what does the narrator learn about the shared history of all the women in her family?
Predicted Answer Key: C
Correct Answer Key: D
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the 'Third Thing,' according to Jordi's explanation of a Quaker concept?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What item does the narrator buy back from Goodwill that she had previously donated?
Predicted Answer Key: A
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: In the end, who completes the star-patterned tile floor in room 321?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What gift does Haymitch's girlfriend, Lenore Dove, give him for his sixteenth birthday?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: How does Haymitch Abernathy become a tribute in the 50th Hunger Games?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What tragic event occurs during the chariot parade at the opening ceremonies?
Predicted Answer Key: D
Correct Answer Key: D
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who are the two mentors assigned to the District 12 tributes?
Predicted Answer Key: B
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the name the alliance of non-Career tributes gives themselves?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: How does Haymitch discover that the water and plants in the arena are poisonous?
Predicted Answer Key: B
Correct Answer Key: B
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the primary weapon Maysilee Donner uses effectively in the arena?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What special feature does Haymitch discover at the edge of the arena beyond the woods?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: How is Silka, the female tribute from District 1, ultimately killed?
Predicted Answer Key: D
Correct Answer Key: D
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What happens to Haymitch's family while he is in the Hunger Games?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: How does Lenore Dove die?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the secret rebel plan that Beetee shares with Haymitch?
Predicted Answer Key: None
Correct Answer Key: D
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What nickname do the Newcomers decide to call the Career tributes?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: How does Maysilee Donner die in the arena?
Predicted Answer Key: B
Correct Answer Key: B
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the first thing President Snow says to Haymitch after he wins the Games?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the overall design or shape of the 50th Hunger Games arena?
Predicted Answer Key: None
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: Who provides the District 12 tributes with proper outfits for the interviews after their stylist, Magno Stift, fails to do so?
Predicted Answer Key: A
Correct Answer Key: D
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Haymitch do with the pitcher of milk he is supposed to give to a sick President Snow?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Haymitch realize about the beautiful landscape of the arena?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Plutarch Heavensbee tell Haymitch about the arena's sun?
Predicted Answer Key: None
Correct Answer Key: C
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Haymitch throw at the force field in his final move against Silka?
Predicted Answer Key: A
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: After the Games, what does Haymitch see on a wall in a back alley that confirms Lenore Dove's rebellious activities?
Predicted Answer Key: C
Correct Answer Key: C
Result: Correct
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What does Maysilee use to create a makeshift glue to repair Kerna's broken sunflower token?
Predicted Answer Key: A
Correct Answer Key: B
Result: Incorrect
--------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Question: What is the first muttation Haymitch successfully fights off using his flint striker and a gas plant?
Predicted Answer Key: A
Correct Answer Key: C
Result: Incorrect
--------------------
Question: What is the final 'poster' Haymitch creates in the arena?
Predicted Answer Key: C
Correct Answer Key: D
Result: Incorrect
--------------------

MCQ Accuracy: 34.00%
