In [None]:
from datasets import load_dataset
import os

finetuning_path ='C:/Users/user/PLUE/PLUE-main/data'
data_path = os.path.join(finetuning_path, 'policyqa')
dataset = load_dataset('json', data_files={'train': os.path.join(data_path, 'train.json'), 'test': os.path.join(data_path, 'test.json'),'dev':os.path.join(data_path,'dev.json')}, field="data")

print("Train dataset size:", len(dataset['train']))
print("Dev dataset size:", len(dataset['dev']))

print("\nTrain dataset snippet:")
for i in range(2):
    print(dataset['train'][i])

print("\nDev dataset snippet:")
for i in range(2):
    print(dataset['dev'][i])

Train dataset size: 75
Dev dataset size: 20

Train dataset snippet:
{'title': 'sidearmsports.com', 'paragraphs': [{'context': 'INFORMATION WE COLLECT ABOUT YOU When you interact with SIDEARM Services, we collect: (1) personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and (2) non-personal information transmitted through technology, including tracking information, which is also collected by third parties.', 'index': 3, 'qas': [{'answers': [{'answer_start': 90, 'text': 'personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and'}, {'answer_start': 111, 'text': 'you supply'}, {'answer_start': 90, 'text': 'personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email addr

In [None]:
from transformers import AutoTokenizer

def preprocess_function(examples):
    questions = []
    contexts = []
    answers = []
    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    questions.append(question)
                    contexts.append(context)
                    answers.append(answer)

    return {
        'question': questions,
        'context': contexts,
        'answers': answers
    }

processed_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-large')

def tokenize_function(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    return tokenized_examples

tokenized_datasets = processed_dataset.map(tokenize_function, batched=True, remove_columns=["question", "context", "answers"])

In [None]:
import os

tokenized_datasets_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_roberta_large')
tokenized_datasets.save_to_disk(tokenized_datasets_path)

print(f"Tokenized datasets saved to: {tokenized_datasets_path}")

Saving the dataset (0/1 shards):   0%|          | 0/26861 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6354 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6044 [00:00<?, ? examples/s]

Tokenized datasets saved to: C:/Users/user/PLUE/PLUE-main/data\tokenized_datasets_policyqa_finetuning_roberta_large


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = 'FacebookAI/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

print(f"Loaded model: {model_name}")

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: FacebookAI/roberta-large


In [None]:
from transformers import AutoTokenizer
import os
import collections
import numpy as np

#

def prepare_train_features_with_labels(examples):
    # This function needs to process the nested structure of the dataset
    questions = []
    contexts = []
    answers = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']

                if len(qa['answers']) > 0:
                     # Take the first answer for training labels
                    answer = qa['answers'][0]
                else:
                    # Handle cases with no answers if necessary, for now setting to empty to avoid errors
                    answer = {'answer_start': -1, 'text': ''} # Placeholder for no answer

                questions.append(question)
                contexts.append(context)
                answers.append(answer)
                question_ids.append(qa['id']) # Append the question ID


    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )


    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # We keep the offsets mapping now, which will be useful later (when we want to compute the exact answer span in the original context).
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Add example_ids (which are the question IDs) to the tokenized features
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to context (the second one).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1

        # One example can give several features, so its original answer info is stored in the example (index stored in sample_mapping).
        sample_index = sample_mapping[i]
        # Get the answer for the original example corresponding to this feature
        current_answer = answers[sample_index]


        # If no answers are given or the answer is empty, set the cls_index as answer.
        if current_answer['answer_start'] == -1 or len(current_answer["text"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the original text.
            start_char = current_answer["answer_start"]
            end_char = start_char + len(current_answer["text"])

            # Start token index of the context in the window.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_index:
                token_start_index += 1

            # End token index of the context in the window.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_index:
                token_end_index -= 1

            # Detect if the answer is out of the span (feature does not contain the QA context answer) and set corresponding token indices to the CLS index.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could also use the character index of the answer and then find the token index around it, but this is easier.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

def prepare_validation_features(examples):
    # This function needs to process the nested structure of the dataset to get question and context
    questions = []
    contexts = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id']) # Append the question ID

    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # We keep the offsets mapping now, which will be useful later (when we want to compute the exact answer span in the original context).
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    # Add example_ids (which are the question IDs) to the tokenized features
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Apply the tokenization and labeling function to the training dataset
train_tokenized_dataset = dataset['train'].map(
    prepare_train_features_with_labels,
    batched=True,
    remove_columns=dataset["train"].column_names # Remove original columns after processing
)

# Apply the validation feature preparation function to the dev dataset
dev_tokenized_dataset = dataset['dev'].map(
    prepare_validation_features,
    batched=True,
    remove_columns=dataset["dev"].column_names # Remove original columns after processing
)

# Combine the tokenized datasets back into a DatasetDict
tokenized_datasets_with_labels = {
    'train': train_tokenized_dataset,
    'dev': dev_tokenized_dataset,
    'test': dataset['test'].map( # Also apply validation features to test set if needed for future evaluation
        prepare_validation_features,
        batched=True,
        remove_columns=dataset["test"].column_names
    )
}

print("Tokenized datasets with labels and validation features created.")
print(tokenized_datasets_with_labels)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Tokenized datasets with labels and validation features created.
{'train': Dataset({
    features: ['input_ids', 'attention_mask', 'example_id', 'start_positions', 'end_positions'],
    num_rows: 17056
}), 'dev': Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3809
}), 'test': Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 4152
})}


In [None]:
import os
from datasets import DatasetDict # Import DatasetDict

# Define the path to save the tokenized datasets with labels
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')

# Convert the dictionary to a DatasetDict
tokenized_datasets_with_labels_dict = tokenized_datasets_with_labels
tokenized_datasets_with_labels = DatasetDict(tokenized_datasets_with_labels_dict)


# Save the tokenized datasets with labels
tokenized_datasets_with_labels.save_to_disk(tokenized_datasets_with_labels_path)

print(f"Tokenized datasets with labels saved to: {tokenized_datasets_with_labels_path}")

Saving the dataset (0/1 shards):   0%|          | 0/17056 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3809 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4152 [00:00<?, ? examples/s]

Tokenized datasets with labels saved to: C:/Users/user/PLUE/PLUE-main/data\tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels


In [None]:
original_dev_dataset = dataset['dev']


In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering
from datasets import load_from_disk
import os
import evaluate
import numpy as np



# Load the tokenized datasets with labels
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)

# We need the original dev dataset for post-processing predictions and computing metrics
original_dev_dataset = dataset['dev']


# Step 4: Set up training arguments
# Optimize for the provided hardware (1x 32GB GPU)
# A batch size of 16 or 32 should be feasible. Let's start with 16.
training_args = TrainingArguments(
    output_dir=os.path.join(finetuning_path, 'FacebookAI/roberta-large'), # Output directory for the fine-tuned model
    eval_strategy="epoch", # Evaluate at the end of each epoch
    learning_rate=5e-5, # Standard learning rate for fine-tuning
    per_device_train_batch_size=16, # Increased batch size for training
    per_device_eval_batch_size=16, # Increased batch size for evaluation
    num_train_epochs=3, # Number of training epochs (can be adjusted)
    weight_decay=0.01, # Weight decay to prevent overfitting
    push_to_hub=False, # Set to True if you want to push to the Hugging Face Hub
    report_to="none", # Disable reporting to external services
    save_steps=10000, # Save checkpoint every 10000 steps
    save_total_limit=2, # Keep only the last 2 checkpoints
    fp16=True, # Enable mixed precision training for faster training
)

# Function to compute metrics (F1 and Exact Match)
def compute_metrics(eval_pred):
    metric = evaluate.load("squad")
    start_logits, end_logits = eval_pred.predictions

    return {} # Returning empty dictionary as a temporary fix

# Step 5: Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_with_labels['train'],
    eval_dataset=tokenized_datasets_with_labels['dev'], # Evaluate on the dev set
    tokenizer=tokenizer, # Pass the tokenizer
    compute_metrics=compute_metrics, # Add the compute_metrics function
)

print("Training arguments and Trainer set up with compute_metrics.")

Training arguments and Trainer set up with compute_metrics.


  trainer = Trainer(


In [None]:
# Step 6: Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.8706,No log
2,1.8737,No log
3,1.4522,No log


TrainOutput(global_step=3198, training_loss=1.8016101963599076, metrics={'train_runtime': 811.449, 'train_samples_per_second': 63.058, 'train_steps_per_second': 3.941, 'total_flos': 4.75200793732055e+16, 'train_loss': 1.8016101963599076, 'epoch': 3.0})

In [None]:
import torch
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer # Import AutoTokenizer
import os

 from cell 9d44554b

# Step 8: Evaluate the fine-tuned model

# Need the original dev dataset for post-processing predictions and computing metrics
original_dev_dataset = dataset['dev']

def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id']) # Append the question ID

    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # For evaluation, we will need to convert our predictions to spans from the original context. Let's save the example at the token level for that.
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    # We keep the offsets mapping now, which will be useful later
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples


# Prepare the dev dataset features directly in this cell for evaluation
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names # Remove original columns after processing
)

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]


# Get predictions on the dev set using the trainer
raw_predictions = trainer.predict(dev_features_dataset)


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    # The original dataset has nested structure, we need to flatten it to map example ids (question ids)
    # to the correct original example for context and answers.
    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}


    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        # The example_id in features is the question ID
        features_per_example[feature["example_id"]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    # Let's loop over all the question IDs!
    for qid, qa_info in tqdm(qas_by_id.items()):
        # Those are the indices of the features associated to the current question ID.
        feature_indices = features_per_example[qid]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []

        context = qa_info["context"]
        # Looping through all the features associated to the current example (question).
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            # Explicitly check if 'offset_mapping' is in the feature dictionary
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 # Add debugging prints here
                 print(f"DEBUG: Examining feature at index {feature_index} for Question ID: {qid}")
                 print(f"DEBUG: Type of features: {type(features)}")
                 if isinstance(features, list):
                     print(f"DEBUG: Type of features[feature_index]: {type(features[feature_index])}")
                     if isinstance(features[feature_index], dict):
                         print(f"DEBUG: Keys in features[feature_index]: {features[feature_index].keys()}")
                     else:
                         print(f"DEBUG: features[feature_index] is not a dictionary.")
                 else:
                     print(f"DEBUG: features is not a list.")

                 continue # Skip this feature if offset_mapping is missing or None

            offset_mapping = features[feature_index]["offset_mapping"]
            # Input ids for this feature
            input_ids = features[feature_index]["input_ids"]

            # Update minimum null prediction.
            # Find the index of the CLS token.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the start and end position.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    # Don't consider predictions where the start or end token is the CLS token.
                    # Assuming CLS token is at the beginning
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    # Get the character span in the original context
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not found a single non-null prediction, we create a fake
            # prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        # For SQuAD v1.1, we always pick the best non-null answer.
        predictions[qid] = best_answer["text"]

    return predictions


# Postprocess the raw predictions
# Pass original_dev_dataset and the converted list of dev features
predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

# Compute and print the overall F1-score and Exact Match
metric = evaluate.load("squad")
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
references = []
for example in original_dev_dataset:
    for paragraph in example["paragraphs"]:
        for qa in paragraph["qas"]:
            # For each question, create a reference entry
            answers = qa["answers"]
            # Extract lists of text and answer_start from the list of answer dictionaries
            answer_texts = [ans["text"] for ans in answers]
            answer_starts = [ans["answer_start"] for ans in answers]
            references.append({
                "id": qa["id"],
                "answers": {
                    "text": answer_texts,
                    "answer_start": answer_starts
                }
            })


metrics = metric.compute(predictions=formatted_predictions, references=references)
print(metrics)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

{'exact_match': 35.10107639800473, 'f1': 63.28809689422713}


In [None]:
# Initialize a list to store metrics from all runs.
# Assuming the metrics from the first run are already in the 'metrics' variable
all_metrics = [metrics]

# Perform two additional training and evaluation runs
num_additional_runs = 2

for i in range(num_additional_runs):
    print(f"\nStarting training and evaluation run {i + 2}...")

    # Train the model
    trainer.train()
    print(f"Training run {i + 2} finished.")

    # Evaluate the model
    print(f"Starting evaluation after run {i + 2}...")
    raw_predictions_run = trainer.predict(dev_features_dataset)
    predictions_run = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions_run)
    formatted_predictions_run = [{"id": k, "prediction_text": v} for k, v in predictions_run.items()]
    metrics_run = metric.compute(predictions=formatted_predictions_run, references=references)
    print(f"Evaluation after run {i + 2}:", metrics_run)

    # Store the metrics
    all_metrics.append(metrics_run)


Starting training and evaluation run 2...


Epoch,Training Loss,Validation Loss
1,1.7028,No log
2,1.3518,No log
3,0.9725,No log


Training run 2 finished.
Starting evaluation after run 2...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation after run 2: {'exact_match': 33.945917563665006, 'f1': 62.21115339782965}

Starting training and evaluation run 3...


Epoch,Training Loss,Validation Loss
1,1.3186,No log
2,1.0109,No log
3,0.667,No log


Training run 3 finished.
Starting evaluation after run 3...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation after run 3: {'exact_match': 32.291940141769494, 'f1': 61.975467284372286}


In [None]:
# Calculate and display the average metrics across all three runs

# Assuming all_metrics is a list of dictionaries with 'exact_match' and 'f1' keys
avg_exact_match = np.mean([m['exact_match'] for m in all_metrics])
avg_f1 = np.mean([m['f1'] for m in all_metrics])

print("\nAverage Metrics Across Three Runs:")
print(f"Average Exact Match: {avg_exact_match:.2f}")
print(f"Average F1 Score: {avg_f1:.2f}")


Average Metrics Across Three Runs:
Average Exact Match: 33.78
Average F1 Score: 62.49


In [None]:
import numpy as np
import os
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import torch # Import torch for device handling

# Define the learning rates to experiment with
learning_rates_to_test = [1e-5, 3e-5, 5e-5, 1e-4] # Example learning rates

# List to store results for each learning rate
learning_rate_results = []

# Load the tokenized datasets with labels and the original dev dataset
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)
original_dev_dataset = dataset['dev']

# Prepare the dev dataset features for evaluation (assuming this was done in a previous cell and is available,
# but redefining here for clarity and to ensure dependencies within this experiment block)
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples

# Ensure dev_features_dataset and dev_features_list are available
try:
    # Check if dev_features_dataset already exists from a previous run
    if 'dev_features_dataset' not in locals() or dev_features_dataset is None:
         dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )
except NameError:
     # If not, create it
     dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]


# Re-define postprocess_qa_predictions and compute_metrics to ensure they are available in this scope
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 continue

            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Load the SQuAD metric
metric = evaluate.load("squad")

# Main loop for learning rate experimentation
for lr in learning_rates_to_test:
    print(f"\nExperimenting with Learning Rate: {lr}")

    # List to store metrics for the current learning rate across 3 runs
    current_lr_metrics = []

    for run_num in range(1, 4): # Perform 3 runs for each learning rate
        print(f"--- Starting run {run_num} for LR = {lr} ---")

        # Re-initialize the model for each run to ensure a clean start
        # Assuming 'model_name' is defined from a previous cell
        try:
            if 'model_name' not in locals() or model_name is None:
                model_name = 'FacebookAI/roberta-large'
        except NameError:
             model_name = 'FacebookAI/roberta-large'

        # Assuming 'tokenizer' is available globally or from a previous cell
        try:
            if 'tokenizer' not in locals() or tokenizer is None:
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
        except NameError:
             tokenizer = AutoTokenizer.from_pretrained(model_name)

        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Update training arguments with the current learning rate
        # Ensure other parameters are consistent with previous training runs
        training_args = TrainingArguments(
            output_dir=os.path.join(finetuning_path, f'lr_{lr}_run_{run_num}'), # Unique output dir for each run
            eval_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=False,
            report_to="none",
            save_steps=10000,
            save_total_limit=1, # Keep only the last checkpoint for this run
            fp16=True,
             # compute_metrics will be handled manually after prediction
        )

        # Create a new Trainer instance for each run
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets_with_labels['train'],
            eval_dataset=tokenized_datasets_with_labels['dev'],
            tokenizer=tokenizer,
            # compute_metrics is not used here as we do manual postprocessing and evaluation
        )

        # Train the model
        trainer.train()
        print(f"--- Training run {run_num} for LR = {lr} finished. ---")

        # Evaluate the model and get predictions
        print(f"--- Starting evaluation for run {run_num} for LR = {lr} ---")
        raw_predictions = trainer.predict(dev_features_dataset)

        # Postprocess predictions and compute metrics
        predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        # Ensure references are created only once outside the loop if they are static
        # For robustness, we can create references here, though less efficient if done repeatedly
        references = []
        for example in original_dev_dataset:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    answers = qa["answers"]
                    answer_texts = [ans["text"] for ans in answers]
                    answer_starts = [ans["answer_start"] for ans in answers]
                    references.append({
                        "id": qa["id"],
                        "answers": {
                            "text": answer_texts,
                            "answer_start": answer_starts
                        }
                    })

        metrics = metric.compute(predictions=formatted_predictions, references=references)
        print(f"Evaluation metrics for run {run_num} for LR = {lr}:", metrics)

        # Store metrics for this run
        current_lr_metrics.append(metrics)

    # Calculate average metrics for the current learning rate
    avg_exact_match = np.mean([m['exact_match'] for m in current_lr_metrics])
    avg_f1 = np.mean([m['f1'] for m in current_lr_metrics])

    print(f"\nAverage Metrics for Learning Rate {lr}:")
    print(f"Average Exact Match: {avg_exact_match:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")

    # Store the average results for this learning rate
    learning_rate_results.append({
        'learning_rate': lr,
        'average_exact_match': avg_exact_match,
        'average_f1': avg_f1
    })

# Print a summary of all learning rate experiment results
print("\n--- Summary of Learning Rate Experiment Results ---")
for result in learning_rate_results:
    print(f"LR: {result['learning_rate']}, Avg EM: {result['average_exact_match']:.2f}, Avg F1: {result['average_f1']:.2f}")


Experimenting with Learning Rate: 1e-05
--- Starting run 1 for LR = 1e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.7205,No log
2,2.2392,No log
3,2.0089,No log


--- Training run 1 for LR = 1e-05 finished. ---
--- Starting evaluation for run 1 for LR = 1e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for LR = 1e-05: {'exact_match': 33.55211341559465, 'f1': 61.16490733411565}
--- Starting run 2 for LR = 1e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6833,No log
2,2.1979,No log
3,1.9698,No log


--- Training run 2 for LR = 1e-05 finished. ---
--- Starting evaluation for run 2 for LR = 1e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for LR = 1e-05: {'exact_match': 33.68338146495143, 'f1': 61.525113233023774}
--- Starting run 3 for LR = 1e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6102,No log
2,2.1838,No log
3,1.9566,No log


--- Training run 3 for LR = 1e-05 finished. ---
--- Starting evaluation for run 3 for LR = 1e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for LR = 1e-05: {'exact_match': 33.49960619585193, 'f1': 61.825370093473914}

Average Metrics for Learning Rate 1e-05:
Average Exact Match: 33.58
Average F1 Score: 61.51

Experimenting with Learning Rate: 3e-05
--- Starting run 1 for LR = 3e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4481,No log
2,2.0118,No log
3,1.6415,No log


--- Training run 1 for LR = 3e-05 finished. ---
--- Starting evaluation for run 1 for LR = 3e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for LR = 3e-05: {'exact_match': 35.41611971646101, 'f1': 63.58476390836381}
--- Starting run 2 for LR = 3e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4989,No log
2,2.0271,No log
3,1.6563,No log


--- Training run 2 for LR = 3e-05 finished. ---
--- Starting evaluation for run 2 for LR = 3e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for LR = 3e-05: {'exact_match': 36.256235232344444, 'f1': 63.74992079506176}
--- Starting run 3 for LR = 3e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4398,No log
2,2.0351,No log
3,1.6635,No log


--- Training run 3 for LR = 3e-05 finished. ---
--- Starting evaluation for run 3 for LR = 3e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for LR = 3e-05: {'exact_match': 35.9149383040168, 'f1': 63.91770049872526}

Average Metrics for Learning Rate 3e-05:
Average Exact Match: 35.86
Average F1 Score: 63.75

Experimenting with Learning Rate: 5e-05
--- Starting run 1 for LR = 5e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6338,No log
2,2.1104,No log
3,1.6815,No log


--- Training run 1 for LR = 5e-05 finished. ---
--- Starting evaluation for run 1 for LR = 5e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for LR = 5e-05: {'exact_match': 35.83617747440273, 'f1': 63.32970929062659}
--- Starting run 2 for LR = 5e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5146,No log
2,2.0349,No log
3,1.6016,No log


--- Training run 2 for LR = 5e-05 finished. ---
--- Starting evaluation for run 2 for LR = 5e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for LR = 5e-05: {'exact_match': 35.31110527697558, 'f1': 63.2220247196135}
--- Starting run 3 for LR = 5e-05 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.658,No log
2,2.1467,No log
3,1.7215,No log


--- Training run 3 for LR = 5e-05 finished. ---
--- Starting evaluation for run 3 for LR = 5e-05 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for LR = 5e-05: {'exact_match': 35.25859805723287, 'f1': 62.69243087755235}

Average Metrics for Learning Rate 5e-05:
Average Exact Match: 35.47
Average F1 Score: 63.08

Experimenting with Learning Rate: 0.0001
--- Starting run 1 for LR = 0.0001 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.2539,No log
2,2.4131,No log
3,1.849,No log


--- Training run 1 for LR = 0.0001 finished. ---
--- Starting evaluation for run 1 for LR = 0.0001 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for LR = 0.0001: {'exact_match': 32.97453399842478, 'f1': 60.74931068707313}
--- Starting run 2 for LR = 0.0001 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.7025,No log
2,3.0341,No log
3,2.2958,No log


--- Training run 2 for LR = 0.0001 finished. ---
--- Starting evaluation for run 2 for LR = 0.0001 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for LR = 0.0001: {'exact_match': 28.091362562352323, 'f1': 56.134521883199014}
--- Starting run 3 for LR = 0.0001 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.8926,No log
2,2.3442,No log
3,1.8074,No log


--- Training run 3 for LR = 0.0001 finished. ---
--- Starting evaluation for run 3 for LR = 0.0001 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for LR = 0.0001: {'exact_match': 33.735888684694146, 'f1': 61.85167092551343}

Average Metrics for Learning Rate 0.0001:
Average Exact Match: 31.60
Average F1 Score: 59.58

--- Summary of Learning Rate Experiment Results ---
LR: 1e-05, Avg EM: 33.58, Avg F1: 61.51
LR: 3e-05, Avg EM: 35.86, Avg F1: 63.75
LR: 5e-05, Avg EM: 35.47, Avg F1: 63.08
LR: 0.0001, Avg EM: 31.60, Avg F1: 59.58


In [None]:
import numpy as np
import os
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import torch # Import torch for device handling

# Define the number of epochs to experiment with
epochs_to_test = [1, 2, 5, 10]

# Define the best learning rate found from the previous experiment
best_learning_rate = 3e-5

# List to store results for each epoch value
epoch_results = []

# Load the tokenized datasets with labels and the original dev dataset
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)
original_dev_dataset = dataset['dev']

# Ensure dev_features_dataset and dev_features_list are available
try:
    # Check if dev_features_dataset already exists from a previous run
    if 'dev_features_dataset' not in locals() or dev_features_dataset is None:
         dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )
except NameError:
     # If not, create it
     dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]


# Re-define postprocess_qa_predictions and compute_metrics to ensure they are available in this scope
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 continue

            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Load the SQuAD metric
metric = evaluate.load("squad")

# Main loop for epoch experimentation
for num_epochs in epochs_to_test:
    print(f"\nExperimenting with Number of Epochs: {num_epochs}")

    # List to store metrics for the current epoch value across 3 runs
    current_epoch_metrics = []

    for run_num in range(1, 4): # Perform 3 runs for each epoch value
        print(f"--- Starting run {run_num} for Epochs = {num_epochs} ---")

        # Re-initialize the model for each run to ensure a clean start
        try:
            if 'model_name' not in locals() or model_name is None:
                model_name = 'FacebookAI/roberta-large'
        except NameError:
             model_name = 'FacebookAI/roberta-large'

        try:
            if 'tokenizer' not in locals() or tokenizer is None:
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
        except NameError:
             tokenizer = AutoTokenizer.from_pretrained(model_name)


        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Update training arguments with the best learning rate and current number of epochs
        training_args = TrainingArguments(
            output_dir=os.path.join(finetuning_path, f'epochs_{num_epochs}_run_{run_num}'), # Unique output dir
            eval_strategy="epoch",
            learning_rate=best_learning_rate, # Use the best learning rate
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=num_epochs, # Use the current epoch value
            weight_decay=0.01,
            push_to_hub=False,
            report_to="none",
            save_steps=10000,
            save_total_limit=1,
            fp16=True,
        )

        # Create a new Trainer instance for each run
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets_with_labels['train'],
            eval_dataset=tokenized_datasets_with_labels['dev'],
            tokenizer=tokenizer,
        )

        # Train the model
        trainer.train()
        print(f"--- Training run {run_num} for Epochs = {num_epochs} finished. ---")

        # Evaluate the model and get predictions
        print(f"--- Starting evaluation for run {run_num} for Epochs = {num_epochs} ---")
        raw_predictions = trainer.predict(dev_features_dataset)

        # Postprocess predictions and compute metrics
        predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        # Ensure references are created only once outside the loop if they are static
        references = []
        for example in original_dev_dataset:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    answers = qa["answers"]
                    answer_texts = [ans["text"] for ans in answers]
                    answer_starts = [ans["answer_start"] for ans in answers]
                    references.append({
                        "id": qa["id"],
                        "answers": {
                            "text": answer_texts,
                            "answer_start": answer_starts
                        }
                    })

        metrics = metric.compute(predictions=formatted_predictions, references=references)
        print(f"Evaluation metrics for run {run_num} for Epochs = {num_epochs}:", metrics)

        # Store metrics for this run
        current_epoch_metrics.append(metrics)

    # Calculate average metrics for the current epoch value
    avg_exact_match = np.mean([m['exact_match'] for m in current_epoch_metrics])
    avg_f1 = np.mean([m['f1'] for m in current_epoch_metrics])

    print(f"\nAverage Metrics for Number of Epochs {num_epochs}:")
    print(f"Average Exact Match: {avg_exact_match:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")

    # Store the average results for this epoch value
    epoch_results.append({
        'num_epochs': num_epochs,
        'average_exact_match': avg_exact_match,
        'average_f1': avg_f1
    })

# Print a summary of all epoch experiment results
print("\n--- Summary of Epoch Experiment Results ---")
for result in epoch_results:
    print(f"Epochs: {result['num_epochs']}, Avg EM: {result['average_exact_match']:.2f}, Avg F1: {result['average_f1']:.2f}")


Experimenting with Number of Epochs: 1
--- Starting run 1 for Epochs = 1 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.3826,No log


--- Training run 1 for Epochs = 1 finished. ---
--- Starting evaluation for run 1 for Epochs = 1 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for Epochs = 1: {'exact_match': 32.21317931215542, 'f1': 60.31015273362508}
--- Starting run 2 for Epochs = 1 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.3841,No log


--- Training run 2 for Epochs = 1 finished. ---
--- Starting evaluation for run 2 for Epochs = 1 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for Epochs = 1: {'exact_match': 32.86951955893935, 'f1': 61.112364916063825}
--- Starting run 3 for Epochs = 1 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4384,No log


--- Training run 3 for Epochs = 1 finished. ---
--- Starting evaluation for run 3 for Epochs = 1 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for Epochs = 1: {'exact_match': 31.976896823313204, 'f1': 59.75687959580547}

Average Metrics for Number of Epochs 1:
Average Exact Match: 32.35
Average F1 Score: 60.39

Experimenting with Number of Epochs: 2
--- Starting run 1 for Epochs = 2 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4535,No log
2,1.9764,No log


--- Training run 1 for Epochs = 2 finished. ---
--- Starting evaluation for run 1 for Epochs = 2 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for Epochs = 2: {'exact_match': 34.18220005250722, 'f1': 62.387755975907304}
--- Starting run 2 for Epochs = 2 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6468,No log
2,2.0839,No log


--- Training run 2 for Epochs = 2 finished. ---
--- Starting evaluation for run 2 for Epochs = 2 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for Epochs = 2: {'exact_match': 33.97217117353636, 'f1': 61.714918576595124}
--- Starting run 3 for Epochs = 2 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5583,No log
2,2.0586,No log


--- Training run 3 for Epochs = 2 finished. ---
--- Starting evaluation for run 3 for Epochs = 2 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for Epochs = 2: {'exact_match': 34.81228668941979, 'f1': 62.7086430109591}

Average Metrics for Number of Epochs 2:
Average Exact Match: 34.32
Average F1 Score: 62.27

Experimenting with Number of Epochs: 5
--- Starting run 1 for Epochs = 5 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5769,No log
2,2.0903,No log
3,1.765,No log
4,1.4383,No log
5,1.1765,No log


--- Training run 1 for Epochs = 5 finished. ---
--- Starting evaluation for run 1 for Epochs = 5 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for Epochs = 5: {'exact_match': 36.01995274350223, 'f1': 63.92013923800081}
--- Starting run 2 for Epochs = 5 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5244,No log
2,2.1009,No log
3,1.7395,No log
4,1.4045,No log
5,1.1509,No log


--- Training run 2 for Epochs = 5 finished. ---
--- Starting evaluation for run 2 for Epochs = 5 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for Epochs = 5: {'exact_match': 34.129692832764505, 'f1': 63.417150746230604}
--- Starting run 3 for Epochs = 5 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.5094,No log
2,2.0909,No log
3,1.7501,No log
4,1.4235,No log
5,1.1534,No log


--- Training run 3 for Epochs = 5 finished. ---
--- Starting evaluation for run 3 for Epochs = 5 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for Epochs = 5: {'exact_match': 35.28485166710423, 'f1': 63.480855502636466}

Average Metrics for Number of Epochs 5:
Average Exact Match: 35.14
Average F1 Score: 63.61

Experimenting with Number of Epochs: 10
--- Starting run 1 for Epochs = 10 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4623,No log
2,2.0404,No log
3,1.7652,No log
4,1.4715,No log
5,1.2005,No log
6,0.9619,No log
7,0.7755,No log
8,0.6255,No log
9,0.4886,No log
10,0.3894,No log


--- Training run 1 for Epochs = 10 finished. ---
--- Starting evaluation for run 1 for Epochs = 10 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 1 for Epochs = 10: {'exact_match': 32.44946180099764, 'f1': 63.07457228072802}
--- Starting run 2 for Epochs = 10 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.466,No log
2,2.0669,No log
3,1.787,No log
4,1.483,No log
5,1.2278,No log
6,1.0253,No log
7,0.8245,No log
8,0.6713,No log
9,0.5466,No log
10,0.4425,No log


--- Training run 2 for Epochs = 10 finished. ---
--- Starting evaluation for run 2 for Epochs = 10 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 2 for Epochs = 10: {'exact_match': 34.313468101864004, 'f1': 63.42934152611029}
--- Starting run 3 for Epochs = 10 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.4889,No log
2,2.1148,No log
3,1.8177,No log
4,1.5129,No log
5,1.2509,No log
6,1.0413,No log
7,0.8386,No log
8,0.6813,No log
9,0.5667,No log
10,0.462,No log


--- Training run 3 for Epochs = 10 finished. ---
--- Starting evaluation for run 3 for Epochs = 10 ---


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation metrics for run 3 for Epochs = 10: {'exact_match': 32.47571541086899, 'f1': 62.54598389184993}

Average Metrics for Number of Epochs 10:
Average Exact Match: 33.08
Average F1 Score: 63.02

--- Summary of Epoch Experiment Results ---
Epochs: 1, Avg EM: 32.35, Avg F1: 60.39
Epochs: 2, Avg EM: 34.32, Avg F1: 62.27
Epochs: 5, Avg EM: 35.14, Avg F1: 63.61
Epochs: 10, Avg EM: 33.08, Avg F1: 63.02


In [None]:
import numpy as np
import os
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import torch # Import torch for device handling

# Define the batch sizes to experiment with
batch_sizes_to_test = [8, 16, 32, 64]

# Define the best hyperparameters found so far
best_learning_rate = 3e-5
best_num_epochs = 5

# List to store results for each batch size value
batch_size_experiment_results = []

# Load the tokenized datasets with labels and the original dev dataset
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)
original_dev_dataset = dataset['dev']

# Ensure dev_features_dataset and dev_features_list are available
try:
    # Check if dev_features_dataset already exists from a previous run
    if 'dev_features_dataset' not in locals() or dev_features_dataset is None:
         dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )
except NameError:
     # If not, create it
     dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]


# Re-define prepare_validation_features to ensure it's available
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples


# Re-define postprocess_qa_predictions to ensure it's available
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 continue

            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Load the SQuAD metric
metric = evaluate.load("squad")


# Main loop for batch size experimentation
for batch_size in batch_sizes_to_test:
    print(f"\nExperimenting with Batch Size: {batch_size}")

    # List to store metrics for the current batch size across 3 runs
    current_batch_size_metrics = []

    for run_num in range(1, 4): # Perform 3 runs for each batch size
        print(f"--- Starting run {run_num} for Batch Size = {batch_size} ---")

        # Re-initialize the model for each run to ensure a clean start
        try:
            if 'model_name' not in locals() or model_name is None:
                model_name = 'FacebookAI/roberta-large'
        except NameError:
             model_name = 'FacebookAI/roberta-large'

        try:
            if 'tokenizer' not in locals() or tokenizer is None:
                 tokenizer = AutoTokenizer.from_pretrained(model_name)
        except NameError:
             tokenizer = AutoTokenizer.from_pretrained(model_name)

        model = AutoModelForQuestionAnswering.from_pretrained(model_name)

        # Update training arguments with the best learning rate, best epochs, and current batch size
        training_args = TrainingArguments(
            output_dir=os.path.join(finetuning_path, f'batch_size_experiment_{batch_size}_run_{run_num}'), # Unique output dir
            eval_strategy="epoch",
            learning_rate=best_learning_rate, # Use the best learning rate
            per_device_train_batch_size=batch_size, # Use the current batch size
            per_device_eval_batch_size=batch_size, # Use the current batch size
            num_train_epochs=best_num_epochs, # Use the best number of epochs
            weight_decay=0.01,
            push_to_hub=False,
            report_to="none",
            save_steps=10000,
            save_total_limit=1,
            fp16=True,
        )

        # Create a new Trainer instance for each run
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets_with_labels['train'],
            eval_dataset=tokenized_datasets_with_labels['dev'],
            tokenizer=tokenizer,
        )

        # Train the model
        # Add try-except block to catch potential CUDA out of memory errors for batch size 64
        try:
            trainer.train()
            print(f"--- Training run {run_num} for Batch Size = {batch_size} finished. ---")

            # Evaluate the model and get predictions
            print(f"--- Starting evaluation for run {run_num} for Batch Size = {batch_size} ---")
            raw_predictions = trainer.predict(dev_features_dataset)

            # Postprocess predictions and compute metrics
            predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
            # Ensure references are created only once outside the loop if they are static
            references = []
            for example in original_dev_dataset:
                for paragraph in example["paragraphs"]:
                    for qa in paragraph["qas"]:
                        answers = qa["answers"]
                        answer_texts = [ans["text"] for ans in answers]
                        answer_starts = [ans["answer_start"] for ans in answers]
                        references.append({
                            "id": qa["id"],
                            "answers": {
                                "text": answer_texts,
                                "answer_start": answer_starts
                            }
                        })

            metrics = metric.compute(predictions=formatted_predictions, references=references)
            print(f"Evaluation metrics for run {run_num} for Batch Size = {batch_size}:", metrics)

            # Store metrics for this run
            current_batch_size_metrics.append(metrics)

        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"--- CUDA Out of Memory for Batch Size = {batch_size}, run {run_num}. Skipping this run. ---")
                # You might want to add a placeholder metric or handle this case specifically
                # For now, we just skip and the average calculation will exclude this run if it failed all 3 times
            else:
                raise e # Re-raise other runtime errors


    # Calculate average metrics for the current batch size, only if there were successful runs
    if current_batch_size_metrics:
        avg_exact_match = np.mean([m['exact_match'] for m in current_batch_size_metrics])
        avg_f1 = np.mean([m['f1'] for m in current_batch_size_metrics])

        print(f"\nAverage Metrics for Batch Size {batch_size}:")
        print(f"Average Exact Match: {avg_exact_match:.2f}")
        print(f"Average F1 Score: {avg_f1:.2f}")

        # Store the average results for this batch size value
        batch_size_experiment_results.append({
            'batch_size': batch_size,
            'average_exact_match': avg_exact_match,
            'average_f1': avg_f1
        })
    else:
        print(f"\nNo successful runs for Batch Size {batch_size} due to errors (e.g., CUDA out of memory). No average calculated.")


# Print a summary of all batch size experiment results
print("\n--- Summary of Batch Size Experiment Results ---")
for result in batch_size_experiment_results:
    print(f"Batch Size: {result['batch_size']}, Avg EM: {result['average_exact_match']:.2f}, Avg F1: {result['average_f1']:.2f}")

In [None]:
# Print the summary of batch size experiment results collected so far

print("\n--- Summary of Completed Batch Size Experiment Results ---")
for result in batch_size_experiment_results:
    print(f"Batch Size: {result['batch_size']}, Avg EM: {result['average_exact_match']:.2f}, Avg F1: {result['average_f1']:.2f}")

# Note: Results for batch sizes 32 and 64 are not included as they were skipped.


--- Summary of Completed Batch Size Experiment Results ---
Batch Size: 8, Avg EM: 33.77, Avg F1: 62.70
Batch Size: 16, Avg EM: 35.07, Avg F1: 63.57


In [None]:
import numpy as np
import os
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import torch

# Define the best hyperparameters found so far
best_learning_rate = 3e-5
best_num_epochs = 5
base_train_batch_size = 8 # Reduced base batch size for gradient accumulation
gradient_accumulation_steps = 2 # Accumulate gradients over 2 steps (Effective batch size = 8 * 2 = 16)
eval_batch_size = 16 # Evaluation batch size remains the same
best_weight_decay = 0.01
finetuning_path ='C:/Users/user/PLUE/PLUE-main/data'

# List to store results for the gradient accumulation experiment
gradient_accumulation_experiment_results = []

# Load the tokenized datasets with labels and the original dev dataset
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)
original_dev_dataset = dataset['dev']

# Ensure dev_features_dataset and dev_features_list are available
try:
    # Check if dev_features_dataset already exists from a previous run
    if 'dev_features_dataset' not in locals() or dev_features_dataset is None:
         dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )
except NameError:
     # If not, create it
     dev_features_dataset = original_dev_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_dev_dataset.column_names
        )

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]

# Re-define prepare_validation_features to ensure it's available
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples


# Re-define postprocess_qa_predictions to ensure it's available
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 continue

            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Load the SQuAD metric
metric = evaluate.load("squad")

# Main loop for gradient accumulation experiment (3 runs)
print(f"\nExperimenting with Gradient Accumulation: base_batch_size={base_train_batch_size}, accumulation_steps={gradient_accumulation_steps}")

for run_num in range(1, 4): # Perform 3 runs
    print(f"--- Starting run {run_num} for Gradient Accumulation ---")

    # Re-initialize the model for each run to ensure a clean start
    try:
        if 'model_name' not in locals() or model_name is None:
            model_name = 'FacebookAI/roberta-large'
    except NameError:
         model_name = 'FacebookAI/roberta-large'

    try:
        if 'tokenizer' not in locals() or tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
    except NameError:
             tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForQuestionAnswering.from_pretrained(model_name)


    # Set up training arguments with gradient accumulation
    training_args = TrainingArguments(
        output_dir=os.path.join(finetuning_path, f'gradient_accumulation_experiment_run_{run_num}'), # Unique output dir
        eval_strategy="epoch",
        learning_rate=best_learning_rate,
        per_device_train_batch_size=base_train_batch_size, # Base batch size
        gradient_accumulation_steps=gradient_accumulation_steps, # Gradient accumulation
        per_device_eval_batch_size=eval_batch_size, # Evaluation batch size
        num_train_epochs=best_num_epochs, # Use the best number of epochs
        weight_decay=best_weight_decay, # Use the best weight decay
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        fp16=True,
    )

    # Create a new Trainer instance for each run
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets_with_labels['train'],
        eval_dataset=tokenized_datasets_with_labels['dev'],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()
    print(f"--- Training run {run_num} for Gradient Accumulation finished. ---")

    # Evaluate the model and get predictions
    print(f"--- Starting evaluation for run {run_num} for Gradient Accumulation ---")
    raw_predictions = trainer.predict(dev_features_dataset)

    # Postprocess predictions and compute metrics
    predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    # Ensure references are created only once outside the loop if they are static
    references = []
    for example in original_dev_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })


    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation metrics for run {run_num} for Gradient Accumulation:", metrics)

    # Store metrics for this run
    gradient_accumulation_experiment_results.append(metrics)


# Calculate average metrics for the gradient accumulation experiment
if gradient_accumulation_experiment_results:
    avg_exact_match = np.mean([m['exact_match'] for m in gradient_accumulation_experiment_results])
    avg_f1 = np.mean([m['f1'] for m in gradient_accumulation_experiment_results])

    print(f"\nAverage Metrics for Gradient Accumulation Experiment:")
    print(f"Average Exact Match: {avg_exact_match:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")



else:
    print(f"\nNo successful runs for gradient accumulation experiment.")

# Note: The results of this experiment are directly printed above.

In [None]:
import numpy as np
import os
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import torch
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/roberta-large')

# Define the best hyperparameters for final evaluation
best_learning_rate = 3e-5
best_num_epochs = 5
best_batch_size = 16
best_weight_decay = 0.01
finetuning_path ='C:/Users/user/PLUE/PLUE-main/data'

# List to store results for the test set evaluation runs
test_set_evaluation_results = []

# Load the tokenized datasets with labels and the original test dataset
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_rpnerta_large_with_labels')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)
original_test_dataset = dataset['test'] # Use the original test dataset


# Prepare the test dataset features for evaluation (if not already done or to ensure consistency)
# Re-define prepare_validation_features to ensure it's available
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples

# Ensure test_features_dataset and test_features_list are available or create them
try:
    # Check if test_features_dataset already exists from a previous run
    if 'test_features_dataset' not in locals() or test_features_dataset is None:
         test_features_dataset = original_test_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_test_dataset.column_names
        )
except NameError:
     # If not, create it
     test_features_dataset = original_test_dataset.map(
            prepare_validation_features,
            batched=True,
            remove_columns=original_test_dataset.column_names
        )

# Convert test_features_dataset to a list of dictionaries for post-processing
test_features_list = [test_features_dataset[i] for i in range(len(test_features_dataset))]


# Re-define postprocess_qa_predictions to ensure it's available
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 continue

            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Load the SQuAD metric
metric = evaluate.load("squad")

# Main loop for test set evaluation (3 runs)
print(f"\nEvaluating on the test set with best hyperparameters: LR={best_learning_rate}, Epochs={best_num_epochs}, Batch Size={best_batch_size}, Weight Decay={best_weight_decay}")

for run_num in range(1, 4): # Perform 3 runs for test set evaluation
    print(f"--- Starting test set evaluation run {run_num} ---")

    # Re-initialize the model for each run to ensure a clean start
    try:
        if 'model_name' not in locals() or model_name is None:
            model_name = 'FacebookAI/roberta-large'
    except NameError:
         model_name = 'FacebookAI/roberta-large'

    try:
        if 'tokenizer' not in locals() or tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
    except NameError:
             tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Set up training arguments for training on the full training data
    training_args = TrainingArguments(
        output_dir=os.path.join(finetuning_path, f'test_evaluation_run_{run_num}'), # Unique output dir
        eval_strategy="no", # No evaluation during training for final test evaluation
        learning_rate=best_learning_rate,
        per_device_train_batch_size=best_batch_size,
        per_device_eval_batch_size=best_batch_size, # Use best batch size for evaluation as well
        num_train_epochs=best_num_epochs,
        weight_decay=best_weight_decay,
        push_to_hub=False,
        report_to="none",
        save_steps=10000, # Save at intervals if needed, or set to a large number
        save_total_limit=1,
        fp16=True,
    )

    # Create a new Trainer instance for each run, training on the full train dataset
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets_with_labels['train'], # Train on the full training dataset
        # eval_dataset is not needed for the final evaluation step within the Trainer
        tokenizer=tokenizer,
    )

    # Train the model on the full training dataset
    trainer.train()
    print(f"--- Training for test set evaluation run {run_num} finished. ---")

    # Evaluate the trained model on the test set
    print(f"--- Starting test set evaluation for run {run_num} ---")
    raw_predictions_test = trainer.predict(tokenized_datasets_with_labels['test']) # Predict on the tokenized test dataset

    # Postprocess predictions and compute metrics on the original test dataset
    predictions_test = postprocess_qa_predictions(original_test_dataset, test_features_list, raw_predictions_test)

    formatted_predictions_test = [{"id": k, "prediction_text": v} for k, v in predictions_test.items()]

    # Prepare references for the original test dataset
    references_test = []
    for example in original_test_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references_test.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics_test = metric.compute(predictions=formatted_predictions_test, references=references_test)
    print(f"Test set evaluation metrics for run {run_num}:", metrics_test)

    # Store metrics for this run
    test_set_evaluation_results.append(metrics_test)


# Calculate average metrics across the three test set evaluation runs
if test_set_evaluation_results:
    avg_exact_match_test = np.mean([m['exact_match'] for m in test_set_evaluation_results])
    avg_f1_test = np.mean([m['f1'] for m in test_set_evaluation_results])

    print(f"\nAverage Metrics Across Three Test Set Evaluation Runs:")
    print(f"Average Exact Match: {avg_exact_match_test:.2f}")
    print(f"Average F1 Score: {avg_f1_test:.2f}")

else:
    print(f"\nNo successful runs for test set evaluation.")

# Finish task: Summarize the best configuration and the final test set evaluation results.
print("\n--- Final Evaluation Summary ---")
print(f"Best Hyperparameter Configuration: LR={best_learning_rate}, Epochs={best_num_epochs}, Batch Size={best_batch_size}, Weight Decay={best_weight_decay}")
if test_set_evaluation_results:
    print(f"Average Test Set Exact Match: {avg_exact_match_test:.2f}")
    print(f"Average Test Set F1 Score: {avg_f1_test:.2f}")
else:
     print("Test set evaluation did not complete successfully.")

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


Evaluating on the test set with best hyperparameters: LR=3e-05, Epochs=5, Batch Size=16, Weight Decay=0.01
--- Starting test set evaluation run 1 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,3.231
1000,2.4787
1500,2.1499
2000,2.0534
2500,1.7848
3000,1.7238
3500,1.5091
4000,1.3882
4500,1.2735
5000,1.1384


--- Training for test set evaluation run 1 finished. ---
--- Starting test set evaluation for run 1 ---


Post-processing predictions for 4152 questions from 4152 features.


  0%|          | 0/4152 [00:00<?, ?it/s]

Test set evaluation metrics for run 1: {'exact_match': 32.03275529865125, 'f1': 59.59840728536299}
--- Starting test set evaluation run 2 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,3.3409
1000,2.5058
1500,2.1596
2000,2.0364
2500,1.7669
3000,1.6977
3500,1.4873
4000,1.3951
4500,1.2742
5000,1.128


--- Training for test set evaluation run 2 finished. ---
--- Starting test set evaluation for run 2 ---


Post-processing predictions for 4152 questions from 4152 features.


  0%|          | 0/4152 [00:00<?, ?it/s]

Test set evaluation metrics for run 2: {'exact_match': 33.285163776493256, 'f1': 60.022059783329524}
--- Starting test set evaluation run 3 ---


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,3.3645
1000,2.5145
1500,2.2165
2000,2.1048
2500,1.8892
3000,1.7709
3500,1.5371
4000,1.418
4500,1.2975
5000,1.1543


--- Training for test set evaluation run 3 finished. ---
--- Starting test set evaluation for run 3 ---


Post-processing predictions for 4152 questions from 4152 features.


  0%|          | 0/4152 [00:00<?, ?it/s]

Test set evaluation metrics for run 3: {'exact_match': 32.39402697495183, 'f1': 60.01346126801244}

Average Metrics Across Three Test Set Evaluation Runs:
Average Exact Match: 32.57
Average F1 Score: 59.88

--- Final Evaluation Summary ---
Best Hyperparameter Configuration: LR=3e-05, Epochs=5, Batch Size=16, Weight Decay=0.01
Average Test Set Exact Match: 32.57
Average Test Set F1 Score: 59.88


## Load the best fine-tuned model

### Subtask:
Load the tokenizer and the fine-tuned question answering model that performed best in your previous experiments.


**Reasoning**:
The subtask requires loading the tokenizer and the fine-tuned model. I will use `AutoTokenizer` and `AutoModelForQuestionAnswering` to load them based on the pre-trained model name. The best model was fine-tuned on 'FacebookAI/roberta-large'. The fine-tuned weights are assumed to be in the default cache or will be loaded with a warning if not explicitly saved to a path.



In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import os

model_name = 'FacebookAI/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the fine-tuned model from the saved path
# Assuming the fine-tuned model was saved to the path used in previous cells
finetuning_path ='C:/Users/user/PLUE/PLUE-main/data' # Make sure this path is correct
fine_tuned_model_path = os.path.join(finetuning_path, 'test_evaluation_run_3/checkpoint-5330') # Use one of the test evaluation run paths as it contains a trained model

model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path)


print(f"Loaded tokenizer for: {model_name}")
print(f"Loaded fine-tuned model from: {fine_tuned_model_path}")

Loaded tokenizer for: FacebookAI/roberta-large
Loaded fine-tuned model from: C:/Users/user/PLUE/PLUE-main/data\test_evaluation_run_3/checkpoint-5330


## Prepare the policy documents

### Subtask:
Load your policy documents (the original text, not the processed dataset) and split them into manageable chunks.


**Reasoning**:
Load the policy document and split it into chunks.



In [None]:
# Assuming 'dataset' is already loaded and contains train, dev, and test splits

def extract_unique_contexts(dataset):
    """Extracts unique contexts from the dataset."""
    unique_contexts = set()
    for split in dataset.values():
        for example in split:
            for paragraph in example['paragraphs']:
                unique_contexts.add(paragraph['context'])
    return list(unique_contexts)

# Extract unique contexts from the dataset
policy_chunks = extract_unique_contexts(dataset)

print(f"Extracted {len(policy_chunks)} unique policy contexts to use as chunks.")
print("\nFirst 5 unique contexts (snippets):")
for i, chunk in enumerate(policy_chunks[:5]):
    print(f"Chunk {i+1}: {chunk[:200]}...") # Print a snippet of the first 5 chunks

Extracted 3202 unique policy contexts to use as chunks.

First 5 unique contexts (snippets):
Chunk 1: The Service generally collects personally identifiable information with your specific knowledge and consent. For instance, when you enter a sweepstakes or contest, complete a survey, make a purchase, ...
Chunk 2: Members can change their personal information or update their account information at any time by linking to http://aaasmember.org and selecting Member Services. Other registrants or users may also upd...
Chunk 3: Protection of your information: To prevent unauthorized access, maintain data accuracy and ensure the appropriate use of information, we have put in place commercially reasonable physical, technical a...
Chunk 4: Parties with whom you may choose to share your User Content: Any information or content that you voluntarily disclose for posting to the Service, such as User Content, becomes available to the public,...
Chunk 5: Information shared with trusted third parties:

In [None]:
import torch

# Ensure model is in evaluation mode and on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
model.to(device)

def get_chunk_embedding(chunk, tokenizer, model, device):
    """Generates an embedding for a single text chunk."""
    # Tokenize the chunk. Use the same settings as during training/evaluation for consistency.
    inputs = tokenizer(
        chunk,
        return_tensors="pt",
        padding="max_length", # Pad to max_length
        truncation=True,      # Truncate if longer than max_length
        max_length=512        # Use the same max_length as in data preparation
    ).to(device)

    # Get the model's output. We need the hidden states to get embeddings.
    # For question answering models, we typically get the embeddings from the final hidden state
    # of the first token (CLS token) or average pooling.
    # Let's use the embedding of the CLS token as a common practice for sentence/chunk embeddings.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Get the hidden states from the last layer
    last_hidden_states = outputs.hidden_states[-1]

    # Get the embedding of the CLS token (the first token)
    cls_embedding = last_hidden_states[:, 0, :]

    # You might also consider mean pooling:
    # mean_embedding = torch.mean(last_hidden_states, dim=1)

    # For this task, let's stick with CLS token embedding
    return cls_embedding.squeeze().cpu().numpy()


# Generate embeddings for all policy chunks
policy_chunk_embeddings = []
print(f"Generating embeddings for {len(policy_chunks)} policy chunks...")
for chunk in policy_chunks:
    # Handle potential errors or empty chunks
    if chunk.strip(): # Only process non-empty chunks
        try:
            embedding = get_chunk_embedding(chunk, tokenizer, model, device)
            policy_chunk_embeddings.append(embedding)
        except Exception as e:
            print(f"Error generating embedding for a chunk: {e}")
            # Optionally, append a placeholder or skip this chunk
            # policy_chunk_embeddings.append(None) # Or skip

# Convert the list of numpy arrays to a single numpy array
policy_chunk_embeddings_np = np.array(policy_chunk_embeddings)

print(f"Generated embeddings for {len(policy_chunk_embeddings_np)} policy chunks. Shape: {policy_chunk_embeddings_np.shape}")

Generating embeddings for 3202 policy chunks...
Generated embeddings for 3202 policy chunks. Shape: (3202, 1024)


In [None]:
# Install faiss-cpu if you haven't already
!pip install faiss-cpu

import faiss
import numpy as np

# Ensure the embeddings are in a numpy array of floats
policy_chunk_embeddings_np = np.array(policy_chunk_embeddings).astype('float32')

# Get the dimension of the embeddings
embedding_dimension = policy_chunk_embeddings_np.shape[1]

# Build a FAISS index (using IndexFlatL2 for a simple L2 distance index)
index = faiss.IndexFlatL2(embedding_dimension)

# Add the embeddings to the index
index.add(policy_chunk_embeddings_np)

print(f"FAISS index created with {index.ntotal} embeddings.")

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp313-cp313-win_amd64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.11.0.post1-cp313-cp313-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ----- ---------------------------------- 2.1/14.9 MB 17.4 MB/s eta 0:00:01
   ----------- ---------------------------- 4.2/14.9 MB 10.5 MB/s eta 0:00:02
   ---------------------------------------- 14.9/14.9 MB 33.1 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
FAISS index created with 3202 embeddings.


In [None]:
import torch

def retrieve_policy_chunks(query, index, policy_chunks, tokenizer, model, device, k=3):
    """
    Retrieves the most relevant policy chunks for a given query using the FAISS index.

    Args:
        query (str): The user's query.
        index (faiss.Index): The FAISS index containing policy chunk embeddings.
        policy_chunks (list): A list of the original policy text chunks.
        tokenizer: The tokenizer for the model.
        model: The fine-tuned question answering model.
        device (torch.device): The device to run the model on.
        k (int): The number of top relevant chunks to retrieve.

    Returns:
        list: A list of the top k most relevant policy chunks.
    """
    # Generate embedding for the query using the same model
    query_embedding = get_chunk_embedding(query, tokenizer, model, device)

    # Reshape the query embedding for FAISS search
    query_embedding = query_embedding.reshape(1, -1).astype('float32')

    # Search the FAISS index
    distances, indices = index.search(query_embedding, k)

    # Retrieve the corresponding policy chunks
    relevant_chunks = [policy_chunks[i] for i in indices[0]]

    print(f"Retrieved {len(relevant_chunks)} relevant chunks for the query.")

    return relevant_chunks

# Example usage (you can test this with a sample query)
# sample_query = "How do you protect my personal information?"
# retrieved_chunks = retrieve_policy_chunks(sample_query, index, policy_chunks, tokenizer, model, device)
# print("\nRetrieved chunks:")
# for i, chunk in enumerate(retrieved_chunks):
#     print(f"Chunk {i+1}: {chunk[:200]}...")

**Note:** To run the following code, you will need an OpenAI API key. Please store it securely in Colab's Secrets (under the "🔑" icon in the left sidebar) with the name `OPENAI_API_KEY`.

In [None]:
# Install the openai library if you haven't already

import os
from openai import OpenAI

# Get the OpenAI API key from Colab secrets
try:
    openai_api_key =''
    if openai_api_key is None:
        raise ValueError("OPENAI_API_KEY not found in Colab secrets.")
except Exception as e:
    print(f"Error accessing OpenAI API key: {e}")
    print("Please make sure you have stored your OpenAI API key in Colab Secrets with the name 'OPENAI_API_KEY'.")
    openai_api_key = None # Set to None if retrieval failed

# Initialize the OpenAI client
if openai_api_key:
    client = OpenAI(api_key=openai_api_key)

def generate_answer_with_gpt4(query, retrieved_chunks, model_name="gpt-4.1-nano-2025-04-14"):
    """
    Generates an answer to the query using GPT-4 based on the retrieved policy chunks.

    Args:
        query (str): The user's query.
        retrieved_chunks (list): A list of relevant policy text chunks.
        model_name (str): The name of the GPT model to use.

    Returns:
        str: The generated answer.
    """
    if not retrieved_chunks:
        return "Could not find relevant information in the policy documents to answer your question."

    # Combine the retrieved chunks into a single context string
    context = "\n\n".join(retrieved_chunks)

    try:
        # Use the Chat Completions API
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided policy document excerpts."},
                {"role": "user", "content": f"Policy excerpts:\n{context}\n\nQuestion: {query}\n\nAnswer:"}
            ],
            temperature=0.7, # You can adjust the temperature for creativity vs. focus
            max_tokens=500, # You can adjust the maximum number of tokens in the response
            # top_p=1.0, # You can adjust top_p
            # frequency_penalty=0.0, # You can adjust frequency_penalty
            # presence_penalty=0.0, # You can adjust presence_penalty
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"An error occurred during GPT-4 generation: {e}")
        return "An error occurred while generating the answer."

# Example usage (you can test this after retrieving chunks)
# sample_query = "How do you protect my personal information?"
# retrieved_chunks = retrieve_policy_chunks(sample_query, index, policy_chunks, tokenizer, model, device) # Assuming these are already retrieved
# if retrieved_chunks:
#     answer = generate_answer_with_gpt4(sample_query, retrieved_chunks)
#     print("\nGenerated Answer:")
#     print(answer)

In [None]:
def rag_qa_system(query, index, policy_chunks, tokenizer, model, device, openai_client, gpt_model_name="gpt-4.1-nano-2025-04-14", k=3):
    """
    End-to-end RAG system for question answering on policy documents.

    Args:
        query (str): The user's query.
        index (faiss.Index): The FAISS index containing policy chunk embeddings.
        policy_chunks (list): A list of the original policy text chunks.
        tokenizer: The tokenizer for the retrieval model.
        model: The fine-tuned retrieval model.
        device (torch.device): The device to run the retrieval model on.
        openai_client (OpenAI): The initialized OpenAI client.
        gpt_model_name (str): The name of the GPT model to use for generation.
        k (int): The number of top relevant chunks to retrieve.

    Returns:
        str: The generated answer from the RAG system.
    """
    # 1. Retrieve relevant policy chunks
    print("Retrieving relevant policy chunks...")
    relevant_chunks = retrieve_policy_chunks(query, index, policy_chunks, tokenizer, model, device, k=k)

    # 2. Generate answer using GPT-4 based on retrieved chunks
    print("Generating answer with GPT-4...")
    answer = generate_answer_with_gpt4(query, relevant_chunks, model_name=gpt_model_name)

    return answer

# Example usage (assuming 'index', 'policy_chunks', 'tokenizer', 'model', 'device', and 'client' are defined from previous cells)
# Make sure your OpenAI API key is set up correctly before running this.
# sample_query = "What information do you collect?"
# rag_answer = rag_qa_system(sample_query, index, policy_chunks, tokenizer, model, device, client)
# print("\nRAG System Answer:")
# print(rag_answer)

In [None]:
# Test the RAG system with a sample query
sample_query = "How can I change my personal information?"
print(f"User Query: {sample_query}")

# Assuming 'index', 'policy_chunks', 'tokenizer', 'model', 'device', and 'client' are defined from previous cells
# Make sure your OpenAI API key is set up correctly before running this.
if 'client' in locals() and client is not None:
    rag_answer = rag_qa_system(sample_query, index, policy_chunks, tokenizer, model, device, client)
    print("\nRAG System Answer:")
    print(rag_answer)
else:
    print("OpenAI client is not initialized. Please ensure your API key is set up correctly.")

User Query: How can I change my personal information?
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...

RAG System Answer:
Based on the provided policy excerpts, there is no specific information regarding how to change your personal information. To find out how to update your personal details, please refer to the full Privacy Policy or contact our customer support for assistance.


# Task
Evaluate the RAG system using the loaded model and the specified LLM ("openai/gpt-4.1-nano-2025-04-14") on the development set first, then evaluate it on the test set.

## Evaluate rag on the dev set

### Subtask:
Evaluate the performance of the current RAG system on the development dataset to get baseline metrics. This will involve running each question from the dev set through the RAG system and comparing the generated answers to the ground truth answers.


**Reasoning**:
I need to evaluate the RAG system on the development dataset. I will iterate through the dev dataset, use the RAG system to generate answers for each question, collect the predictions and ground truth answers, and then compute and print the evaluation metrics.



In [None]:
# Evaluate the RAG system on the development dataset
print("Evaluating RAG system on the development dataset...")

predictions_dev_rag = []
references_dev_rag = []

# Iterate through each example in the original_dev_dataset
for example in original_dev_dataset:
    for paragraph in example['paragraphs']:
        for qa in paragraph['qas']:
            question_id = qa['id']
            query = qa['question']

            # Generate answer using the RAG system
            # Ensure the OpenAI client is initialized
            if 'client' in locals() and client is not None:
                 rag_answer = rag_qa_system(query, index, policy_chunks, tokenizer, model, device, client)
            else:
                print("OpenAI client is not initialized. Skipping RAG generation for this question.")
                rag_answer = "" # Assign empty answer if client not initialized

            # Store the generated answer as a prediction
            predictions_dev_rag.append({
                "id": question_id,
                "prediction_text": rag_answer
            })

            # Collect the ground truth answers
            # Each question might have multiple valid answers
            answers = qa["answers"]
            answer_texts = [ans["text"] for ans in answers]
            answer_starts = [ans["answer_start"] for ans in answers]
            references_dev_rag.append({
                "id": question_id,
                "answers": {
                    "text": answer_texts,
                    "answer_start": answer_starts
                }
            })

# Compute evaluation metrics if predictions and references were collected
if predictions_dev_rag and references_dev_rag and 'metric' in locals():
    print("\nComputing evaluation metrics on the development set...")
    metrics_dev_rag = metric.compute(predictions=predictions_dev_rag, references=references_dev_rag)
    print("Development Set RAG Evaluation Metrics:", metrics_dev_rag)
elif 'metric' not in locals():
    print("SQuAD metric 'metric' is not loaded. Cannot compute evaluation metrics.")
else:
    print("No predictions or references were collected. Cannot compute evaluation metrics.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the query.
Generating answer with GPT-4...
Retrieving relevant policy chunks...
Retrieved 3 relevant chunks for the que