In [None]:
from datasets import load_dataset
import os

finetuning_path ='C:/Users/user/PLUE/PLUE-main/data'
data_path = os.path.join(finetuning_path, 'policyqa')
dataset = load_dataset('json', data_files={'train': os.path.join(data_path, 'train.json'), 'test': os.path.join(data_path, 'test.json'),'dev':os.path.join(data_path,'dev.json')}, field="data")

print("Train dataset size:", len(dataset['train']))
print("Dev dataset size:", len(dataset['dev']))

print("\nTrain dataset snippet:")
for i in range(2):
    print(dataset['train'][i])

print("\nDev dataset snippet:")
for i in range(2):
    print(dataset['dev'][i])

Train dataset size: 75
Dev dataset size: 20

Train dataset snippet:
{'title': 'sidearmsports.com', 'paragraphs': [{'context': 'INFORMATION WE COLLECT ABOUT YOU When you interact with SIDEARM Services, we collect: (1) personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and (2) non-personal information transmitted through technology, including tracking information, which is also collected by third parties.', 'index': 3, 'qas': [{'answers': [{'answer_start': 90, 'text': 'personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and'}, {'answer_start': 111, 'text': 'you supply'}, {'answer_start': 90, 'text': 'personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email addr

In [None]:
from transformers import AutoTokenizer

def preprocess_function(examples):
    questions = []
    contexts = []
    answers = []
    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    questions.append(question)
                    contexts.append(context)
                    answers.append(answer)

    return {
        'question': questions,
        'context': contexts,
        'answers': answers
    }

processed_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-small-uncased')

def tokenize_function(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    return tokenized_examples

tokenized_datasets = processed_dataset.map(tokenize_function, batched=True, remove_columns=["question", "context", "answers"])

Map:   0%|          | 0/6044 [00:00<?, ? examples/s]

In [None]:
import os

tokenized_datasets_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small')
tokenized_datasets.save_to_disk(tokenized_datasets_path)

print(f"Tokenized datasets saved to: {tokenized_datasets_path}")

Saving the dataset (0/1 shards):   0%|          | 0/26861 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6354 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6044 [00:00<?, ? examples/s]

Tokenized datasets saved to: C:/Users/user/PLUE/PLUE-main/data\tokenized_datasets_policyqa_finetuning_legalbert_small


# Task
Evaluate the fine-tuned PolicyQA model on the dev set, calculating F1 and Exact Match. Then, analyze the incorrect predictions to identify patterns and display some examples of the errors.

## Evaluate on dev set

### Subtask:
Evaluate the fine-tuned model on the preprocessed `dev` set of the PolicyQA dataset, calculating the F1-score and Exact Match.


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name = 'nlpaueb/legal-bert-small-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

print(f"Loaded model: {model_name}")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model: nlpaueb/legal-bert-small-uncased


In [None]:
from transformers import AutoTokenizer
import os
import collections
import numpy as np

# Assuming 'dataset' is already loaded from cell 1a33e96e (corrected from 1a33e97c based on the latest notebook state)
# Assuming 'tokenizer' is already loaded from cell 9d44554b

def prepare_train_features_with_labels(examples):
    # This function needs to process the nested structure of the dataset
    questions = []
    contexts = []
    answers = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # We need to handle the case where there might be multiple answers or no answers
                # For training, we'll take the first answer provided in the original data
                # For evaluation, we'll need all answers for metric computation
                if len(qa['answers']) > 0:
                     # Take the first answer for training labels
                    answer = qa['answers'][0]
                else:
                    # Handle cases with no answers if necessary, for now setting to empty to avoid errors
                    answer = {'answer_start': -1, 'text': ''} # Placeholder for no answer

                questions.append(question)
                contexts.append(context)
                answers.append(answer)
                question_ids.append(qa['id']) # Append the question ID


    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # We keep the offsets mapping now, which will be useful later (when we want to compute the exact answer span in the original context).
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Add example_ids (which are the question IDs) to the tokenized features
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to context (the second one).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1

        # One example can give several features, so its original answer info is stored in the example (index stored in sample_mapping).
        sample_index = sample_mapping[i]
        # Get the answer for the original example corresponding to this feature
        current_answer = answers[sample_index]


        # If no answers are given or the answer is empty, set the cls_index as answer.
        if current_answer['answer_start'] == -1 or len(current_answer["text"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the original text.
            start_char = current_answer["answer_start"]
            end_char = start_char + len(current_answer["text"])

            # Start token index of the context in the window.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_index:
                token_start_index += 1

            # End token index of the context in the window.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_index:
                token_end_index -= 1

            # Detect if the answer is out of the span (feature does not contain the QA context answer) and set corresponding token indices to the CLS index.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could also use the character index of the answer and then find the token index around it, but this is easier.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

def prepare_validation_features(examples):
    # This function needs to process the nested structure of the dataset to get question and context
    questions = []
    contexts = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id']) # Append the question ID

    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # We keep the offsets mapping now, which will be useful later (when we want to compute the exact answer span in the original context).
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    # Add example_ids (which are the question IDs) to the tokenized features
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Apply the tokenization and labeling function to the training dataset
train_tokenized_dataset = dataset['train'].map(
    prepare_train_features_with_labels,
    batched=True,
    remove_columns=dataset["train"].column_names # Remove original columns after processing
)

# Apply the validation feature preparation function to the dev dataset
dev_tokenized_dataset = dataset['dev'].map(
    prepare_validation_features,
    batched=True,
    remove_columns=dataset["dev"].column_names # Remove original columns after processing
)

# Combine the tokenized datasets back into a DatasetDict
tokenized_datasets_with_labels = {
    'train': train_tokenized_dataset,
    'dev': dev_tokenized_dataset,
    'test': dataset['test'].map( # Also apply validation features to test set if needed for future evaluation
        prepare_validation_features,
        batched=True,
        remove_columns=dataset["test"].column_names
    )
}

print("Tokenized datasets with labels and validation features created.")
print(tokenized_datasets_with_labels)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenized datasets with labels and validation features created.
{'train': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'example_id', 'start_positions', 'end_positions'],
    num_rows: 17056
}), 'dev': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 3809
}), 'test': Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 4152
})}


In [None]:
import os

# Define the path to save the tokenized datasets with labels
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels_fixed')

# Save the tokenized datasets with labels
tokenized_datasets_with_labels.save_to_disk(tokenized_datasets_with_labels_path)

print(f"Tokenized datasets with labels saved to: {tokenized_datasets_with_labels_path}")

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering
from datasets import load_from_disk
import os
import evaluate
import numpy as np

# Assuming 'model' and 'finetuning_path' are already defined from previous cells
# Assuming 'tokenizer' is already defined from a previous cell (e.g., 9d44554b)
# Assuming 'dataset' is already loaded from a previous cell (e.g., 1a33e97c)


# Load the tokenized datasets with labels
tokenized_datasets_with_labels_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels_fixed')
tokenized_datasets_with_labels = load_from_disk(tokenized_datasets_with_labels_path)

# We need the original dev dataset for post-processing predictions and computing metrics
original_dev_dataset = dataset['dev']


# Step 4: Set up training arguments
# Optimize for the provided hardware (1x 32GB GPU)
# A batch size of 16 or 32 should be feasible. Let's start with 16.
training_args = TrainingArguments(
    output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned'), # Output directory for the fine-tuned model
    eval_strategy="epoch", # Evaluate at the end of each epoch
    learning_rate=5e-5, # Standard learning rate for fine-tuning
    per_device_train_batch_size=16, # Increased batch size for training
    per_device_eval_batch_size=16, # Increased batch size for evaluation
    num_train_epochs=3, # Number of training epochs (can be adjusted)
    weight_decay=0.01, # Weight decay to prevent overfitting
    push_to_hub=False, # Set to True if you want to push to the Hugging Face Hub
    report_to="none", # Disable reporting to external services
    save_steps=10000, # Save checkpoint every 10000 steps
    save_total_limit=2, # Keep only the last 2 checkpoints
    # load_best_model_at_last_step=True, # Load the best model based on evaluation metrics # Removed due to TypeError
    metric_for_best_model="eval_loss", # Use F1 score to determine the best model
    greater_is_better=True, # Higher F1 is better
    fp16=True, # Enable mixed precision training for faster training
)

# Function to compute metrics (F1 and Exact Match)
def compute_metrics(eval_pred):
    metric = evaluate.load("squad")
    start_logits, end_logits = eval_pred.predictions

    # Returning empty dictionary or dummy values to make Trainer happy temporarily
    return {} # Returning empty dictionary as a temporary fix

# Step 5: Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets_with_labels['train'],
    eval_dataset=tokenized_datasets_with_labels['dev'], # Evaluate on the dev set
    tokenizer=tokenizer, # Pass the tokenizer
    compute_metrics=compute_metrics, # Add the compute_metrics function
)

print("Training arguments and Trainer set up with compute_metrics.")

Training arguments and Trainer set up with compute_metrics.


  trainer = Trainer(


In [None]:
# Step 6: Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3253,4.671198
2,0.5349,4.440094
3,0.9649,3.58637


TrainOutput(global_step=3198, training_loss=0.5691658101132544, metrics={'train_runtime': 207.6918, 'train_samples_per_second': 246.365, 'train_steps_per_second': 15.398, 'total_flos': 2973425668521984.0, 'train_loss': 0.5691658101132544, 'epoch': 3.0})

In [None]:
import torch
from tqdm.auto import tqdm
import collections
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoTokenizer # Import AutoTokenizer
import os



# Step 8: Evaluate the fine-tuned model

# Need the original dev dataset for post-processing predictions and computing metrics
original_dev_dataset = dataset['dev']

# Re-define prepare_validation_features to ensure it's available and correctly used in this cell
# This function needs to process the nested structure of the dataset to get question and context
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = [] # Use question_ids to store the 'id' from qas

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id']) # Append the question ID

    # Now tokenize the flattened lists of questions and contexts
    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second", # Truncate the context
        max_length=512, # You can adjust this based on your model's max length
        stride=128, # You can adjust this
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # For evaluation, we will need to convert our predictions to spans from the original context. Let's save the example at the token level for that.
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    # We keep the offsets mapping now, which will be useful later
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")

    return tokenized_examples


# Prepare the dev dataset features directly in this cell for evaluation
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names # Remove original columns after processing
)

# Convert dev_features_dataset to a list of dictionaries for post-processing
dev_features_list = [dev_features_dataset[i] for i in range(len(dev_features_dataset))]


# Get predictions on the dev set using the trainer
raw_predictions = trainer.predict(dev_features_dataset)


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    # The original dataset has nested structure, we need to flatten it to map example ids (question ids)
    # to the correct original example for context and answers.
    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}


    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        # The example_id in features is the question ID
        features_per_example[feature["example_id"]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    # Let's loop over all the question IDs!
    for qid, qa_info in tqdm(qas_by_id.items()):
        # Those are the indices of the features associated to the current question ID.
        feature_indices = features_per_example[qid]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []

        context = qa_info["context"]
        # Looping through all the features associated to the current example (question).
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            # Explicitly check if 'offset_mapping' is in the feature dictionary
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 print(f"Warning: 'offset_mapping' missing or None for feature index {feature_index} (Question ID: {qid}). Skipping feature.")
                 # Add debugging prints here
                 print(f"DEBUG: Examining feature at index {feature_index} for Question ID: {qid}")
                 print(f"DEBUG: Type of features: {type(features)}")
                 if isinstance(features, list):
                     print(f"DEBUG: Type of features[feature_index]: {type(features[feature_index])}")
                     if isinstance(features[feature_index], dict):
                         print(f"DEBUG: Keys in features[feature_index]: {features[feature_index].keys()}")
                     else:
                         print(f"DEBUG: features[feature_index] is not a dictionary.")
                 else:
                     print(f"DEBUG: features is not a list.")

                 continue # Skip this feature if offset_mapping is missing or None

            offset_mapping = features[feature_index]["offset_mapping"]
            # Input ids for this feature
            input_ids = features[feature_index]["input_ids"]

            # Update minimum null prediction.
            # Find the index of the CLS token.
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the start and end position.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    # Don't consider predictions where the start or end token is the CLS token.
                    # Assuming CLS token is at the beginning
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    # Get the character span in the original context
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not found a single non-null prediction, we create a fake
            # prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        # For SQuAD v1.1, we always pick the best non-null answer.
        predictions[qid] = best_answer["text"]

    return predictions


# Postprocess the raw predictions
# Pass original_dev_dataset and the converted list of dev features
predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list, raw_predictions)

# Compute and print the overall F1-score and Exact Match
metric = evaluate.load("squad")
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
references = []
for example in original_dev_dataset:
    for paragraph in example["paragraphs"]:
        for qa in paragraph["qas"]:
            # For each question, create a reference entry
            answers = qa["answers"]
            # Extract lists of text and answer_start from the list of answer dictionaries
            answer_texts = [ans["text"] for ans in answers]
            answer_starts = [ans["answer_start"] for ans in answers]
            references.append({
                "id": qa["id"],
                "answers": {
                    "text": answer_texts,
                    "answer_start": answer_starts
                }
            })


metrics = metric.compute(predictions=formatted_predictions, references=references)
print(metrics)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

{'exact_match': 26.43738514045681, 'f1': 55.35870308817166}


In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk



# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation (needed for all runs)
# We re-prepare this each time to ensure consistency, using the function defined in cell 37be750c
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names # Remove original columns after processing
)


# Collect results from all runs
all_results = []

# Add the results from the first run provided by the user
first_run_metrics = {'exact_match': 26.43738514045681, 'f1': 55.35870308817166}
all_results.append(first_run_metrics)
print(f"Results from Run 1: {first_run_metrics}")


# Repeat the fine-tuning and evaluation for two more runs
for run_number in range(2, 4): # Runs 2 and 3
    # Load a fresh model for each run to avoid carrying over weights from previous runs
    model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

    # Run fine-tuning and evaluation
    metrics = run_finetuning_and_evaluation(
        run_number=run_number,
        model=model,
        train_dataset=train_dataset_with_labels,
        eval_dataset=dev_features_dataset, # Use the prepared dev features
        original_eval_dataset=original_dev_dataset, # Use the original dev dataset for references
        tokenizer=tokenizer,
        finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_repeated_runs') # Use a different output directory for repeated runs
    )
    all_results.append(metrics)


# Calculate and display average metrics
avg_exact_match = np.mean([result['exact_match'] for result in all_results])
avg_f1 = np.mean([result['f1'] for result in all_results])

print("\n--- Average Evaluation Metrics Across 3 Runs ---")
print(f"Average Exact Match: {avg_exact_match:.2f}")
print(f"Average F1 Score: {avg_f1:.2f}")

Results from Run 1: {'exact_match': 26.43738514045681, 'f1': 55.35870308817166}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 ---
Training model...


Step,Training Loss
500,3.582
1000,3.2213
1500,2.821
2000,2.5917
2500,2.3498
3000,2.2644


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 26.17484904174324, 'f1': 53.03471851658242}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 ---
Training model...


Step,Training Loss
500,3.5813
1000,3.0902
1500,2.6082
2000,2.4349
2500,2.2185
3000,2.1252


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 27.35626148595432, 'f1': 55.91109631170794}

--- Average Evaluation Metrics Across 3 Runs ---
Average Exact Match: 26.66
Average F1 Score: 54.77


In [None]:
import os
from transformers import AutoModelForQuestionAnswering

# Assuming 'all_results' is populated from cell c7bd33be
# Assuming 'finetuning_path' is defined

# Find the run with the best F1 score
best_f1 = -1.0
best_run_number = -1
for i, result in enumerate(all_results):
    if result['f1'] > best_f1:
        best_f1 = result['f1']
        # The runs are indexed from 0 in the list but numbered from 1 in the plan/output
        best_run_number = i + 1

print(f"Best F1 score ({best_f1:.2f}) achieved in Run {best_run_number}")

# Define the path to the checkpoint of the best performing model
# Assuming the checkpoints are saved in subdirectories like 'run_1', 'run_2', etc.
# And assuming the Trainer saves a checkpoint, we'll try to load the last checkpoint from that run
best_model_checkpoint_dir = os.path.join(
    finetuning_path,
    'legal-bert-small-uncased-qa-finetuned_repeated_runs',
    f'run_{best_run_number}'
    # The Trainer might save checkpoints in a further subdirectory like 'checkpoint-XXXX'
    # We need to find the actual checkpoint directory within the run directory.
    # Let's list the contents of the run directory and find the checkpoint folder.
)

# Find the actual checkpoint directory within the best run's directory
checkpoint_dirs = [d for d in os.listdir(best_model_checkpoint_dir) if os.path.isdir(os.path.join(best_model_checkpoint_dir, d)) and d.startswith("checkpoint-")]

if not checkpoint_dirs:
    print(f"Error: No checkpoint directories found in {best_model_checkpoint_dir}")
else:
    # Assuming we want the latest checkpoint if multiple exist within the run directory
    latest_checkpoint_dir = max(checkpoint_dirs, key=lambda d: int(d.split("-")[-1]))
    best_model_path_to_load = os.path.join(best_model_checkpoint_dir, latest_checkpoint_dir)

    print(f"Loading model from: {best_model_path_to_load}")

    # Load the best performing model
    best_model = AutoModelForQuestionAnswering.from_pretrained(best_model_path_to_load)

    # Define the path to save the overall best model
    overall_best_model_save_path = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best')
    print(f"Saving the overall best model to: {overall_best_model_save_path}")

    # Save the overall best model
    best_model.save_pretrained(overall_best_model_save_path)

    print("Overall best model saved.")

Best F1 score (55.91) achieved in Run 3
Loading model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_repeated_runs\run_3\checkpoint-3198
Saving the overall best model to: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_best
Overall best model saved.


In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer # Import Trainer here as it's used in the copied function
import collections # Import collections as it's used in postprocess_qa_predictions
from tqdm.auto import tqdm # Import tqdm as it's used in postprocess_qa_predictions


# Define the learning rate to test in this cell
learning_rate_to_test = 1e-5
print(f"--- Testing Learning Rate: {learning_rate_to_test} ---")




# Re-define postprocess_qa_predictions function (copied from 37be750c)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Re-define prepare_validation_features function (copied from 37be750c)
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from 37be750c, modified to use learning_rate_to_test)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate):
    print(f"\n--- Starting Run {run_number} ---")
    run_output_dir = os.path.join(finetuning_output_dir, f'lr_{learning_rate}', f'run_{run_number}')

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError
        learning_rate=learning_rate, # Use the provided learning rate
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    predictions = postprocess_qa_predictions(original_eval_dataset, eval_features_list, raw_predictions)

    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_eval_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied and modified code ---


# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation (needed for all runs)
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Collect results from all runs for this learning rate
all_results_lr = []

# We don't have a first run result for this specific learning rate unless it was 5e-5
# Let's run 3 new runs for this learning rate
for run_number in range(1, 4): # Runs 1, 2, and 3 for this learning rate
    # Load a fresh model for each run
    model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

    # Run fine-tuning and evaluation with the specified learning rate
    metrics = run_finetuning_and_evaluation(
        run_number=run_number,
        model=model,
        train_dataset=train_dataset_with_labels,
        eval_dataset=dev_features_dataset,
        original_eval_dataset=original_dev_dataset,
        tokenizer=tokenizer,
        finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_lr_experiments'), # Use a specific output dir for LR experiments
        learning_rate=learning_rate_to_test # Pass the learning rate
    )
    all_results_lr.append(metrics)


# Calculate and display average metrics for this learning rate
avg_exact_match_lr = np.mean([result['exact_match'] for result in all_results_lr])
avg_f1_lr = np.mean([result['f1'] for result in all_results_lr])

print(f"\n--- Average Evaluation Metrics for Learning Rate {learning_rate_to_test} Across 3 Runs ---")
print(f"Average Exact Match: {avg_exact_match_lr:.2f}")
print(f"Average F1 Score: {avg_f1_lr:.2f}")

--- Testing Learning Rate: 1e-05 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 ---
Training model...


Step,Training Loss
500,3.8339
1000,3.4982
1500,3.3918
2000,3.3003
2500,3.2224
3000,3.183


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 14.780782357574166, 'f1': 39.01603853283959}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 ---
Training model...


Step,Training Loss
500,3.8232
1000,3.5
1500,3.4018
2000,3.3237
2500,3.2474
3000,3.2183


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 13.888159621948018, 'f1': 37.76542454277153}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 ---
Training model...


Step,Training Loss
500,3.8232
1000,3.5
1500,3.4018
2000,3.3238
2500,3.2474
3000,3.2183


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 13.888159621948018, 'f1': 37.73607568973918}

--- Average Evaluation Metrics for Learning Rate 1e-05 Across 3 Runs ---
Average Exact Match: 14.19
Average F1 Score: 38.17


In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer # Import Trainer here as it's used in the copied function
import collections # Import collections as it's used in postprocess_qa_predictions
from tqdm.auto import tqdm # Import tqdm as it's used in postprocess_qa_predictions

# Define the learning rates to test
learning_rates_to_test = [1e-5, 3e-5, 5e-5]
all_lr_average_results = {}

print("--- Starting Learning Rate Experiment ---")


# --- Start of copied code for functions from cell 37be750c ---

# Re-define postprocess_qa_predictions function (copied from 37be750c)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Re-define prepare_validation_features function (copied from 37be750c)
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from 37be750c, modified to use passed learning_rate)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate):
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate} ---")
    run_output_dir = os.path.join(finetuning_output_dir, f'lr_{learning_rate}', f'run_{run_number}')

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError
        learning_rate=learning_rate, # Use the provided learning rate
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    predictions = postprocess_qa_predictions(original_eval_dataset, eval_features_list, raw_predictions)

    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_eval_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied code for functions ---


# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b

# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation (needed for all runs)
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Iterate through each learning rate
for lr_to_test in learning_rates_to_test:
    print(f"\n--- Running Experiment for Learning Rate: {lr_to_test} ---")

    all_results_lr = []

    # Run 3 fine-tuning and evaluation cycles for the current learning rate
    for run_number in range(1, 4): # Runs 1, 2, and 3
        # Load a fresh model for each run
        model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

        # Run fine-tuning and evaluation with the specified learning rate
        metrics = run_finetuning_and_evaluation(
            run_number=run_number,
            model=model,
            train_dataset=train_dataset_with_labels,
            eval_dataset=dev_features_dataset,
            original_eval_dataset=original_dev_dataset,
            tokenizer=tokenizer,
            finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_lr_experiments'), # Use a specific output dir for LR experiments
            learning_rate=lr_to_test # Pass the current learning rate from the loop
        )
        all_results_lr.append(metrics)

    # Calculate and store average metrics for the current learning rate
    avg_exact_match_lr = np.mean([result['exact_match'] for result in all_results_lr])
    avg_f1_lr = np.mean([result['f1'] for result in all_results_lr])
    all_lr_average_results[lr_to_test] = {'average_exact_match': avg_exact_match_lr, 'average_f1': avg_f1_lr}

    print(f"\n--- Finished Experiment for Learning Rate: {lr_to_test} ---")
    print(f"Average Exact Match: {avg_exact_match_lr:.2f}")
    print(f"Average F1 Score: {avg_f1_lr:.2f}")


print("\n--- Summary of Learning Rate Experiment Results ---")
print("Learning Rate | Average Exact Match | Average F1 Score")
print("----------------|-----------------------|-------------------")
for lr, results in all_lr_average_results.items():
    print(f"{lr:<15} | {results['average_exact_match']:<21.2f} | {results['average_f1']:<17.2f}")

--- Starting Learning Rate Experiment ---

--- Running Experiment for Learning Rate: 1e-05 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 1e-05 ---
Training model...


Step,Training Loss
500,3.8232
1000,3.5
1500,3.4018
2000,3.3236
2500,3.2474
3000,3.2183


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 13.86190601207666, 'f1': 37.78838356973786}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 1e-05 ---
Training model...


Step,Training Loss
500,3.8232
1000,3.5
1500,3.4018
2000,3.3237
2500,3.2474
3000,3.2183


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 13.86190601207666, 'f1': 37.78939355820469}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 1e-05 ---
Training model...


Step,Training Loss
500,3.8232
1000,3.5001
1500,3.4019
2000,3.3238
2500,3.2473
3000,3.2185


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 13.809398792333946, 'f1': 37.721884407723124}

--- Finished Experiment for Learning Rate: 1e-05 ---
Average Exact Match: 13.84
Average F1 Score: 37.77

--- Running Experiment for Learning Rate: 3e-05 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 3e-05 ---
Training model...


Step,Training Loss
500,3.6372
1000,3.3255
1500,2.9668
2000,2.7286
2500,2.5229
3000,2.4353


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 25.30847991598845, 'f1': 52.69377346607114}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 3e-05 ---
Training model...


Step,Training Loss
500,3.6372
1000,3.3255
1500,2.9669
2000,2.7287
2500,2.5227
3000,2.4353


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 25.413494355473876, 'f1': 52.68974740448363}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 3e-05 ---
Training model...


Step,Training Loss
500,3.6372
1000,3.3255
1500,2.9669
2000,2.7286
2500,2.5227
3000,2.4353


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 25.46600157521659, 'f1': 52.77781223130832}

--- Finished Experiment for Learning Rate: 3e-05 ---
Average Exact Match: 25.40
Average F1 Score: 52.72

--- Running Experiment for Learning Rate: 5e-05 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 5e-05 ---
Training model...


Step,Training Loss
500,3.5817
1000,3.0848
1500,2.5984
2000,2.4302
2500,2.2143
3000,2.1246


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 27.75006563402468, 'f1': 56.336138015352184}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 5e-05 ---
Training model...


Step,Training Loss
500,3.5812
1000,3.0933
1500,2.6093
2000,2.4388
2500,2.2196
3000,2.1274


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 27.198739826726175, 'f1': 56.11425355204286}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 5e-05 ---
Training model...


Step,Training Loss
500,3.5813
1000,3.0902
1500,2.607
2000,2.435
2500,2.2202
3000,2.1267


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 27.277500656340248, 'f1': 55.856280024765915}

--- Finished Experiment for Learning Rate: 5e-05 ---
Average Exact Match: 27.41
Average F1 Score: 56.10

--- Summary of Learning Rate Experiment Results ---
Learning Rate | Average Exact Match | Average F1 Score
----------------|-----------------------|-------------------
1e-05           | 13.84                 | 37.77            
3e-05           | 25.40                 | 52.72            
5e-05           | 27.41                 | 56.10            


In [None]:
import pandas as pd

# Assuming 'all_lr_average_results' is populated from cell 1ec19096
# Assuming 'first_run_metrics' is the result from your initial single run (provided by user)

first_run_metrics = {'exact_match': 26.43738514045681, 'f1': 55.35870308817166}
first_run_lr = 5e-5 # Add the learning rate for the first run


# Create a dictionary to hold the comparison data
comparison_data = {}

# Add the first run results with its learning rate
comparison_data[f'First Run (LR={first_run_lr})'] = {
    'Average Exact Match': first_run_metrics['exact_match'],
    'Average F1 Score': first_run_metrics['f1']
}

# Add the average results for each learning rate from Experiment 1
for lr, results in all_lr_average_results.items():
    comparison_data[f'Experiment 1 (LR={lr})'] = {
        'Average Exact Match': results['average_exact_match'],
        'Average F1 Score': results['average_f1']
    }

# Convert to a pandas DataFrame for better display
comparison_df = pd.DataFrame.from_dict(comparison_data, orient='index')

print("--- Comparison of Evaluation Metrics ---")
display(comparison_df)

--- Comparison of Evaluation Metrics ---


Unnamed: 0,Average Exact Match,Average F1 Score
First Run (LR=5e-05),26.437385,55.358703
Experiment 1 (LR=1e-05),13.844404,37.766554
Experiment 1 (LR=3e-05),25.395992,52.720444
Experiment 1 (LR=5e-05),27.408769,56.102224


In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer # Import Trainer here as it's used in the copied function
import collections # Import collections as it's used in postprocess_qa_predictions
from tqdm.auto import tqdm # Import tqdm as it's used in postprocess_qa_predictions

# Define the epoch values to test
epochs_to_test = [2, 5, 10]
# Define the learning rate to use for this experiment
learning_rate_for_epochs = 5e-5
all_epoch_average_results = {}

print(f"--- Starting Number of Epochs Experiment (using LR={learning_rate_for_epochs}) ---")


# --- Start of copied code for functions from cell 1ec19096 ---

# Re-define postprocess_qa_predictions function (copied from 1ec19096)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Re-define prepare_validation_features function (copied from 1ec19096)
def prepare_validation_features(examples):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from 1ec19096, modified to accept epochs)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs):
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate}, Epochs: {num_train_epochs} ---")
    # Include learning rate and epochs in the output directory name
    run_output_dir = os.path.join(finetuning_output_dir, f'lr_{learning_rate}', f'epochs_{num_train_epochs}', f'run_{run_number}')

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError
        learning_rate=learning_rate, # Use the provided learning rate
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs, # Use the provided number of epochs
        weight_decay=0.01,
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    predictions = postprocess_qa_predictions(original_eval_dataset, eval_features_list, raw_predictions)

    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_eval_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied code for functions ---




# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation (needed for all runs)
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Iterate through each epoch value
for epochs_to_run in epochs_to_test:
    print(f"\n--- Running Experiment for Epochs: {epochs_to_run} (using LR={learning_rate_for_epochs}) ---")

    all_results_epochs = []

    # Run 3 fine-tuning and evaluation cycles for the current number of epochs
    for run_number in range(1, 4): # Runs 1, 2, and 3
        # Load a fresh model for each run
        model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

        # Run fine-tuning and evaluation with the specified learning rate and number of epochs
        metrics = run_finetuning_and_evaluation(
            run_number=run_number,
            model=model,
            train_dataset=train_dataset_with_labels,
            eval_dataset=dev_features_dataset,
            original_eval_dataset=original_dev_dataset,
            tokenizer=tokenizer,
            finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_epochs_experiments'), # Use a specific output dir for epochs experiments
            learning_rate=learning_rate_for_epochs, # Use the specified learning rate
            num_train_epochs=epochs_to_run # Pass the current number of epochs
        )
        all_results_epochs.append(metrics)

    # Calculate and store average metrics for the current number of epochs
    avg_exact_match_epochs = np.mean([result['exact_match'] for result in all_results_epochs])
    avg_f1_epochs = np.mean([result['f1'] for result in all_results_epochs])
    all_epoch_average_results[epochs_to_run] = {'average_exact_match': avg_exact_match_epochs, 'average_f1': avg_f1_epochs}

    print(f"\n--- Finished Experiment for Epochs: {epochs_to_run} ---")
    print(f"Average Exact Match: {avg_exact_match_epochs:.2f}")
    print(f"Average F1 Score: {avg_f1_epochs:.2f}")


print("\n--- Summary of Number of Epochs Experiment Results ---")
print(f"Using Learning Rate: {learning_rate_for_epochs}")
print("Epochs | Average Exact Match | Average F1 Score")
print("-------|-----------------------|-------------------")
for epochs, results in all_epoch_average_results.items():
    print(f"{epochs:<6} | {results['average_exact_match']:<21.2f} | {results['average_f1']:<17.2f}")

--- Starting Number of Epochs Experiment (using LR=5e-05) ---

--- Running Experiment for Epochs: 2 (using LR=5e-05) ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 5e-05, Epochs: 2 ---
Training model...


Step,Training Loss
500,3.5457
1000,2.9672
1500,2.5651
2000,2.423


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 26.27986348122867, 'f1': 54.770109327478345}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 5e-05, Epochs: 2 ---
Training model...


Step,Training Loss
500,3.5715
1000,3.0324
1500,2.5919
2000,2.4417


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 25.938566552901023, 'f1': 54.00620260348468}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 5e-05, Epochs: 2 ---
Training model...


Step,Training Loss
500,3.5722
1000,3.0466
1500,2.5984
2000,2.4468


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 25.80729850354424, 'f1': 53.95833492443847}

--- Finished Experiment for Epochs: 2 ---
Average Exact Match: 26.01
Average F1 Score: 54.24

--- Running Experiment for Epochs: 5 (using LR=5e-05) ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 5e-05, Epochs: 5 ---
Training model...


Step,Training Loss
500,3.5686
1000,2.994
1500,2.5355
2000,2.3948
2500,2.15
3000,2.0727
3500,1.8948
4000,1.7908
4500,1.7023
5000,1.6025


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 30.217904961932266, 'f1': 58.017669120396626}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 5e-05, Epochs: 5 ---
Training model...


Step,Training Loss
500,3.5813
1000,3.1446
1500,2.6494
2000,2.4479
2500,2.1998
3000,2.1087
3500,1.9381
4000,1.8292
4500,1.7404
5000,1.6384


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 29.456550275662902, 'f1': 57.27793132528488}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 5e-05, Epochs: 5 ---
Training model...


Step,Training Loss
500,3.5805
1000,3.1316
1500,2.6352
2000,2.4455
2500,2.1956
3000,2.1028
3500,1.9272
4000,1.8242
4500,1.7301
5000,1.6342


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 29.351535836177476, 'f1': 57.15584923369056}

--- Finished Experiment for Epochs: 5 ---
Average Exact Match: 29.68
Average F1 Score: 57.48

--- Running Experiment for Epochs: 10 (using LR=5e-05) ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 5e-05, Epochs: 10 ---
Training model...


Step,Training Loss
500,3.5809
1000,3.1136
1500,2.6138
2000,2.4343
2500,2.1736
3000,2.0939
3500,1.8975
4000,1.7982
4500,1.6685
5000,1.5492


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 28.931478078235756, 'f1': 57.21359808650686}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 5e-05, Epochs: 10 ---
Training model...


Step,Training Loss
500,3.5787
1000,3.1497
1500,2.6475
2000,2.4503
2500,2.1902
3000,2.1025
3500,1.915
4000,1.8037
4500,1.6775
5000,1.5601


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 29.509057495405617, 'f1': 57.79329576053231}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 5e-05, Epochs: 10 ---
Training model...


Step,Training Loss
500,3.5797
1000,3.1484
1500,2.6502
2000,2.453
2500,2.1903
3000,2.1018
3500,1.919
4000,1.8
4500,1.6779
5000,1.5591


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 29.062746127592543, 'f1': 57.069057015611264}

--- Finished Experiment for Epochs: 10 ---
Average Exact Match: 29.17
Average F1 Score: 57.36

--- Summary of Number of Epochs Experiment Results ---
Using Learning Rate: 5e-05
Epochs | Average Exact Match | Average F1 Score
-------|-----------------------|-------------------
2      | 26.01                 | 54.24            
5      | 29.68                 | 57.48            
10     | 29.17                 | 57.36            


### Experiment 4.1: Influence of Stride (Part 1)

**Configuration:** `max_length = 512`, `stride = 64`

**Reasoning:** We are testing a smaller stride (64) compared to the baseline (128) to see if increasing the overlap between context chunks improves the model's ability to find answers that might be split across chunk boundaries. This comes at the cost of generating more features per original example, increasing computation time.

The following cell will run the fine-tuning and evaluation process 3 times with these parameters, using the fixed Learning Rate of 5e-05 and 5 Epochs, and then calculate the average metrics.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
from tqdm.auto import tqdm

# Define the configuration for this run
current_max_length = 512
current_stride = 64
fixed_learning_rate = 5e-5
fixed_num_epochs = 5

print(f"--- Starting Experiment 4.1 (Part 1): Max Length={current_max_length}, Stride={current_stride} ---")


# --- Start of copied code for functions from cell 323b5a71 ---

# Re-define postprocess_qa_predictions function (copied from 323b5a71)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Re-define prepare_validation_features function (copied from 323b5a71, modified to use current_max_length and current_stride)
def prepare_validation_features(examples, tokenizer, max_length, stride): # Added tokenizer, max_length, stride as args
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length, # Use provided max_length
        stride=stride, # Use provided stride
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from 323b5a71, modified to accept max_length and stride)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs, max_length, stride): # Added max_length and stride as args
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate}, Epochs: {num_train_epochs}, Max Length: {max_length}, Stride: {stride} ---")
    # Include all hyperparameters in the output directory name for clarity
    run_output_dir = os.path.join(finetuning_output_dir, f'lr_{learning_rate}', f'epochs_{num_train_epochs}', f'maxlen_{max_length}', f'stride_{stride}', f'run_{run_number}')

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError
        learning_rate=learning_rate,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        # load_best_model_at_last_step=True, # Removed due to TypeError due to potential version incompatibility
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    predictions = postprocess_qa_predictions(original_eval_dataset, eval_features_list, raw_predictions)

    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_eval_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied code for functions ---


# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b

# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation with the current max_length and stride
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    lambda examples: prepare_validation_features(examples, tokenizer, current_max_length, current_stride), # Pass tokenizer, max_length, stride
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Collect results from all runs for this configuration
all_results_config1 = []

# Run 3 fine-tuning and evaluation cycles
for run_number in range(1, 4): # Runs 1, 2, and 3
    # Load a fresh model for each run
    model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

    # Run fine-tuning and evaluation with the specified parameters
    metrics = run_finetuning_and_evaluation(
        run_number=run_number,
        model=model,
        train_dataset=train_dataset_with_labels,
        eval_dataset=dev_features_dataset,
        original_eval_dataset=original_dev_dataset,
        tokenizer=tokenizer,
        finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_stride_maxlen_experiments'), # Specific output dir
        learning_rate=fixed_learning_rate,
        num_train_epochs=fixed_num_epochs,
        max_length=current_max_length, # Pass max_length
        stride=current_stride # Pass stride
    )
    all_results_config1.append(metrics)

# Calculate and store average metrics for this configuration
avg_exact_match_config1 = np.mean([result['exact_match'] for result in all_results_config1])
avg_f1_config1 = np.mean([result['f1'] for result in all_results_config1])
average_results_config1 = {'average_exact_match': avg_exact_match_config1, 'average_f1': avg_f1_config1}

print(f"\n--- Finished Experiment 4.1 (Part 1): Max Length={current_max_length}, Stride={current_stride} ---")
print(f"Average Exact Match: {avg_exact_match_config1:.2f}")
print(f"Average F1 Score: {avg_f1_config1:.2f}")

# Store results for later comparison
if 'all_stride_maxlen_average_results' not in locals():
    all_stride_maxlen_average_results = {}
all_stride_maxlen_average_results[f'maxlen_{current_max_length}_stride_{current_stride}'] = average_results_config1

--- Starting Experiment 4.1 (Part 1): Max Length=512, Stride=64 ---


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 64 ---
Training model...


Step,Training Loss
500,3.5794
1000,3.166
1500,2.6636
2000,2.458
2500,2.2115
3000,2.1097
3500,1.9458
4000,1.8308
4500,1.7471
5000,1.6424


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 29.876608033604622, 'f1': 57.8276665049525}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 64 ---
Training model...


Step,Training Loss
500,3.5805
1000,3.133
1500,2.6348
2000,2.4441
2500,2.1953
3000,2.1034
3500,1.9279
4000,1.8232
4500,1.7288
5000,1.6319


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 29.22026778682069, 'f1': 57.26764311074921}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(



--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 64 ---
Training model...


Step,Training Loss
500,3.5815
1000,3.1381
1500,2.6386
2000,2.4467
2500,2.1962
3000,2.1088
3500,1.9308
4000,1.8282
4500,1.7321
5000,1.6372


Training completed.
Evaluating model...


Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 29.561564715148332, 'f1': 57.26459775717648}

--- Finished Experiment 4.1 (Part 1): Max Length=512, Stride=64 ---
Average Exact Match: 29.55
Average F1 Score: 57.45


### Experiment 4.1: Influence of Stride (Part 2)

**Configuration:** `max_length = 512`, `stride = 256`

**Reasoning:** We are now testing a larger stride (256) compared to the baseline (128) and the previous configuration (64). This decreases the overlap between context chunks. This configuration is more computationally efficient as it generates fewer features per original example, but it might negatively impact performance if answers are frequently located at the boundaries of the original chunks and are missed due to less overlap.

The following cell will run the fine-tuning and evaluation process 3 times with these parameters, using the fixed Learning Rate of 5e-05 and 5 Epochs, and then calculate the average metrics.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
from tqdm.auto import tqdm

# Define the configuration for this run
current_max_length = 512
current_stride = 256 # Testing stride 256
fixed_learning_rate = 5e-5
fixed_num_epochs = 5

print(f"--- Starting Experiment 4.1 (Part 2): Max Length={current_max_length}, Stride={current_stride} ---")


# --- Start of copied code for functions from cell a9786d9f ---

# Re-define postprocess_qa_predictions function (copied from a9786d9f)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

    for qid, qa_info in tqdm(qas_by_id.items()):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}

        predictions[qid] = best_answer["text"]

    return predictions

# Re-define prepare_validation_features function (copied from a9786d9f, modified to use current_max_length and current_stride)
def prepare_validation_features(examples, tokenizer, max_length, stride): # Added tokenizer, max_length, stride as args
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length, # Use provided max_length
        stride=stride, # Use provided stride
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from a9786d9f, modified to accept max_length and stride)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs, max_length, stride): # Added max_length and stride as args
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate}, Epochs: {num_train_epochs}, Max Length: {max_length}, Stride: {stride} ---")
    # Include all hyperparameters in the output directory name for clarity
    run_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_stride_maxlen_experiments', f'lr_{learning_rate}', f'epochs_{num_train_epochs}', f'maxlen_{max_length}', f'stride_{stride}', f'run_{run_number}')

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError due to potential version incompatibility
        learning_rate=learning_rate,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        # load_best_model_at_last_step=True, # Removed due to TypeError due to potential version incompatibility
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    predictions = postprocess_qa_predictions(original_dev_dataset, eval_features_list, raw_predictions)

    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_dev_dataset: # Corrected variable name from original_eval_dataset to original_dev_dataset
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers] # Corrected answer_starts extraction
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied code for functions ---




# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation with the current max_length and stride
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    lambda examples: prepare_validation_features(examples, tokenizer, current_max_length, current_stride), # Pass tokenizer, max_length, stride
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Collect results from all runs for this configuration
all_results_config2 = []

# Run 3 fine-tuning and evaluation cycles
for run_number in range(1, 4): # Runs 1, 2, and 3
    # Load a fresh model for each run
    model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

    # Run fine-tuning and evaluation with the specified parameters
    metrics = run_finetuning_and_evaluation(
        run_number=run_number,
        model=model,
        train_dataset=train_dataset_with_labels,
        eval_dataset=dev_features_dataset,
        original_eval_dataset=original_dev_dataset,
        tokenizer=tokenizer,
        finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_stride_maxlen_experiments'), # Specific output dir
        learning_rate=fixed_learning_rate,
        num_train_epochs=fixed_num_epochs,
        max_length=current_max_length, # Pass max_length
        stride=current_stride # Pass stride
    )
    all_results_config2.append(metrics)

# Calculate and store average metrics for this configuration
avg_exact_match_config2 = np.mean([result['exact_match'] for result in all_results_config2])
avg_f1_config2 = np.mean([result['f1'] for result in all_results_config2])
average_results_config2 = {'average_exact_match': avg_exact_match_config2, 'average_f1': avg_f1_config2}

print(f"\n--- Finished Experiment 4.1 (Part 2): Max Length={current_max_length}, Stride={current_stride} ---")
print(f"Average Exact Match: {avg_exact_match_config2:.2f}")
print(f"Average F1 Score: {avg_f1_config2:.2f}")

# Store results for later comparison
if 'all_stride_maxlen_average_results' not in locals():
    all_stride_maxlen_average_results = {}
all_stride_maxlen_average_results[f'maxlen_{current_max_length}_stride_{current_stride}'] = average_results_config2

In [None]:
import pandas as pd

# Assuming all_lr_average_results is from cell 1ec19096 or 017801c2
# Assuming all_epoch_average_results is from cell 323b5a71
# Assuming all_stride_maxlen_average_results is from cells a9786d9f, 67105e6b, and the baseline from epoch experiment

# Collect all average results into a list of dictionaries
all_average_results_list = []

# Results from Experiment 1 (Learning Rate) - Epochs=3, MaxLength=512, Stride=128
# We need to ensure all_lr_average_results is available. If not, try to reconstruct from print statements or assume.
# Assuming all_lr_average_results is available from previous runs.
if 'all_lr_average_results' in locals():
    for lr, metrics in all_lr_average_results.items():
        all_average_results_list.append({
            'Experiment': 'Learning Rate (Exp 1)',
            'Learning Rate': lr,
            'Epochs': 3, # Fixed for this experiment
            'Max Length': 512, # Fixed for this experiment
            'Stride': 128, # Fixed for this experiment
            'Average Exact Match': metrics['average_exact_match'],
            'Average F1 Score': metrics['average_f1']
        })
else:
    print("Warning: 'all_lr_average_results' not found. Skipping results from Learning Rate Experiment.")


# Results from Experiment 2 (Epochs) - LR=5e-5, MaxLength=512, Stride=128
# Assuming all_epoch_average_results is available from previous runs.
if 'all_epoch_average_results' in locals():
     for epochs, metrics in all_epoch_average_results.items():
        all_average_results_list.append({
            'Experiment': 'Epochs (Exp 2)',
            'Learning Rate': 5e-5, # Fixed for this experiment
            'Epochs': epochs,
            'Max Length': 512, # Fixed for this experiment
            'Stride': 128, # Fixed for this experiment
            'Average Exact Match': metrics['average_exact_match'],
            'Average F1 Score': metrics['average_f1']
        })
else:
     print("Warning: 'all_epoch_average_results' not found. Skipping results from Epochs Experiment.")


# Results from Experiment 4.1 (Stride) - LR=5e-5, Epochs=5, MaxLength=512
# Assuming all_stride_maxlen_average_results is available from previous runs.
if 'all_stride_maxlen_average_results' in locals():
    for config_key, metrics in all_stride_maxlen_average_results.items():
        # Extract max_length and stride from the config_key (e.g., 'maxlen_512_stride_64')
        parts = config_key.split('_')
        max_length = int(parts[1])
        stride = int(parts[3])

        all_average_results_list.append({
            'Experiment': 'Stride (Exp 4.1)',
            'Learning Rate': 5e-5, # Fixed for this experiment
            'Epochs': 5, # Fixed for this experiment
            'Max Length': max_length,
            'Stride': stride,
            'Average Exact Match': metrics['average_exact_match'],
            'Average F1 Score': metrics['average_f1']
        })
else:
     print("Warning: 'all_stride_maxlen_average_results' not found. Skipping results from Stride Experiment.")


# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(all_average_results_list)

# Sort by Average F1 Score to find the best performing configuration
if not results_df.empty:
    best_config = results_df.loc[results_df['Average F1 Score'].idxmax()]

    print("--- Summary of All Experiment Average Results ---")
    display(results_df.sort_values(by='Average F1 Score', ascending=False))

    print("\n--- Best Performing Model Configuration (based on Average F1 Score) ---")
    display(best_config)
else:
    print("\nNo experiment results found to compare.")

--- Summary of All Experiment Average Results ---


Unnamed: 0,Experiment,Learning Rate,Epochs,Max Length,Stride,Average Exact Match,Average F1 Score
4,Epochs (Exp 2),5e-05,5,512,128,29.67533,57.483817
6,Stride (Exp 4.1),5e-05,5,512,64,29.552814,57.453302
5,Epochs (Exp 2),5e-05,10,512,128,29.167761,57.35865
7,Stride (Exp 4.1),5e-05,5,512,256,29.167761,57.267354
2,Learning Rate (Exp 1),5e-05,3,512,128,27.408769,56.102224
3,Epochs (Exp 2),5e-05,2,512,128,26.008576,54.244882
1,Learning Rate (Exp 1),3e-05,3,512,128,25.395992,52.720444
0,Learning Rate (Exp 1),1e-05,3,512,128,13.844404,37.766554



--- Best Performing Model Configuration (based on Average F1 Score) ---


Experiment             Epochs (Exp 2)
Learning Rate                 0.00005
Epochs                              5
Max Length                        512
Stride                            128
Average Exact Match          29.67533
Average F1 Score            57.483817
Name: 4, dtype: object

In [None]:
import os
from transformers import AutoModelForQuestionAnswering

# Assuming 'best_config' is populated from cell b2f32dee
# Assuming 'finetuning_path' is defined

# Extract hyperparameters of the best configuration from the best_config Series
best_lr = best_config['Learning Rate']
best_epochs = int(best_config['Epochs']) # Ensure epochs is int for path
best_maxlen = int(best_config['Max Length']) # Ensure maxlen is int for path
best_stride = int(best_config['Stride']) # Ensure stride is int for path

print(f"Loading model for best performing configuration: LR={best_lr}, Epochs={best_epochs}, Max Length={best_maxlen}, Stride={best_stride}")

# Define the base output directory for the Epochs experiment (where this config was run)
# Assuming the structure is finetuning_path/legal-bert-small-uncased-qa-finetuned_epochs_experiments/lr_{lr}/epochs_{epochs}/run_{run_number}/checkpoint-...
base_experiment_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_epochs_experiments')

# Define the configuration-specific directory path
config_output_dir = os.path.join(base_experiment_output_dir, f'lr_{best_lr}', f'epochs_{best_epochs}')

# Search for the latest checkpoint across all runs within this configuration's directory
latest_checkpoint_path = None
latest_step = -1

if os.path.exists(config_output_dir):
    print(f"Searching for checkpoints in: {config_output_dir}")
    # Walk through the configuration directory to find checkpoint folders
    for root, dirs, files in os.walk(config_output_dir):
        for dir_name in dirs:
            if dir_name.startswith("checkpoint-"):
                try:
                    step = int(dir_name.split("-")[-1])
                    if step > latest_step:
                        latest_step = step
                        latest_checkpoint_path = os.path.join(root, dir_name)
                except ValueError:
                    # Ignore directories that look like checkpoints but don't have a valid step number
                    pass

if latest_checkpoint_path:
    print(f"Found latest checkpoint at: {latest_checkpoint_path}")

    # Load the model from the latest checkpoint
    try:
        best_model = AutoModelForQuestionAnswering.from_pretrained(latest_checkpoint_path)

        # Define the path to save the overall best model
        overall_best_model_save_path = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best_overall')
        print(f"Saving the overall best model to: {overall_best_model_save_path}")

        # Save the overall best model
        best_model.save_pretrained(overall_best_model_save_path)

        print("Overall best model saved successfully.")

    except Exception as e:
        print(f"Error loading or saving model from checkpoint {latest_checkpoint_path}: {e}")

else:
    print(f"\nCould not find any model checkpoints in the directory for configuration LR={best_lr}, Epochs={best_epochs}, Max Length={best_maxlen}, Stride={best_stride}.")
    print("Please ensure that the experiment run for this configuration completed successfully and saved checkpoints.")

Loading model for best performing configuration: LR=5e-05, Epochs=5, Max Length=512, Stride=128
Searching for checkpoints in: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_epochs_experiments\lr_5e-05\epochs_5
Found latest checkpoint at: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_epochs_experiments\lr_5e-05\epochs_5\run_1\checkpoint-5330
Saving the overall best model to: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_best_overall
Overall best model saved successfully.


In [None]:
import os
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
import evaluate
import collections
from tqdm.auto import tqdm
import numpy as np # Needed for postprocessing

# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b
# Assuming the best model was saved to os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best_overall')

# Define the path where the best model was saved
best_model_save_path = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best_overall')

print(f"Loading the best performing model from: {best_model_save_path}")
# Load the best performing model
try:
    best_model = AutoModelForQuestionAnswering.from_pretrained(best_model_save_path)
    print("Best model loaded successfully.")
except Exception as e:
    print(f"Error loading best model: {e}")
    best_model = None # Set to None if loading fails


# Define the max_length and stride used for the best model's training and tokenization
# Based on the results in cell b2f32dee, the best configuration used these parameters:
best_model_max_length = 512
best_model_stride = 128


# Re-define prepare_validation_features function (copied from previous experiment cells)
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Prepare the dev dataset features using the parameters of the best model
original_dev_dataset = dataset['dev']

if best_model is not None:
    print(f"Preparing development dataset with Max Length={best_model_max_length} and Stride={best_model_stride}...")
    dev_features_for_analysis = original_dev_dataset.map(
        lambda examples: prepare_validation_features(examples, tokenizer, best_model_max_length, best_model_stride),
        batched=True,
        remove_columns=original_dev_dataset.column_names # Remove original columns
    )
    print("Development dataset prepared for analysis.")
    # print(dev_features_for_analysis) # Optional: display info about the prepared dataset
else:
    print("Skipping dataset preparation as the best model could not be loaded.")

Loading the best performing model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_best_overall
Best model loaded successfully.
Preparing development dataset with Max Length=512 and Stride=128...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Development dataset prepared for analysis.


In [None]:
import evaluate
from transformers import Trainer # Import Trainer as it's needed for prediction

if best_model is not None and 'dev_features_for_analysis' in locals():
    print("Running evaluation on the best model...")

    # We need a Trainer instance to run predict, but no training is needed.
    # We can create a minimal TrainingArguments just for prediction.
    from transformers import TrainingArguments
    # Use a temporary directory for output during prediction
    temp_output_dir = os.path.join(finetuning_path, 'temp_eval_output')
    prediction_args = TrainingArguments(
        output_dir=temp_output_dir,
        per_device_eval_batch_size=16, # Use a reasonable batch size for evaluation
        report_to="none", # Disable reporting
    )

    # Create a Trainer instance just for prediction
    trainer = Trainer(
        model=best_model,
        args=prediction_args,
        tokenizer=tokenizer,
        # No train_dataset is needed for prediction
        # No compute_metrics needed for prediction in this step
    )

    # Run prediction to get raw logits
    raw_predictions = trainer.predict(dev_features_for_analysis)
    print("Evaluation complete. Raw predictions obtained.")

    # Re-define postprocess_qa_predictions function (copied from b8c375e0)
    def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
        all_start_logits, all_end_logits = raw_predictions.predictions

        qas_by_id = {}
        for example in examples:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[feature["example_id"]].append(i)

        predictions = collections.OrderedDict()

        print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.")

        for qid, qa_info in tqdm(qas_by_id.items()):
            feature_indices = features_per_example[qid]
            min_null_score = None
            valid_answers = []
            context = qa_info["context"]

            for feature_index in feature_indices:
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                     continue
                offset_mapping = features[feature_index]["offset_mapping"]
                input_ids = features[feature_index]["input_ids"]
                cls_index = input_ids.index(tokenizer.cls_token_id)
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                if min_null_score is None or feature_null_score < min_null_score:
                    min_null_score = feature_null_score

                start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
                end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ):
                            continue
                        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                            continue
                        if input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id:
                             continue

                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index] + end_logits[end_index],
                                "text": context[start_char: end_char]
                            }
                        )

            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            else:
                best_answer = {"text": "", "score": 0.0}

            predictions[qid] = best_answer["text"]

        return predictions

    # Postprocess the raw predictions to get the predicted text answers
    # Need to convert dev_features_for_analysis to a list of dictionaries for postprocessing
    dev_features_list_for_postprocessing = [dev_features_for_analysis[i] for i in range(len(dev_features_for_analysis))]

    print("Post-processing predictions...")
    model_predictions = postprocess_qa_predictions(original_dev_dataset, dev_features_list_for_postprocessing, raw_predictions)
    print("Predictions post-processed.")


    # Prepare the references from the original dev dataset
    references = []
    for example in original_dev_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    # Compute metrics to get F1 and EM for each prediction
    metric = evaluate.load("squad")
    formatted_predictions_list = [{"id": k, "prediction_text": v} for k, v in model_predictions.items()]

    # We need to align predictions and references by ID to compare them easily
    # Create a dictionary of references by ID
    references_by_id = {ref['id']: ref for ref in references}

    # Create a list of tuples (prediction_id, prediction_text, reference_answers, exact_match, f1)
    comparison_list = []
    for pred in formatted_predictions_list:
        pred_id = pred['id']
        pred_text = pred['prediction_text']
        reference_data = references_by_id.get(pred_id)

        if reference_data:
            # Calculate EM and F1 for this single prediction
            # evaluate.compute expects lists of predictions and references, so we'll wrap them
            single_prediction_list = [pred]
            single_reference_list = [reference_data]
            scores = metric.compute(predictions=single_prediction_list, references=single_reference_list)

            comparison_list.append({
                'id': pred_id,
                'prediction_text': pred_text,
                'reference_answers': reference_data['answers']['text'], # List of possible ground truth answers
                'exact_match': scores['exact_match'],
                'f1': scores['f1']
            })
        else:
             print(f"Warning: No reference found for prediction ID: {pred_id}. Skipping comparison for this ID.")

    print("\nComparison of predictions and references created.")

    # Convert to DataFrame for easier analysis and display
    comparison_df = pd.DataFrame(comparison_list)

    # Identify incorrect predictions (e.g., Exact Match is 0)
    incorrect_predictions_df = comparison_df[comparison_df['exact_match'] == 0]

    print(f"\nFound {len(incorrect_predictions_df)} incorrect predictions (Exact Match = 0) out of {len(comparison_df)} total predictions.")

    # Optional: Display the first few incorrect predictions for manual inspection
    print("\nFirst 10 Incorrect Predictions (Exact Match = 0):")
    display(incorrect_predictions_df.head(10))

else:
    print("Best model or prepared dataset not available. Skipping error analysis.")

Running evaluation on the best model...


  trainer = Trainer(


Evaluation complete. Raw predictions obtained.
Post-processing predictions...
Post-processing predictions for 3809 questions from 3809 features.


  0%|          | 0/3809 [00:00<?, ?it/s]

Predictions post-processed.

Comparison of predictions and references created.

Found 2658 incorrect predictions (Exact Match = 0) out of 3809 total predictions.

First 10 Incorrect Predictions (Exact Match = 0):


Unnamed: 0,id,prediction_text,reference_answers,exact_match,f1
3,x3elyri8l6ol3zqr,"Contact information. For example, we might col...",[name and street address],0.0,32.0
4,hezwq0unnhmezwlw,We collect,[We collect information from and about you.],0.0,44.444444
5,i1matwosph3owqln,credit card number,"[Payment and billing information. For example,...",0.0,66.666667
7,v883988zklkj8bfe,we collect,[when you buy a ticket],0.0,0.0
9,wxoo1ii57ap6xerx,we collect,[we collect your credit card number],0.0,50.0
10,jpfv8krobtqfpoox,when you buy a ticket.,[buy a ticket.],0.0,66.666667
11,eplxx3whxemxs8oc,we collect,[collect],0.0,66.666667
12,hok8glgmaxuss2hq,on our website or on a third-party social medi...,[we collect information you post in a public s...,0.0,30.0
13,kem34fmbzuggp2h9,you post,[Information you post],0.0,80.0
14,vtnxyjwvd1iu3909,we collect,[we collect information],0.0,80.0


In [None]:
import pandas as pd

# Assuming 'comparison_df' is available from cell f42b6067

if 'comparison_df' in locals() and not comparison_df.empty:
    print("Analyzing incorrect predictions based on length...")

    # Calculate the length of the predicted text
    comparison_df['prediction_length'] = comparison_df['prediction_text'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0) # Length in tokens (words)

    # Calculate the length of the *first* ground truth answer (for simplicity in comparison)
    # Note: A more thorough analysis might consider the lengths of all reference answers.
    comparison_df['first_reference_length'] = comparison_df['reference_answers'].apply(lambda x: len(x[0].split()) if isinstance(x, list) and len(x) > 0 and isinstance(x[0], str) else 0) # Length in tokens (words)

    # Filter for incorrect predictions (Exact Match = 0)
    incorrect_predictions_df = comparison_df[comparison_df['exact_match'] == 0].copy() # Use .copy() to avoid SettingWithCopyWarning

    # Calculate the difference in length
    incorrect_predictions_df['length_difference'] = incorrect_predictions_df['first_reference_length'] - incorrect_predictions_df['prediction_length']

    # Sort incorrect predictions by the length difference (descending) to see cases where prediction is much shorter
    short_predictions_analysis_df = incorrect_predictions_df.sort_values(by='length_difference', ascending=False)

    print("\nIncorrect Predictions where the Predicted Answer is Much Shorter than the First Reference Answer:")

    # Display predictions where the predicted length is significantly shorter than the reference length
    # Define 'significantly shorter' as length_difference > N tokens, e.g., N=5
    min_length_difference = 5

    predictions_much_shorter_df = short_predictions_analysis_df[short_predictions_analysis_df['length_difference'] > min_length_difference]

    print(f"Found {len(predictions_much_shorter_df)} incorrect predictions where the predicted length is more than {min_length_difference} tokens shorter than the first reference answer.")

    # Display the first few examples
    display(predictions_much_shorter_df[['id', 'prediction_text', 'reference_answers', 'prediction_length', 'first_reference_length', 'length_difference']].head(10))

    # Optional: Display some statistics about length differences for incorrect predictions
    print("\nStatistics on Length Difference (Reference Length - Predicted Length) for Incorrect Predictions:")
    display(incorrect_predictions_df['length_difference'].describe())


else:
    print("Comparison DataFrame not available or empty. Cannot perform length analysis.")

Analyzing incorrect predictions based on length...

Incorrect Predictions where the Predicted Answer is Much Shorter than the First Reference Answer:
Found 695 incorrect predictions where the predicted length is more than 5 tokens shorter than the first reference answer.


Unnamed: 0,id,prediction_text,reference_answers,prediction_length,first_reference_length,length_difference
2723,hcid1m8va1znmk58,"Unless our sites and services contain the ""Pri...","[Unless our sites and services contain the ""Pr...",18,217,199
3014,yiymdcwi5hm889o0,The statements in this Privacy Policy about ou...,"[Because of federal law, we are not permitted ...",29,195,166
2667,oj74orycdqhpjcus,We cannot assume responsibility or liability f...,"[It is important to remember, however, that no...",14,174,160
1414,jvc6curen2o60h5h,law enforcement officials,[Law Enforcement. Kaleida Health may disclose ...,3,156,153
3286,226tdx1rdunb5mnw,Residents of the State of California,[Residents of the State of California may requ...,6,135,129
2719,xadcdbld6y6w90si,Children,[Our sites and services are intended for gener...,1,129,128
3423,utzleqbjyub50dsq,If you reside in California,"[If you reside in California, you may request ...",5,130,125
608,9bv19nbwx6m894xu,we may receive,"[If you choose to access, visit and/or use any...",3,126,123
2916,d7n4fxlxgj6u75n5,A permitted network advertiser,"[We allow select third parties, such as those ...",4,127,123
1378,680xrvtzzl96fa60,so they may carry out their public health acti...,[authorized public health officials (or a fore...,9,130,121



Statistics on Length Difference (Reference Length - Predicted Length) for Incorrect Predictions:


count    2658.000000
mean        3.295711
std        18.123126
min       -26.000000
25%        -5.000000
50%         0.000000
75%         6.000000
max       199.000000
Name: length_difference, dtype: float64

In [None]:
import pandas as pd

# Assuming 'predictions_much_shorter_df' is available from cell dd480c26
# Assuming 'original_dev_dataset' is available from previous cells
# Assuming 'comparison_df' is available from cell f42b6067

if 'predictions_much_shorter_df' in locals() and not predictions_much_shorter_df.empty:
    print("--- Strategic Error Analysis: Short Predictions ---")

    # Select a few strategic example IDs to investigate
    # Let's pick the one with the largest length difference, one with a moderate difference and non-zero F1, and one with low F1
    example_ids_to_investigate = [
        predictions_much_shorter_df.iloc[0]['id'], # The one with the largest length difference
        predictions_much_shorter_df[predictions_much_shorter_df['f1'] > 20].iloc[0]['id'] if not predictions_much_shorter_df[predictions_much_shorter_df['f1'] > 20].empty else None, # Moderate F1
        predictions_much_shorter_df[predictions_much_shorter_df['f1'] < 10].iloc[0]['id'] if not predictions_much_shorter_df[predictions_much_shorter_df['f1'] < 10].empty else None, # Low F1
    ]
    # Remove any None values if the filtering didn't find an example
    example_ids_to_investigate = [eid for eid in example_ids_to_investigate if eid is not None]


    # Retrieve and display the details for each selected example
    for i, example_id in enumerate(example_ids_to_investigate):
        # Find the original example in the dev dataset
        original_example = None
        for ex in original_dev_dataset:
            for para in ex["paragraphs"]:
                for qa in para["qas"]:
                    if qa["id"] == example_id:
                        original_example = {
                            "question": qa["question"],
                            "context": para["context"],
                            "answers": qa["answers"]
                        }
                        break
                if original_example:
                    break
            if original_example:
                break

        # Find the model's prediction and metrics from comparison_df
        prediction_details = comparison_df[comparison_df['id'] == example_id].iloc[0]

        if original_example:
            print(f"\n--- Investigating Example {i+1} (ID: {example_id}) ---")
            print(f"**Question:** {original_example['question']}")
            print(f"\n**Context:** ...{original_example['context']}...") # Display full context
            print(f"\n**Reference Answers:** {[ans['text'] for ans in original_example['answers']]}")
            print(f"**Model's Prediction:** '{prediction_details['prediction_text']}'")
            print(f"**Exact Match:** {prediction_details['exact_match']}")
            print(f"**F1 Score:** {prediction_details['f1']:.2f}")
            print(f"**Prediction Length:** {prediction_details['prediction_length']}, **Reference Length:** {prediction_details['first_reference_length']}")
            print("-" * 50)
        else:
            print(f"\nCould not find original example with ID: {example_id}")

else:
    print("DataFrame with short predictions ('predictions_much_shorter_df') not available or empty. Cannot perform strategic analysis.")

--- Strategic Error Analysis: Short Predictions ---

--- Investigating Example 1 (ID: hcid1m8va1znmk58) ---
**Question:** What is the company's policy towards children?

**Context:** ...Unless our sites and services contain the "Privacy Rights for California Minors in the Digital World" supplemental terms, our sites and services do not collect age from users under 18. If you reside in California and are a minor (you are under 18 years of age) and you are using a site or service that collects your age as a registration requirement and you submit content, please follow the instructions on the supplemental terms to request removal of public content.Please note that this removal does not ensure complete or comprehensive removal of the content or information posted on our sites and services if the content you posted has been shared or reposted. We are only obligated to remove content that you post, where you posted it. There are certain circumstances in which we do not have to remove your c

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
import collections
from tqdm.auto import tqdm
import pandas as pd

# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b
# Assuming the best model was saved to os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best_overall')

# Define the path where the best model was saved
best_model_save_path = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_best_overall')

print(f"Loading the best performing model from: {best_model_save_path}")
# Load the best performing model
try:
    best_model = AutoModelForQuestionAnswering.from_pretrained(best_model_save_path)
    print("Best model loaded successfully.")
except Exception as e:
    print(f"Error loading best model: {e}")
    best_model = None # Set to None if loading fails


# Define the max_length and stride used for the best model's training and tokenization
# Based on the results in cell b2f32dee
best_model_max_length = 512
best_model_stride = 128


# Re-define prepare_validation_features function (copied from previous experiment cells)
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Prepare the dev dataset features using the parameters of the best model
original_dev_dataset = dataset['dev']

if best_model is not None:
    print(f"Preparing development dataset with Max Length={best_model_max_length} and Stride={best_model_stride} for evaluation...")
    dev_features_for_analysis = original_dev_dataset.map(
        lambda examples: prepare_validation_features(examples, tokenizer, best_model_max_length, best_model_stride),
        batched=True,
        remove_columns=original_dev_dataset.column_names
    )
    print("Development dataset prepared.")

    # Run prediction once to get raw logits
    print("Running prediction on the best model to get raw logits...")
    # Use a temporary directory for output during prediction
    temp_output_dir = os.path.join(finetuning_path, 'temp_eval_output_postprocessing')
    prediction_args = TrainingArguments(
        output_dir=temp_output_dir,
        per_device_eval_batch_size=16,
        report_to="none",
    )

    trainer = Trainer(
        model=best_model,
        args=prediction_args,
        # tokenizer=tokenizer, # Removed due to TypeError
    )

    raw_predictions = trainer.predict(dev_features_for_analysis)
    print("Raw predictions obtained.")

    # Re-define postprocess_qa_predictions function (copied from f42b6067)
    # This function will be modified within the loop to use current max_answer_length and n_best_size
    def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
        all_start_logits, all_end_logits = raw_predictions.predictions

        qas_by_id = {}
        for example in examples:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[feature["example_id"]].append(i)

        predictions = collections.OrderedDict()

        # print(f"Post-processing predictions for {len(qas_by_id)} questions from {len(features)} features.") # Too verbose in loop

        for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"): # Add progress bar
            feature_indices = features_per_example[qid]
            min_null_score = None
            valid_answers = []
            context = qa_info["context"]

            for feature_index in feature_indices:
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                     continue
                offset_mapping = features[feature_index]["offset_mapping"]
                input_ids = features[feature_index]["input_ids"]
                # Need the tokenizer here to get the CLS token ID
                if hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id in input_ids:
                    cls_index = input_ids.index(tokenizer.cls_token_id)
                    feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                    if min_null_score is None or feature_null_score < min_null_score:
                        min_null_score = feature_null_score
                else:
                    feature_null_score = -float('inf')
                    if min_null_score is None or feature_null_score < min_null_score:
                        min_null_score = feature_null_score

                start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
                end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ):
                            continue
                        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                            continue
                        if hasattr(tokenizer, 'cls_token_id') and (input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id):
                             continue

                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index] + end_logits[end_index],
                                "text": context[start_char: end_char]
                            }
                        )

            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            else:
                best_answer = {"text": "", "score": 0.0}

            predictions[qid] = best_answer["text"]

        return predictions

    # Configurations to test (max_answer_length, n_best_size)
    postprocessing_configs = [
        (30, 20),  # Baseline
        (50, 20),  # Increased max_answer_length
        (100, 20), # Further increased max_answer_length
        (30, 40),  # Increased n_best_size
        (30, 60),  # Further increased n_best_size
        (100, 60), # Increased both
    ]

    results_postprocessing = []

    # Need to convert dev_features_for_analysis to a list of dictionaries for postprocessing
    dev_features_list_for_postprocessing = [dev_features_for_analysis[i] for i in range(len(dev_features_for_analysis))]


    print("\n--- Running Post-processing Experiment ---")
    for max_len, n_best in postprocessing_configs:
        print(f"\nTesting Configuration: max_answer_length={max_len}, n_best_size={n_best}")

        # Apply post-processing with the current configuration
        model_predictions = postprocess_qa_predictions(
            original_dev_dataset,
            dev_features_list_for_postprocessing,
            raw_predictions,
            n_best_size=n_best,
            max_answer_length=max_len
        )
        # print("Predictions post-processed.") # Too verbose in loop

        # Prepare the references from the original dev dataset (same for all configs)
        references = []
        for example in original_dev_dataset:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    answers = qa["answers"]
                    answer_texts = [ans["text"] for ans in answers]
                    answer_starts = [ans["answer_start"] for ans in answers]
                    references.append({
                        "id": qa["id"],
                        "answers": {
                            "text": answer_texts,
                            "answer_start": answer_starts
                        }
                    })

        # Compute metrics for the current configuration
        metric = evaluate.load("squad")
        formatted_predictions_list = [{"id": k, "prediction_text": v} for k, v in model_predictions.items()]

        # Ensure references are in the correct format (list of dicts)
        # If references is already a list of dicts, no change needed.
        # If it's a Dataset object or something else, convert it.
        # Assuming 'references' list is already in the correct format from the loop above.

        scores = metric.compute(predictions=formatted_predictions_list, references=references)

        results_postprocessing.append({
            'max_answer_length': max_len,
            'n_best_size': n_best,
            'exact_match': scores['exact_match'],
            'f1': scores['f1']
        })

        print(f"Metrics: Exact Match = {scores['exact_match']:.2f}, F1 = {scores['f1']:.2f}")


    print("\n--- Summary of Post-processing Experiment Results ---")
    # Convert results to DataFrame for display
    results_df_postprocessing = pd.DataFrame(results_postprocessing)
    display(results_df_postprocessing)

else:
    print("Best model or prepared dataset not available. Skipping post-processing experiment.")

Loading the best performing model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-finetuned_best_overall
Best model loaded successfully.
Preparing development dataset with Max Length=512 and Stride=128 for evaluation...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Development dataset prepared.
Running prediction on the best model to get raw logits...


Raw predictions obtained.

--- Running Post-processing Experiment ---

Testing Configuration: max_answer_length=30, n_best_size=20


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.22, F1 = 58.02

Testing Configuration: max_answer_length=50, n_best_size=20


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.27, F1 = 58.67

Testing Configuration: max_answer_length=100, n_best_size=20


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.22, F1 = 58.48

Testing Configuration: max_answer_length=30, n_best_size=40


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.22, F1 = 58.02

Testing Configuration: max_answer_length=30, n_best_size=60


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.22, F1 = 58.02

Testing Configuration: max_answer_length=100, n_best_size=60


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.22, F1 = 58.48

--- Summary of Post-processing Experiment Results ---


Unnamed: 0,max_answer_length,n_best_size,exact_match,f1
0,30,20,30.217905,58.017669
1,50,20,30.270412,58.667933
2,100,20,30.217905,58.47616
3,30,40,30.217905,58.017669
4,30,60,30.217905,58.017669
5,100,60,30.217905,58.47616


### Experiment 7 (Continued): Influence of Post-processing Parameters

**Objective:** To further investigate the impact of larger `max_answer_length` values and their interaction with `n_best_size` on the model's performance.

**Fixed Parameters:**
*   Learning Rate: 5e-05
*   Number of Epochs: 5
*   Maximum Sequence Length: 512
*   Stride: 128
*   Model: The best performing model.
*   Raw Predictions: Using the same raw predictions generated in the previous run (cell `ef3ccc03`).

**Additional Configurations to Test:**

*   **Config G:** `max_answer_length = 150`, `n_best_size = 20`
*   **Config H:** `max_answer_length = 200`, `n_best_size = 20`
*   **Config I:** `max_answer_length = 150`, `n_best_size = 40`
*   **Config J:** `max_answer_length = 150`, `n_best_size = 60`
*   **Config K:** `max_answer_length = 200`, `n_best_size = 60`

The following cell will apply each of these new post-processing configurations to the raw logits obtained in the previous run (cell `ef3ccc03`) and report the metrics for each.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
import collections
from tqdm.auto import tqdm
import pandas as pd

# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b
# Assuming 'best_model' is loaded from cell ef3ccc03 (or b8c375e0)
# Assuming 'dev_features_for_analysis' is prepared from cell ef3ccc03 (or b8c375e0)
# Assuming 'raw_predictions' is available from cell ef3ccc03
# Assuming 'original_dev_dataset' is available from previous cells


if 'best_model' in locals() and best_model is not None and 'dev_features_for_analysis' in locals() and 'raw_predictions' in locals():
    print("Continuing Post-processing Experiment with new configurations.")

    # Re-define postprocess_qa_predictions function (copied from ef3ccc03)
    # This function will be modified within the loop to use current max_answer_length and n_best_size
    def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
        all_start_logits, all_end_logits = raw_predictions.predictions

        qas_by_id = {}
        for example in examples:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

        features_per_example = collections.defaultdict(list)
        for i, feature in enumerate(features):
            features_per_example[feature["example_id"]].append(i)

        predictions = collections.OrderedDict()

        for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"): # Add progress bar
            feature_indices = features_per_example[qid]
            min_null_score = None
            valid_answers = []
            context = qa_info["context"]

            for feature_index in feature_indices:
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]
                if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                     continue
                offset_mapping = features[feature_index]["offset_mapping"]
                input_ids = features[feature_index]["input_ids"]
                if hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id in input_ids:
                    cls_index = input_ids.index(tokenizer.cls_token_id)
                    feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                    if min_null_score is None or feature_null_score < min_null_score:
                        min_null_score = feature_null_score
                else:
                    feature_null_score = -float('inf')
                    if min_null_score is None or feature_null_score < min_null_score:
                        min_null_score = feature_null_score

                start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
                end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ):
                            continue
                        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                            continue
                        if hasattr(tokenizer, 'cls_token_id') and (input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id):
                             continue

                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index] + end_logits[end_index],
                                "text": context[start_char: end_char]
                            }
                        )

            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            else:
                best_answer = {"text": "", "score": 0.0}

            predictions[qid] = best_answer["text"]

        return predictions


    # Additional Configurations to test (max_answer_length, n_best_size)
    postprocessing_configs_additional = [
        (150, 20), # Config G
        (200, 20), # Config H
        (150, 40), # Config I
        (150, 60), # Config J
        (200, 60), # Config K
    ]

    # Assuming results from the first part of the experiment are stored in results_postprocessing
    # If not, initialize it or load from a saved file if available.
    if 'results_postprocessing' not in locals():
        print("Warning: 'results_postprocessing' from the previous run not found. Starting a new results list.")
        results_postprocessing = [] # Initialize if not found


    # Need to convert dev_features_for_analysis to a list of dictionaries for postprocessing
    dev_features_list_for_postprocessing = [dev_features_for_analysis[i] for i in range(len(dev_features_for_analysis))]

    print("\n--- Running Additional Post-processing Configurations ---")
    for max_len, n_best in postprocessing_configs_additional:
        print(f"\nTesting Configuration: max_answer_length={max_len}, n_best_size={n_best}")

        # Apply post-processing with the current configuration
        model_predictions = postprocess_qa_predictions(
            original_dev_dataset,
            dev_features_list_for_postprocessing,
            raw_predictions, # Use the raw predictions from the previous run
            n_best_size=n_best,
            max_answer_length=max_len
        )

        # Prepare the references from the original dev dataset (same for all configs)
        references = []
        for example in original_dev_dataset:
            for paragraph in example["paragraphs"]:
                for qa in paragraph["qas"]:
                    answers = qa["answers"]
                    answer_texts = [ans["text"] for ans in answers]
                    answer_starts = [ans["answer_start"] for ans in answers]
                    references.append({
                        "id": qa["id"],
                        "answers": {
                            "text": answer_texts,
                            "answer_start": answer_starts
                        }
                    })

        # Compute metrics for the current configuration
        metric = evaluate.load("squad")
        formatted_predictions_list = [{"id": k, "prediction_text": v} for k, v in model_predictions.items()]

        scores = metric.compute(predictions=formatted_predictions_list, references=references)

        results_postprocessing.append({ # Append to the existing results list
            'max_answer_length': max_len,
            'n_best_size': n_best,
            'exact_match': scores['exact_match'],
            'f1': scores['f1']
        })

        print(f"Metrics: Exact Match = {scores['exact_match']:.2f}, F1 = {scores['f1']:.2f}")


    print("\n--- Summary of All Post-processing Experiment Results ---")
    # Convert results to DataFrame for display
    results_df_postprocessing_combined = pd.DataFrame(results_postprocessing)
    display(results_df_postprocessing_combined.sort_values(by='f1', ascending=False)) # Display sorted by F1

else:
    print("Required variables ('best_model', 'dev_features_for_analysis', 'raw_predictions') not available. Skipping additional post-processing configurations.")

Continuing Post-processing Experiment with new configurations.

--- Running Additional Post-processing Configurations ---

Testing Configuration: max_answer_length=150, n_best_size=20


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.27, F1 = 58.43

Testing Configuration: max_answer_length=200, n_best_size=20


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.32, F1 = 58.39

Testing Configuration: max_answer_length=150, n_best_size=40


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.27, F1 = 58.43

Testing Configuration: max_answer_length=150, n_best_size=60


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.27, F1 = 58.43

Testing Configuration: max_answer_length=200, n_best_size=60


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Metrics: Exact Match = 30.32, F1 = 58.39

--- Summary of All Post-processing Experiment Results ---


Unnamed: 0,max_answer_length,n_best_size,exact_match,f1
1,50,20,30.270412,58.667933
2,100,20,30.217905,58.47616
5,100,60,30.217905,58.47616
9,150,60,30.270412,58.425672
6,150,20,30.270412,58.425672
8,150,40,30.270412,58.425672
7,200,20,30.322919,58.393188
10,200,60,30.322919,58.393188
0,30,20,30.217905,58.017669
3,30,40,30.217905,58.017669


In [None]:
import pandas as pd

# Assuming 'results_df_postprocessing_combined' is available from cell c7bba18d

if 'results_df_postprocessing_combined' in locals() and not results_df_postprocessing_combined.empty:
    print("Identifying the best post-processing configuration...")

    # Find the row with the highest F1 score
    best_postprocessing_config_row = results_df_postprocessing_combined.loc[results_df_postprocessing_combined['f1'].idxmax()]

    # Extract the best max_answer_length and n_best_size
    best_postprocessing_max_answer_length = int(best_postprocessing_config_row['max_answer_length'])
    best_postprocessing_n_best_size = int(best_postprocessing_config_row['n_best_size'])
    best_postprocessing_f1 = best_postprocessing_config_row['f1']
    best_postprocessing_em = best_postprocessing_config_row['exact_match']


    print("\n--- Best Post-processing Configuration (based on F1 Score) ---")
    print(f"Max Answer Length: {best_postprocessing_max_answer_length}")
    print(f"N-Best Size: {best_postprocessing_n_best_size}")
    print(f"Corresponding F1 Score: {best_postprocessing_f1:.2f}")
    print(f"Corresponding Exact Match: {best_postprocessing_em:.2f}")

    # Store these best parameters in variables for future use
    # These variables are now available in the Colab environment
    print("\nBest post-processing parameters stored in variables:")
    print("  - best_postprocessing_max_answer_length")
    print("  - best_postprocessing_n_best_size")


else:
    print("Post-processing results DataFrame not available or empty. Cannot identify best configuration.")

Identifying the best post-processing configuration...

--- Best Post-processing Configuration (based on F1 Score) ---
Max Answer Length: 50
N-Best Size: 20
Corresponding F1 Score: 58.67
Corresponding Exact Match: 30.27

Best post-processing parameters stored in variables:
  - best_postprocessing_max_answer_length
  - best_postprocessing_n_best_size


### Experiment 3 (Continued): Hyperparameter Tuning - Batch Size

**Objective:** To investigate the impact of different training batch sizes on the model's performance (F1 and Exact Match) using the best hyperparameters found so far for other parameters.

**Fixed Parameters:**
*   Learning Rate: 5e-05
*   Number of Epochs: 5
*   Maximum Sequence Length: 512
*   Stride: 128
*   Evaluation Post-processing Parameters: Best found so far (Max Answer Length, N-Best Size)

**Configurations to Test (Batch Size):**

*   **Batch Size = 8**
*   **Batch Size = 16** (Baseline)
*   **Batch Size = 32** (Optional - may require more GPU memory)

The following cell will run the fine-tuning and evaluation process 3 times for each tested batch size, calculate the average metrics, and report the results for comparison.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
from tqdm.auto import tqdm
import pandas as pd

# Assuming 'finetuning_path' is already defined
# Assuming 'dataset' is the original dataset loaded from cell 1a33e97c
# Assuming 'tokenizer' is already loaded from cell 9d44554b

# Define the batch sizes to test
# Include 32 as an option, but be aware of potential memory issues.
batch_sizes_to_test = [8, 16]
# Add 32 if you want to test it and have sufficient GPU memory
# batch_sizes_to_test.append(32)

# Define the fixed best hyperparameters from previous experiments
fixed_learning_rate = 5e-5
fixed_num_epochs = 5
fixed_max_length = 512
fixed_stride = 128

all_batch_size_average_results = {}

print(f"--- Starting Batch Size Experiment (LR={fixed_learning_rate}, Epochs={fixed_num_epochs}, MaxLen={fixed_max_length}, Stride={fixed_stride}) ---")


# --- Start of copied code for functions with fixes ---

# Re-define postprocess_qa_predictions function (copied from previous cells, added tokenizer arg)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None):
    all_start_logits, all_end_logits = raw_predictions.predictions

    qas_by_id = {}
    for example in examples:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                qas_by_id[qa["id"]] = {"question": qa["question"], "context": paragraph["context"], "answers": qa["answers"]}

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[feature["example_id"]].append(i)

    predictions = collections.OrderedDict()

    for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"):
        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            if "offset_mapping" not in features[feature_index] or features[feature_index]["offset_mapping"] is None:
                 continue
            offset_mapping = features[feature_index]["offset_mapping"]
            input_ids = features[feature_index]["input_ids"]
            # Use tokenizer to find CLS token ID if available
            if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and tokenizer.cls_token_id in input_ids:
                cls_index = input_ids.index(tokenizer.cls_token_id)
                feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                if min_null_score is None or feature_null_score < min_null_score:
                    min_null_score = feature_null_score
            else:
                # Assign a very low score if no CLS token for null prediction
                feature_null_score = -float('inf')
                if min_null_score is None or feature_null_score < min_null_score:
                    min_null_score = feature_null_score


            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Check against CLS token ID if tokenizer and cls_token_id are available
                    if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and (input_ids[start_index] == tokenizer.cls_token_id or input_ids[end_index] == tokenizer.cls_token_id):
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        # Handle the null prediction case
        if min_null_score is not None:
            valid_answers.append({"score": min_null_score, "text": ""})


        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0} # Fallback if no valid answer or null score


        predictions[qid] = best_answer["text"]


    return predictions


# Re-define prepare_validation_features function (copied from previous cells)
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    for i in range(len(examples['paragraphs'])):
        for paragraph in examples['paragraphs'][i]:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                questions.append(question)
                contexts.append(context)
                question_ids.append(qa['id'])

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["offset_mapping"] = tokenized_examples.pop("offset_mapping")
    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]

    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from previous cells, fixed Trainer tokenizer arg)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs, max_length, stride, train_batch_size, eval_batch_size):
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate}, Epochs: {num_train_epochs}, Max Length: {max_length}, Stride: {stride}, Train Batch Size: {train_batch_size}, Eval Batch Size: {eval_batch_size} ---")
    # Include all hyperparameters in the output directory name for clarity
    run_output_dir = os.path.join(finetuning_output_dir, f'lr_{learning_rate}', f'epochs_{num_train_epochs}', f'maxlen_{max_length}', f'stride_{stride}', f'trainbs_{train_batch_size}', f'evalbs_{eval_batch_size}', f'run_{run_number}')


    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to TypeError
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01, # Fixed weight decay for this experiment
        push_to_hub=False,
        report_to="none",
        save_steps=10000,
        save_total_limit=1,
        # load_best_model_at_last_step=True, # Removed due to TypeError
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=True,
    )

    def compute_metrics(eval_pred):
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # tokenizer=tokenizer, # REMOVED THIS ARGUMENT
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    trainer.train()
    print("Training completed.")

    print("Evaluating model...")
    raw_predictions = trainer.predict(eval_dataset)

    if not isinstance(eval_dataset, list):
         eval_features_list = [eval_dataset[i] for i in range(len(eval_dataset))]
    else:
         eval_features_list = eval_dataset

    # Use the best post-processing parameters found in Experiment 7 for evaluation here
    # Assuming best_postprocessing_max_answer_length and best_postprocessing_n_best_size are available
    # If not, define default/baseline values
    max_ans_len = 30 if 'best_postprocessing_max_answer_length' not in locals() else best_postprocessing_max_answer_length
    n_best = 20 if 'best_postprocessing_n_best_size' not in locals() else best_postprocessing_n_best_size


    predictions = postprocess_qa_predictions(original_eval_dataset, eval_features_list, raw_predictions, n_best_size=n_best, max_answer_length=max_ans_len, tokenizer=tokenizer) # Pass tokenizer


    metric = evaluate.load("squad")
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
    references = []
    for example in original_dev_dataset:
        for paragraph in example["paragraphs"]:
            for qa in paragraph["qas"]:
                answers = qa["answers"]
                answer_texts = [ans["text"] for ans in answers]
                answer_starts = [ans["answer_start"] for ans in answers]
                references.append({
                    "id": qa["id"],
                    "answers": {
                        "text": answer_texts,
                        "answer_start": answer_starts
                    }
                })

    metrics = metric.compute(predictions=formatted_predictions, references=references)
    print(f"Evaluation Metrics for Run {run_number}: {metrics}")

    return metrics

# --- End of copied code for functions ---


# Load the tokenized training dataset with labels (needed for all runs)
tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)

# Prepare the dev dataset features for evaluation (needed for all runs)
original_dev_dataset = dataset['dev']
dev_features_dataset = original_dev_dataset.map(
    lambda examples: prepare_validation_features(examples, tokenizer, fixed_max_length, fixed_stride),
    batched=True,
    remove_columns=original_dev_dataset.column_names
)


# Iterate through each batch size value
for train_bs_to_run in batch_sizes_to_test:
    print(f"\n--- Running Experiment for Batch Size: {train_bs_to_run} ---")

    # Use the same batch size for evaluation for simplicity in this experiment
    eval_bs_to_run = train_bs_to_run

    all_results_bs = []

    # Run 3 fine-tuning and evaluation cycles for the current batch size
    for run_number in range(1, 4): # Runs 1, 2, and 3
        # Load a fresh model for each run
        model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')

        # Run fine-tuning and evaluation with the specified batch size
        metrics = run_finetuning_and_evaluation(
            run_number=run_number,
            model=model,
            train_dataset=train_dataset_with_labels,
            eval_dataset=dev_features_dataset,
            original_eval_dataset=original_dev_dataset,
            tokenizer=tokenizer, # Pass tokenizer here as it's needed by postprocess_qa_predictions inside run_finetuning_and_evaluation
            finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_batch_size_experiments'), # Specific output dir for batch size
            learning_rate=fixed_learning_rate,
            num_train_epochs=fixed_num_epochs,
            max_length=fixed_max_length,
            stride=fixed_stride,
            train_batch_size=train_bs_to_run, # Pass current train batch size
            eval_batch_size=eval_bs_to_run # Pass current eval batch size
        )
        all_results_bs.append(metrics)

    # Calculate and store average metrics for the current batch size
    avg_exact_match_bs = np.mean([result['exact_match'] for result in all_results_bs])
    avg_f1_bs = np.mean([result['f1'] for result in all_results_bs])
    all_batch_size_average_results[train_bs_to_run] = {'average_exact_match': avg_exact_match_bs, 'average_f1': avg_f1_bs}

    print(f"\n--- Finished Experiment for Batch Size: {train_bs_to_run} ---")
    print(f"Average Exact Match: {avg_exact_match_bs:.2f}")
    print(f"Average F1 Score: {avg_f1_bs:.2f}")


print("\n--- Summary of Batch Size Experiment Results ---")
print(f"Fixed Parameters: LR={fixed_learning_rate}, Epochs={fixed_num_epochs}, MaxLen={fixed_max_length}, Stride={fixed_stride}")
# Use the best post-processing parameters found in Experiment 7 for evaluation metric calculation
max_ans_len_eval_note = 30 if 'best_postprocessing_max_answer_length' not in locals() else best_postprocessing_max_answer_length
n_best_eval_note = 20 if 'best_postprocessing_n_best_size' not in locals() else best_postprocessing_n_best_size
print(f"Evaluation Metrics calculated using Post-processing: Max Answer Length={max_ans_len_eval_note}, N-Best Size={n_best_eval_note}")

print("Batch Size | Average Exact Match | Average F1 Score")
print("----------|-----------------------|-------------------")
for bs, results in all_batch_size_average_results.items():
    print(f"{bs:<10}| {results['average_exact_match']:<21.2f} | {results['average_f1']:<17.2f}")

--- Starting Batch Size Experiment (LR=5e-05, Epochs=5, MaxLen=512, Stride=128) ---

--- Running Experiment for Batch Size: 8 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 8, Eval Batch Size: 8 ---
Training model...


Step,Training Loss
500,3.6643
1000,3.4425
1500,3.1279
2000,2.8566
2500,2.5706
3000,2.4554
3500,2.3805
4000,2.3488
4500,2.1665
5000,1.9946


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 29.95536886321869, 'f1': 58.184454767774376}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 8, Eval Batch Size: 8 ---
Training model...


Step,Training Loss
500,3.6642
1000,3.4417
1500,3.1306
2000,2.8538
2500,2.5701
3000,2.443
3500,2.3746
4000,2.3459
4500,2.162
5000,1.9885


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 30.611709110002625, 'f1': 58.40811079508497}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 8, Eval Batch Size: 8 ---
Training model...


Step,Training Loss
500,3.6643
1000,3.4418
1500,3.1475
2000,2.8701
2500,2.58
3000,2.4567
3500,2.3855
4000,2.3547
4500,2.1697
5000,1.9942


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 30.29666579154634, 'f1': 58.3159239796338}

--- Finished Experiment for Batch Size: 8 ---
Average Exact Match: 30.29
Average F1 Score: 58.30

--- Running Experiment for Batch Size: 16 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16 ---
Training model...


Step,Training Loss
500,3.5805
1000,3.133
1500,2.6348
2000,2.4441
2500,2.1953
3000,2.1034
3500,1.9279
4000,1.8232
4500,1.7288
5000,1.6319


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 1: {'exact_match': 29.22026778682069, 'f1': 57.284716536776656}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16 ---
Training model...


Step,Training Loss
500,3.5805
1000,3.133
1500,2.6348
2000,2.4441
2500,2.1953
3000,2.1034
3500,1.9279
4000,1.8232
4500,1.7287
5000,1.6319


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 2: {'exact_match': 29.22026778682069, 'f1': 57.28958443099249}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16 ---
Training model...


Step,Training Loss
500,3.5805
1000,3.133
1500,2.6348
2000,2.4441
2500,2.1953
3000,2.1034
3500,1.9279
4000,1.8232
4500,1.7288
5000,1.6319


Training completed.
Evaluating model...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Evaluation Metrics for Run 3: {'exact_match': 29.22026778682069, 'f1': 57.284716536776656}

--- Finished Experiment for Batch Size: 16 ---
Average Exact Match: 29.22
Average F1 Score: 57.29

--- Summary of Batch Size Experiment Results ---
Fixed Parameters: LR=5e-05, Epochs=5, MaxLen=512, Stride=128
Evaluation Metrics calculated using Post-processing: Max Answer Length=50, N-Best Size=20
Batch Size | Average Exact Match | Average F1 Score
----------|-----------------------|-------------------
8         | 30.29                 | 58.30            
16        | 29.22                 | 57.29            


In [None]:
print(f"Best Model Configuration - Development Set Metrics:")
print(f"Exact Match: {best_postprocessing_em:.2f}")
print(f"F1 Score: {best_postprocessing_f1:.2f}")

Best Model Configuration - Development Set Metrics:
Exact Match: 30.27
F1 Score: 58.67


### Experiment 3 (Continued): Hyperparameter Tuning - Weight Decay

**Objective:** To investigate the impact of different `weight_decay` values on the model's performance (F1 and Exact Match), using the best hyperparameters found so far for other parameters. This experiment is being re-attempted with a corrected version of the training function based on successful prior runs.

**Fixed Parameters:**
*   Learning Rate: 5e-05
*   Number of Epochs: 5
*   Maximum Sequence Length: 512
*   Stride: 128
*   Batch Size (Train & Eval): 16
*   Optimizer: AdamW (Default)
*   Scheduler: Linear (Default)
*   Evaluation Post-processing Parameters: Best found so far (Max Answer Length, N-Best Size)

**Configurations to Test (Weight Decay):**

*   **Weight Decay = 0.0**
*   **Weight Decay = 0.005**
*   **Weight Decay = 0.01** (Baseline)
*   **Weight Decay = 0.05**

The following cell will run the fine-tuning and evaluation process 3 times for each tested weight decay value, calculate the average metrics, and report the results for comparison.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
# Import from collections.abc for Iterable
import collections.abc
from tqdm.auto import tqdm
import pandas as pd

# Ensure finetuning_path, dataset, and tokenizer are available from previous cells
# Example:
# finetuning_path = '/content/drive/MyDrive/policyqa_finetuning' # Adjust if necessary
# dataset = load_from_disk(os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small')) # Adjust path if necessary
# tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-small-uncased') # Adjust model if necessary

# Define the weight decay values to test
weight_decays_to_test = [0.0, 0.005, 0.01, 0.05]

# Define the fixed best hyperparameters from previous experiments
fixed_learning_rate = 5e-5
fixed_num_epochs = 5
fixed_max_length = 512
fixed_stride = 128
fixed_train_batch_size = 16 # Use batch size 16
fixed_eval_batch_size = 16   # Use batch size 16

all_weight_decay_average_results = {}

print(f"--- Starting Weight Decay Experiment (LR={fixed_learning_rate}, Epochs={fixed_num_epochs}, MaxLen={fixed_max_length}, Stride={fixed_stride}, TrainBS={fixed_train_batch_size}, EvalBS={fixed_eval_batch_size}) ---")


# --- Start of copied code for functions - Corrected Version ---

# Re-define postprocess_qa_predictions function (copied from previous cells, added tokenizer arg)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None):
    # Debugging print to check raw_predictions structure
    # print(f"DEBUG: Type of raw_predictions: {type(raw_predictions)}")
    # if hasattr(raw_predictions, 'predictions'):
    #     print(f"DEBUG: Type of raw_predictions.predictions: {type(raw_predictions.predictions)}")
    #     if isinstance(raw_predictions.predictions, (list, tuple)):
    #         print(f"DEBUG: Length of raw_predictions.predictions: {len(raw_predictions.predictions)}")
    #         if len(raw_predictions.predictions) > 0 and hasattr(raw_predictions.predictions[0], 'shape'):
    #              print(f"DEBUG: Shape of raw_predictions.predictions[0]: {raw_predictions.predictions[0].shape}")
    #         if len(raw_predictions.predictions) > 1 and hasattr(raw_predictions.predictions[1], 'shape'):
    #              print(f"DEBUG: Shape of raw_predictions.predictions[1]: {raw_predictions.predictions[1].shape}")
    # else:
    #     print("DEBUG: raw_predictions does not have a .predictions attribute.")


    # Ensure raw_predictions is a tuple/list of logits, handle Trainer output structure
    all_start_logits, all_end_logits = None, None
    if hasattr(raw_predictions, 'predictions') and isinstance(raw_predictions.predictions, (tuple, list)) and len(raw_predictions.predictions) == 2:
        all_start_logits, all_end_logits = raw_predictions.predictions
    elif isinstance(raw_predictions, (tuple, list)) and len(raw_predictions) == 2:
         # Fallback if raw_predictions is already the tuple of logits directly
         all_start_logits, all_end_logits = raw_predictions
    else:
         # If structure is still unexpected, print error and return empty
         print(f"Error: Unexpected raw_predictions structure for unpacking. Expected tuple/list of length 2, but got {type(raw_predictions.predictions)} with length {len(raw_predictions.predictions) if hasattr(raw_predictions, 'predictions') and isinstance(raw_predictions.predictions, (list, tuple)) else 'N/A'}.")
         return {} # Return empty predictions


    qas_by_id = {}
    # Assuming original_dev_dataset structure is a list of dictionaries with 'paragraphs'
    for example in examples:
        if "paragraphs" in example:
            for paragraph in example["paragraphs"]:
                if "qas" in paragraph:
                    for qa in paragraph["qas"]:
                         if "id" in qa:
                            qas_by_id[qa["id"]] = {"question": qa.get("question", ""), "context": paragraph.get("context", ""), "answers": qa.get("answers", [])}


    features_per_example = collections.defaultdict(list)
    # Ensure features is iterable and has example_id
    if isinstance(features, collections.abc.Iterable): # Fixed: Use collections.abc.Iterable
        # Check if it's a Dataset object
        if hasattr(features, 'features') and 'example_id' in features.features:
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        # Check if it's a list of dictionaries/objects with 'example_id'
        elif all(hasattr(f, "example_id") for f in features):
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        else:
             print("Error: Features dataset does not contain 'example_id' in expected format. Cannot proceed with post-processing.")
             return {} # Return empty predictions if mapping fails
    else:
        print("Error: Features object is not iterable. Cannot proceed with post-processing.")
        return {}


    predictions = collections.OrderedDict()

    for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"):
        if qid not in features_per_example:
             # print(f"Warning: No features found for example ID {qid}. Skipping.") # Suppress this warning for potentially large datasets
             continue

        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            if feature_index < 0 or feature_index >= len(all_start_logits) or feature_index >= len(all_end_logits) or feature_index >= len(features):
                 print(f"Warning: Feature index {feature_index} out of bounds for logits or features. Skipping.")
                 continue

            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index] # Get the actual feature object


            if "offset_mapping" not in feature or feature["offset_mapping"] is None:
                 # print(f"Warning: offset_mapping not found or is None for feature index {feature_index}. Skipping.") # Suppress this warning
                 continue

            offset_mapping = feature["offset_mapping"]

            # Get input_ids safely
            input_ids = feature.get("input_ids")
            if input_ids is None:
                 # print(f"Warning: input_ids not found or is None for feature index {feature_index}. Cannot check CLS token. Skipping null prediction scoring.") # Suppress this warning
                 cls_index = -1 # Invalid index
                 feature_null_score = -float('inf') # Cannot calculate null score

            else:
                 # Check against CLS token ID if tokenizer and cls_token_id are available and input_ids is not empty
                 if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and len(input_ids) > 0 and tokenizer.cls_token_id in input_ids:
                     try:
                         # Find all occurrences and take the first one as CLS index
                         cls_index = input_ids.index(tokenizer.cls_token_id)
                         feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                         if min_null_score is None or feature_null_score < min_null_score:
                             min_null_score = feature_null_score
                     except ValueError:
                         # Should not happen if tokenizer.cls_token_id in input_ids is true, but as a safeguard
                         cls_index = -1
                         feature_null_score = -float('inf')
                 else:
                     # Assign a very low score if no CLS token for null prediction (or input_ids missing)
                     cls_index = -1 # Invalid index
                     feature_null_score = -float('inf')
                     if min_null_score is None or feature_null_score < min_null_score:
                         min_null_score = feature_null_score


            # Ensure start/end logits and offset_mapping are non-empty and consistent size
            if not isinstance(start_logits, np.ndarray) or not isinstance(end_logits, np.ndarray) or start_logits.shape != end_logits.shape or start_logits.shape[0] != len(offset_mapping):
                 print(f"Warning: Logits or offset_mapping have inconsistent shapes for feature index {feature_index}. Skipping span predictions for this feature.")
                 continue


            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Ensure indices are within bounds of offset_mapping
                    if (
                        start_index < 0 or start_index >= len(offset_mapping)
                        or end_index < 0 or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    # Check against CLS token index if valid, or skip check if no CLS token/input_ids issue
                    if cls_index != -1 and (start_index == cls_index or end_index == cls_index):
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    # Ensure character indices are within context bounds and valid
                    if start_char < 0 or end_char < 0 or start_char > end_char or end_char > len(context):
                         # print(f"Warning: Invalid character indices for feature index {feature_index}. start_char={start_char}, end_char={end_char}, context_len={len(context)}. Skipping.") # Suppress this warning
                         continue


                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        # Handle the null prediction case - Add only after checking all features for this example
        # This logic should be outside the inner feature loop but inside the qid loop
        # Determine the best answer (span or null) for the QID
        # If min_null_score was never updated (e.g., no CLS token), treat it as negative infinity
        final_min_null_score = min_null_score if min_null_score is not None else -float('inf')


        if final_min_null_score > -float('inf') and (len(valid_answers) == 0 or final_min_null_score > max(ans["score"] for ans in valid_answers)):
             # Null prediction is better than any span prediction
             best_answer = {"text": "", "score": final_min_null_score}
        elif len(valid_answers) > 0:
             # Best span prediction
             best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
             # No valid span predictions and no valid null score
             best_answer = {"text": "", "score": 0.0} # Fallback


        predictions[qid] = best_answer["text"]


    return predictions


# Re-define prepare_validation_features function (copying from previous cells)
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    # Assuming examples is a dictionary-like object with 'paragraphs' list
    if 'paragraphs' not in examples or not isinstance(examples['paragraphs'], list):
        print("Error: Examples object does not contain 'paragraphs' list in expected format.")
        return {} # Return empty if structure is wrong

    for i in range(len(examples['paragraphs'])):
        paragraph_group = examples['paragraphs'][i] # Assuming each item in outer list is a group of paragraphs for one example
        if not isinstance(paragraph_group, list):
             print(f"Warning: Item {i} in examples['paragraphs'] is not a list of paragraphs. Skipping.")
             continue

        for paragraph in paragraph_group:
            if 'context' not in paragraph or 'qas' not in paragraph or not isinstance(paragraph['qas'], list):
                 print(f"Warning: Paragraph is missing 'context' or 'qas' list. Skipping.")
                 continue

            context = paragraph['context']
            for qa in paragraph['qas']:
                 if 'question' not in qa or 'id' not in qa:
                     print(f"Warning: QA object is missing 'question' or 'id'. Skipping.")
                     continue
                 question = qa['question']
                 questions.append(question)
                 contexts.append(context)
                 question_ids.append(qa['id'])


    # Handle potential empty inputs
    if not questions or not contexts:
        print("Warning: No valid questions/contexts found in examples.")
        # Return a minimal tokenized object structure to avoid downstream errors if possible
        return {
             'input_ids': [],
             'token_type_ids': [],
             'attention_mask': [],
             'overflow_to_sample_mapping': [],
             'offset_mapping': [],
             'example_id': []
        }


    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Ensure returned keys are handled
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping", None)
    offset_mapping = tokenized_examples.pop("offset_mapping", None)

    if sample_mapping is None or offset_mapping is None:
         print("Error: Tokenization did not return required keys (overflow_to_sample_mapping or offset_mapping). Cannot proceed.")
         return {}


    tokenized_examples["offset_mapping"] = offset_mapping

    # Ensure example_id mapping is correct and handles potential issues
    if len(sample_mapping) != len(tokenized_examples['input_ids']):
         print(f"Error: Mismatch between sample_mapping size ({len(sample_mapping)}) and tokenized features size ({len(tokenized_examples['input_ids'])}). Cannot map example_ids.")
         return {}

    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    return tokenized_examples


# Re-define run_finetuning_and_evaluation function (copied from previous cells, ensuring Trainer tokenizer arg and load_best_model_at_last_step are removed)
def run_finetuning_and_evaluation(run_number, model, train_dataset, eval_dataset, original_eval_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs, max_length, stride, train_batch_size, eval_batch_size, weight_decay): # Added weight_decay arg
    print(f"\n--- Starting Run {run_number} with LR: {learning_rate}, Epochs: {num_train_epochs}, Max Length: {max_length}, Stride: {stride}, Train Batch Size: {train_batch_size}, Eval Batch Size: {eval_batch_size}, Weight Decay: {weight_decay} ---")
    # Include all hyperparameters in the output directory name for clarity
    run_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_weight_decay_experiments', f'lr_{learning_rate}', f'epochs_{num_train_epochs}', f'maxlen_{max_length}', f'stride_{stride}', f'trainbs_{train_batch_size}', f'evalbs_{eval_batch_size}', f'wd_{weight_decay}', f'run_{run_number}')

    # Ensure output directory exists
    os.makedirs(run_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=run_output_dir,
        # evaluation_strategy="epoch", # Removed due to potential TypeError/incompatibility in some versions/setups
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay, # Use provided weight_decay
        push_to_hub=False,
        report_to="none",
        save_steps=10000, # Save checkpoints frequently
        save_total_limit=1, # Only keep the latest checkpoint
        # load_best_model_at_last_step=True, # REMOVED THIS ARGUMENT - Potential source of TypeError/incompatibility
       # metric_for_best_model="f1",
        #greater_is_better=True,
        fp16=True, # Enable mixed precision training
        dataloader_num_workers=2, # Add workers for faster data loading
        eval_strategy="steps", # Evaluate every X steps instead of epoch end
        eval_steps=500, # Evaluate every 500 steps
        logging_dir=os.path.join(run_output_dir, 'logs'), # Logging directory
        logging_steps=100, # Log every 100 steps
    )

    def compute_metrics(eval_pred):
        # This dummy function is needed by Trainer but actual metric computation
        # happens outside Trainer.predict and inside run_finetuning_and_evaluation
        return {}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        # tokenizer=tokenizer, # REMOVED THIS ARGUMENT - This is the fix for the persistent TypeError
        compute_metrics=compute_metrics,
    )

    print("Training model...")
    # Add error handling for training
    try:
        trainer.train()
        print("Training completed.")
    except Exception as e:
        print(f"Error during training run {run_number}: {e}")
        return {'exact_match': 0.0, 'f1': 0.0} # Return zero metrics on training failure


    print("Evaluating model...")
    # Add error handling for prediction
    try:
        raw_predictions = trainer.predict(eval_dataset)
        print("Prediction completed.")
    except Exception as e:
        print(f"Error during prediction run {run_number}: {e}")
        return {'exact_match': 0.0, 'f1': 0.0} # Return zero metrics on prediction failure


    # Ensure eval_dataset passed to postprocess_qa_predictions is the tokenized features
    # And original_eval_dataset is the raw dataset for ground truth
    # Adjusted to handle potentially different types/structures of eval_dataset
    eval_features_list = eval_dataset
    if hasattr(eval_dataset, 'features') and 'example_id' in eval_dataset.features:
         # If it's a Dataset object with features, pass it as is or convert if needed by postprocess
         # Assuming postprocess_qa_predictions expects an iterable of features
         pass # eval_features_list is already the Dataset object


    # Add error handling for post-processing
    try:
        # Use the best post-processing parameters found in Experiment 7 for evaluation here
        # Ensure these variables exist, use defaults if not
        max_ans_len = 30
        n_best = 20
        if 'best_postprocessing_max_answer_length' in locals():
             max_ans_len = best_postprocessing_max_answer_length
        if 'best_postprocessing_n_best_size' in locals():
             n_best = best_postprocessing_n_best_size


        predictions = postprocess_qa_predictions(
            original_eval_dataset, # Pass the original dataset
            eval_features_list,    # Pass the tokenized features dataset/list
            raw_predictions,
            n_best_size=n_best,
            max_answer_length=max_ans_len,
            tokenizer=tokenizer # Pass tokenizer here as it's needed by postprocess_qa_predictions
        )
        print("Post-processing completed.")

    except Exception as e:
        print(f"Error during post-processing run {run_number}: {e}")
        return {'exact_match': 0.0, 'f1': 0.0} # Return zero metrics on post-processing failure


    # Add error handling for metric computation
    try:
        metric = evaluate.load("squad")
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = []
        # Ensure original_dev_dataset (original_eval_dataset passed to function) has expected structure
        if not isinstance(original_eval_dataset, collections.abc.Iterable): # Fixed: Use collections.abc.Iterable
             print(f"Error: Original evaluation dataset is not iterable. Cannot prepare references.")
             return {'exact_match': 0.0, 'f1': 0.0} # Return zero metrics

        for example in original_eval_dataset:
            if "paragraphs" in example:
                for paragraph in example["paragraphs"]:
                    if "qas" in paragraph:
                        for qa in paragraph["qas"]:
                            if "id" in qa and "answers" in qa:
                                answers = qa["answers"]
                                if isinstance(answers, list):
                                    answer_texts = [ans.get("text", "") for ans in answers if isinstance(ans, dict)]
                                    answer_starts = [ans.get("answer_start", -1) for ans in answers if isinstance(ans, dict)]
                                    references.append({
                                        "id": qa["id"],
                                        "answers": {
                                            "text": answer_texts,
                                            "answer_start": answer_starts
                                        }
                                    })
                                else:
                                     print(f"Warning: Answers not in expected list format for QA ID {qa.get('id', 'N/A')}. Skipping.")
                            else:
                                print(f"Warning: QA object missing 'id' or 'answers'. Skipping QA.")
                    else:
                        print(f"Warning: Paragraph missing 'qas'. Skipping Paragraph.")
            else:
                print(f"Warning: Example missing 'paragraphs'. Skipping Example.")


        # Ensure predictions and references are not empty before computing metrics
        if not formatted_predictions or not references:
             print("Warning: No predictions or references available for metric computation. Returning zero metrics.")
             return {'exact_match': 0.0, 'f1': 0.0}


        metrics = metric.compute(predictions=formatted_predictions, references=references)
        print(f"Evaluation Metrics for Run {run_number}: {metrics}")

        return metrics

    except Exception as e:
        print(f"Error during metric computation run {run_number}: {e}")
        return {'exact_match': 0.0, 'f1': 0.0} # Return zero metrics on metric computation failure


# --- End of copied code for functions ---


# Load the tokenized training dataset with labels (needed for all runs)
# Add error handling for dataset loading
try:
    tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
    if os.path.exists(tokenized_train_dataset_path):
        train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)
        print(f"Loaded training dataset from {tokenized_train_dataset_path}")
    else:
        print(f"Error: Tokenized training dataset not found at {tokenized_train_dataset_path}. Please ensure it exists.")
        train_dataset_with_labels = None # Set to None to prevent errors later

except Exception as e:
    print(f"Error loading training dataset: {e}")
    train_dataset_with_labels = None


# Prepare the dev dataset features for evaluation (needed for all runs)
# Add error handling and structure checks for dev dataset
dev_features_dataset = None # Initialize to None
original_dev_dataset_loaded = None # Initialize to None
try:
    if 'dataset' in locals() and dataset is not None and 'dev' in dataset and dataset['dev'] is not None:
        original_dev_dataset_loaded = dataset['dev']
        print("Loaded original dev dataset.")

        if original_dev_dataset_loaded is not None: # Ensure it was loaded
             dev_features_dataset = original_dev_dataset_loaded.map(
                 lambda examples: prepare_validation_features(examples, tokenizer, fixed_max_length, fixed_stride),
                 batched=True,
                 remove_columns=original_dev_dataset_loaded.column_names # Use the loaded dataset variable
             )
             print("Prepared dev features dataset.")
        else:
             print("Error: Original dev dataset is None after checking 'dataset' variable.")


    else:
        print("Error: Original dataset variable 'dataset' or 'dataset['dev']' not found or is None. Cannot prepare dev features.")


except Exception as e:
    print(f"Error preparing dev features dataset: {e}")
    dev_features_dataset = None
    original_dev_dataset_loaded = None # Reset if preparation fails


# Check if datasets are loaded before proceeding with the experiment loop
if train_dataset_with_labels is not None and dev_features_dataset is not None and original_dev_dataset_loaded is not None:

    # Iterate through each weight decay value
    for wd_to_run in weight_decays_to_test:
        print(f"\n--- Running Experiment for Weight Decay: {wd_to_run} ---")

        all_results_wd = []

        # Run 3 fine-tuning and evaluation cycles for the current weight decay
        for run_number in range(1, 4): # Runs 1, 2, and 3
            # Load a fresh model for each run
            # Add error handling for model loading
            try:
                model = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')
                print(f"Loaded fresh model for run {run_number}.")
            except Exception as e:
                print(f"Error loading model for run {run_number}: {e}. Skipping run.")
                all_results_wd.append({'exact_match': 0.0, 'f1': 0.0}) # Append zero metrics for skipped run
                continue # Skip to next run


            # Run fine-tuning and evaluation with the specified weight decay
            metrics = run_finetuning_and_evaluation(
                run_number=run_number,
                model=model,
                train_dataset=train_dataset_with_labels,
                eval_dataset=dev_features_dataset,
                original_eval_dataset=original_dev_dataset_loaded, # Pass the loaded original dev dataset
                tokenizer=tokenizer, # Pass tokenizer here as it's needed by postprocess_qa_predictions inside run_finetuning_and_evaluation
                finetuning_output_dir=os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-finetuned_weight_decay_experiments'), # Specific output dir for weight decay
                learning_rate=fixed_learning_rate,
                num_train_epochs=fixed_num_epochs,
                max_length=fixed_max_length,
                stride=fixed_stride,
                train_batch_size=fixed_train_batch_size,
                eval_batch_size=fixed_eval_batch_size,
                weight_decay=wd_to_run # Pass current weight decay
            )
            all_results_wd.append(metrics)

        # Calculate and store average metrics for the current weight decay
        # Ensure all_results_wd is not empty before calculating mean
        if all_results_wd:
             avg_exact_match_wd = np.mean([result['exact_match'] for result in all_results_wd])
             avg_f1_wd = np.mean([result['f1'] for result in all_results_wd])
             all_weight_decay_average_results[wd_to_run] = {'average_exact_match': avg_exact_match_wd, 'average_f1': avg_f1_wd}
        else:
             print(f"No successful runs for Weight Decay: {wd_to_run}. Average metrics not calculated.")
             all_weight_decay_average_results[wd_to_run] = {'average_exact_match': 0.0, 'average_f1': 0.0}


        print(f"\n--- Finished Experiment for Weight Decay: {wd_to_run} ---")
        print(f"Average Exact Match: {all_weight_decay_average_results[wd_to_run]['average_exact_match']:.2f}")
        print(f"Average F1 Score: {all_weight_decay_average_results[wd_to_run]['average_f1']:.2f}")

    # Display summary of results for all tested weight decays
    print("\n--- Summary of Weight Decay Experiment Results ---")
    print(f"Fixed Parameters: LR={fixed_learning_rate}, Epochs={fixed_num_epochs}, MaxLen={fixed_max_length}, Stride={fixed_stride}, TrainBS={fixed_train_batch_size}, EvalBS={fixed_eval_batch_size}")
    max_ans_len_eval_note = 30 if 'best_postprocessing_max_answer_length' not in locals() else best_postprocessing_max_answer_length
    n_best_eval_note = 20 if 'best_postprocessing_n_best_size' not in locals() else best_postprocessing_n_best_size
    print(f"Evaluation Metrics calculated using Post-processing: Max Answer Length={max_ans_len_eval_note}, N-Best Size={n_best_eval_note}")

    print("Weight Decay | Average Exact Match | Average F1 Score")
    print("------------|-----------------------|-------------------")
    # Sort by weight decay for consistent display
    for wd in sorted(all_weight_decay_average_results.keys()):
        results = all_weight_decay_average_results[wd]
        print(f"{wd:<12}| {results['average_exact_match']:<21.2f} | {results['average_f1']:<17.2f}")

else:
     print("\nSkipping Weight Decay Experiment loop because required datasets were not loaded successfully.")

--- Starting Weight Decay Experiment (LR=5e-05, Epochs=5, MaxLen=512, Stride=128, TrainBS=16, EvalBS=16) ---
Loaded training dataset from C:/Users/user/PLUE/PLUE-main/data\tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels\train
Loaded original dev dataset.
Prepared dev features dataset.

--- Running Experiment for Weight Decay: 0.0 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 1.

--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.0 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4243,No log
1000,2.9261,No log
1500,2.5301,No log
2000,2.3823,No log
2500,2.108,No log
3000,2.1328,No log
3500,1.8292,No log
4000,1.8127,No log
4500,1.6496,No log
5000,1.6264,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 1: {'exact_match': 28.82646363875033, 'f1': 57.297375907433626}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 2.

--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.0 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4319,No log
1000,3.0456,No log
1500,2.5792,No log
2000,2.4423,No log
2500,2.1443,No log
3000,2.1679,No log
3500,1.8593,No log
4000,1.8576,No log
4500,1.6856,No log
5000,1.66,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 2: {'exact_match': 29.0889997374639, 'f1': 56.95343147615659}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 3.

--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.0 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4323,No log
1000,3.029,No log
1500,2.5408,No log
2000,2.4342,No log
2500,2.125,No log
3000,2.1529,No log
3500,1.867,No log
4000,1.8434,No log
4500,1.6651,No log
5000,1.6685,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 3: {'exact_match': 28.800210028878972, 'f1': 56.860753356124775}

--- Finished Experiment for Weight Decay: 0.0 ---
Average Exact Match: 28.91
Average F1 Score: 57.04

--- Running Experiment for Weight Decay: 0.005 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 1.

--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.005 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4295,No log
1000,3.0381,No log
1500,2.5651,No log
2000,2.4469,No log
2500,2.1439,No log
3000,2.1585,No log
3500,1.864,No log
4000,1.8556,No log
4500,1.6818,No log
5000,1.67,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 1: {'exact_match': 28.957731688107113, 'f1': 56.74953085172992}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 2.

--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.005 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4318,No log
1000,3.0555,No log
1500,2.5687,No log
2000,2.4453,No log
2500,2.147,No log
3000,2.1533,No log
3500,1.8718,No log
4000,1.8518,No log
4500,1.6745,No log
5000,1.6651,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 2: {'exact_match': 28.957731688107113, 'f1': 57.03776557709506}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 3.

--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.005 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4292,No log
1000,3.0307,No log
1500,2.549,No log
2000,2.4325,No log
2500,2.1358,No log
3000,2.1552,No log
3500,1.8607,No log
4000,1.8489,No log
4500,1.6624,No log
5000,1.6603,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 3: {'exact_match': 28.878970858493044, 'f1': 56.84246852075826}

--- Finished Experiment for Weight Decay: 0.005 ---
Average Exact Match: 28.93
Average F1 Score: 56.88

--- Running Experiment for Weight Decay: 0.01 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 1.

--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.01 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4303,No log
1000,3.0511,No log
1500,2.5723,No log
2000,2.4405,No log
2500,2.1441,No log
3000,2.1577,No log
3500,1.8648,No log
4000,1.8618,No log
4500,1.6791,No log
5000,1.6655,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 1: {'exact_match': 29.29902861643476, 'f1': 56.9768713135671}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 2.

--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.01 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4303,No log
1000,3.0508,No log
1500,2.5768,No log
2000,2.4406,No log
2500,2.1463,No log
3000,2.1602,No log
3500,1.8612,No log
4000,1.8611,No log
4500,1.6842,No log
5000,1.6693,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 2: {'exact_match': 28.852717248621687, 'f1': 56.92011217610865}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 3.

--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.01 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4303,No log
1000,3.0514,No log
1500,2.5732,No log
2000,2.4439,No log
2500,2.1445,No log
3000,2.1608,No log
3500,1.867,No log
4000,1.8591,No log
4500,1.6815,No log
5000,1.6646,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 3: {'exact_match': 29.246521396692046, 'f1': 57.057160445525795}

--- Finished Experiment for Weight Decay: 0.01 ---
Average Exact Match: 29.13
Average F1 Score: 56.98

--- Running Experiment for Weight Decay: 0.05 ---


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 1.

--- Starting Run 1 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.05 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4293,No log
1000,3.0463,No log
1500,2.5769,No log
2000,2.443,No log
2500,2.1543,No log
3000,2.1641,No log
3500,1.8646,No log
4000,1.8542,No log
4500,1.6797,No log
5000,1.6547,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 1: {'exact_match': 28.98398529797847, 'f1': 56.927989049328296}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 2.

--- Starting Run 2 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.05 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4302,No log
1000,3.0456,No log
1500,2.5716,No log
2000,2.4416,No log
2500,2.1465,No log
3000,2.1658,No log
3500,1.8633,No log
4000,1.8467,No log
4500,1.6777,No log
5000,1.6578,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 2: {'exact_match': 29.272775006563403, 'f1': 56.863700086036424}


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for run 3.

--- Starting Run 3 with LR: 5e-05, Epochs: 5, Max Length: 512, Stride: 128, Train Batch Size: 16, Eval Batch Size: 16, Weight Decay: 0.05 ---
Training model...


Step,Training Loss,Validation Loss
500,3.4307,No log
1000,3.0464,No log
1500,2.568,No log
2000,2.4333,No log
2500,2.136,No log
3000,2.1602,No log
3500,1.8726,No log
4000,1.8524,No log
4500,1.6746,No log
5000,1.6618,No log


Training completed.
Evaluating model...


Prediction completed.


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Evaluation Metrics for Run 3: {'exact_match': 28.51142032029404, 'f1': 56.77413412388233}

--- Finished Experiment for Weight Decay: 0.05 ---
Average Exact Match: 28.92
Average F1 Score: 56.86

--- Summary of Weight Decay Experiment Results ---
Fixed Parameters: LR=5e-05, Epochs=5, MaxLen=512, Stride=128, TrainBS=16, EvalBS=16
Evaluation Metrics calculated using Post-processing: Max Answer Length=50, N-Best Size=20
Weight Decay | Average Exact Match | Average F1 Score
------------|-----------------------|-------------------
0.0         | 28.91                 | 57.04            
0.005       | 28.93                 | 56.88            
0.01        | 29.13                 | 56.98            
0.05        | 28.92                 | 56.86            


In [None]:
    # Display summary of results for all tested weight decays
    print("\n--- Summary of Weight Decay Experiment Results ---")
    print(f"Fixed Parameters: LR={fixed_learning_rate}, Epochs={fixed_num_epochs}, MaxLen={fixed_max_length}, Stride={fixed_stride}, TrainBS={fixed_train_batch_size}, EvalBS={fixed_eval_batch_size}")
    max_ans_len_eval_note = 30 if 'best_postprocessing_max_answer_length' not in locals() else best_postprocessing_max_answer_length
    n_best_eval_note = 20 if 'best_postprocessing_n_best_size' not in locals() else best_postprocessing_n_best_size
    print(f"Evaluation Metrics calculated using Post-processing: Max Answer Length={max_ans_len_eval_note}, N-Best Size={n_best_eval_note}")

    print("Weight Decay | Average Exact Match | Average F1 Score")
    print("------------|-----------------------|-------------------")
    # Sort by weight decay for consistent display
    for wd in sorted(all_weight_decay_average_results.keys()):
        results = all_weight_decay_average_results[wd]
        print(f"{wd:<12}| {results['average_exact_match']:<21.2f} | {results['average_f1']:<17.2f}")


--- Summary of Weight Decay Experiment Results ---
Fixed Parameters: LR=5e-05, Epochs=5, MaxLen=512, Stride=128, TrainBS=16, EvalBS=16
Evaluation Metrics calculated using Post-processing: Max Answer Length=50, N-Best Size=20
Weight Decay | Average Exact Match | Average F1 Score
------------|-----------------------|-------------------
0.0         | 28.91                 | 57.04            
0.005       | 28.93                 | 56.88            
0.01        | 29.13                 | 56.98            
0.05        | 28.92                 | 56.86            


### Train Final Model with User's Best Parameters

**Objective:** Train a new model instance from scratch using the user-specified "best" training parameters (LR=5e-05, Epochs=5, Max Length=512, Stride=128, Train Batch Size=8, Eval Batch Size=8, Default Weight Decay/Optimizer/Scheduler). This model will be saved to a unique directory to denote its finality.

**Training Parameters:**
*   Learning Rate: 5e-05
*   Number of Epochs: 5
*   Maximum Sequence Length: 512
*   Stride: 128
*   Train Batch Size: 8
*   Eval Batch Size: 8
*   Weight Decay: Default
*   Optimizer: Default
*   Scheduler: Default

This cell will train the model and save it to the specified final path.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
import collections.abc # Import collections.abc for Iterable
from tqdm.auto import tqdm
import pandas as pd

# Ensure finetuning_path, dataset, and tokenizer are available from previous cells
# Example:
# finetuning_path = '/content/drive/MyDrive/policyqa_finetuning' # Adjust if necessary
# dataset = load_from_disk(os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small')) # Adjust path if necessary
# tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-small-uncased') # Adjust model if necessary


# Define the user-specified best hyperparameters
user_best_learning_rate = 5e-5
user_best_num_epochs = 5
user_best_max_length = 512
user_best_stride = 128
user_best_train_batch_size = 8 # User specified
user_best_eval_batch_size = 8   # User specified
# Use defaults for Weight Decay, Optimizer, Scheduler


print(f"--- Starting Final Fine-tuning Run with User's Best Parameters ---")
print(f"Parameters: LR={user_best_learning_rate}, Epochs={user_best_num_epochs}, Max Length: {user_best_max_length}, Stride: {user_best_stride}, Train Batch Size: {user_best_train_batch_size}, Eval Batch Size: {user_best_eval_batch_size}, Weight Decay: Default")


# --- Start of copied code for functions - Including all known fixes ---
# These functions are included here for self-containment for the training process.

# Define postprocess_qa_predictions function (Needed if evaluation_strategy="steps" is used, although removed for final train)
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None):
    all_start_logits, all_end_logits = None, None
    if hasattr(raw_predictions, 'predictions') and isinstance(raw_predictions.predictions, (tuple, list)) and len(raw_predictions.predictions) == 2:
        all_start_logits, all_end_logits = raw_predictions.predictions
    elif isinstance(raw_predictions, (tuple, list)) and len(raw_predictions) == 2:
         all_start_logits, all_end_logits = raw_predictions
    else:
         print(f"Error: Unexpected raw_predictions structure for unpacking. Expected tuple/list of length 2.")
         return {}


    qas_by_id = {}
    for example in examples:
        if "paragraphs" in example:
            for paragraph in example["paragraphs"]:
                if "qas" in paragraph:
                    for qa in paragraph["qas"]:
                         if "id" in qa:
                            qas_by_id[qa["id"]] = {"question": qa.get("question", ""), "context": paragraph.get("context", ""), "answers": qa.get("answers", [])}


    features_per_example = collections.defaultdict(list)
    if isinstance(features, collections.abc.Iterable):
        if hasattr(features, 'features') and 'example_id' in features.features:
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        elif all(hasattr(f, "example_id") for f in features):
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        else:
             print("Error: Features dataset does not contain 'example_id' in expected format. Cannot proceed with post-processing.")
             return {}
    else:
        print("Error: Features object is not iterable. Cannot proceed with post-processing.")
        return {}


    predictions = collections.OrderedDict()

    for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"):
        if qid not in features_per_example:
             continue

        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            if feature_index < 0 or feature_index >= len(all_start_logits) or feature_index >= len(all_end_logits) or feature_index >= len(features):
                 print(f"Warning: Feature index {feature_index} out of bounds for logits or features. Skipping.")
                 continue

            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]


            if "offset_mapping" not in feature or feature["offset_mapping"] is None:
                 continue

            offset_mapping = feature["offset_mapping"]

            input_ids = feature.get("input_ids")
            if input_ids is None:
                 cls_index = -1
                 feature_null_score = -float('inf')

            else:
                 if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and len(input_ids) > 0 and tokenizer.cls_token_id in input_ids:
                     try:
                         cls_index = input_ids.index(tokenizer.cls_token_id)
                         feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                         if min_null_score is None or feature_null_score < min_null_score:
                             min_null_score = feature_null_score
                     except ValueError:
                         cls_index = -1
                         feature_null_score = -float('inf')
                 else:
                     cls_index = -1
                     feature_null_score = -float('inf')
                     if min_null_score is None or feature_null_score < min_null_score:
                         min_null_score = feature_null_score


            if not isinstance(start_logits, np.ndarray) or not isinstance(end_logits, np.ndarray) or start_logits.shape != end_logits.shape or start_logits.shape[0] != len(offset_mapping):
                 print(f"Warning: Logits or offset_mapping have inconsistent shapes for feature index {feature_index}. Skipping span predictions for this feature.")
                 continue


            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index < 0 or start_index >= len(offset_mapping)
                        or end_index < 0 or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if cls_index != -1 and (start_index == cls_index or end_index == cls_index):
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    if start_char < 0 or end_char < 0 or start_char > end_char or end_char > len(context):
                         continue


                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        final_min_null_score = min_null_score if min_null_score is not None else -float('inf')


        if final_min_null_score > -float('inf') and (len(valid_answers) == 0 or final_min_null_score > max(ans["score"] for ans in valid_answers)):
             best_answer = {"text": "", "score": final_min_null_score}
        elif len(valid_answers) > 0:
             best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
             best_answer = {"text": "", "score": 0.0}


        predictions[qid] = best_answer["text"]


    return predictions


# Define prepare_validation_features function (Needed if evaluation_strategy="steps" is used, although removed for final train)
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    if 'paragraphs' not in examples or not isinstance(examples['paragraphs'], list):
        print("Error: Examples object does not contain 'paragraphs' list in expected format.")
        return {}

    for i in range(len(examples['paragraphs'])):
        paragraph_group = examples['paragraphs'][i]
        if not isinstance(paragraph_group, list):
             print(f"Warning: Item {i} in examples['paragraphs'] is not a list of paragraphs. Skipping.")
             continue

        for paragraph in paragraph_group:
            if 'context' not in paragraph or 'qas' not in paragraph or not isinstance(paragraph['qas'], list):
                 print(f"Warning: Paragraph is missing 'context' or 'qas' list. Skipping.")
                 continue

            context = paragraph['context']
            for qa in paragraph['qas']:
                 if 'question' not in qa or 'id' not in qa:
                     print(f"Warning: QA object is missing 'question' or 'id'. Skipping.")
                     continue
                 question = qa['question']
                 questions.append(question)
                 contexts.append(context)
                 question_ids.append(qa['id'])


    if not questions or not contexts:
        print("Warning: No valid questions/contexts found in examples.")
        return {
             'input_ids': [],
             'token_type_ids': [],
             'attention_mask': [],
             'overflow_to_sample_mapping': [],
             'offset_mapping': [],
             'example_id': []
        }


    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping", None)
    offset_mapping = tokenized_examples.pop("offset_mapping", None)

    if sample_mapping is None or offset_mapping is None:
         print("Error: Tokenization did not return required keys (overflow_to_to_sample_mapping or offset_mapping). Cannot proceed.") # Corrected typo here as well
         return {}


    tokenized_examples["offset_mapping"] = offset_mapping

    if len(sample_mapping) != len(tokenized_examples['input_ids']):
         print(f"Error: Mismatch between sample_mapping size ({len(sample_mapping)}) and tokenized features size ({len(tokenized_examples['input_ids'])}). Cannot map example_ids.")
         return {}

    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    return tokenized_examples


# Define run_final_finetuning function - Using default optimizer/scheduler
# This function will perform the training
def run_final_finetuning_user_params_final(model, train_dataset, tokenizer, finetuning_output_dir, learning_rate, num_train_epochs, max_length, stride, train_batch_size, eval_batch_size):
    print(f"\n--- Starting Final Fine-tuning Run with User's Best Parameters ---")
    print(f"Parameters: LR={learning_rate}, Epochs={num_train_epochs}, Max Length: {max_length}, Stride: {stride}, Train Batch Size: {train_batch_size}, Eval Batch Size: {eval_batch_size}, Weight Decay: Default")


    # Define a specific output directory for this final model based on user params
    # Using a new directory name to reflect finality and user params
    final_model_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-final-best-user-params')

    os.makedirs(final_model_output_dir, exist_ok=True)


    training_args = TrainingArguments(
        output_dir=final_model_output_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size, # Corrected variable name
        num_train_epochs=num_train_epochs,
        # weight_decay is NOT set, using default
        push_to_hub=False,
        report_to="none",
        save_steps=10000, # Save checkpoints frequently
    #    save_total_limit=1, # Only keep the latest checkpoint
     #   metric_for_best_model="f1", # Metric for potentially saving best, though eval_strategy is steps
        greater_is_better=True,
        fp16=True,
        dataloader_num_workers=2,
        logging_dir=os.path.join(final_model_output_dir, 'logs'),
        logging_steps=100,
        # optim and lr_scheduler_type are NOT set, using defaults
        save_strategy="epoch", # Save checkpoint at the end of each epoch
   #     save_total_limit=num_train_epochs, # Save all epoch checkpoints for safety
    )

    def compute_metrics(eval_pred):
         return {}


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    print("Training model...")
    try:
        trainer.train()
        print("Training completed.")
        print(f"Final model checkpoint saved to {final_model_output_dir}")

        # Explicitly save the final model in a 'final_model' subdirectory
        final_model_save_path = os.path.join(final_model_output_dir, 'final_model')
        trainer.save_model(final_model_save_path)
        print(f"Final model explicitly saved to {final_model_save_path}")


    except Exception as e:
        print(f"Error during final fine-tuning run: {e}")
        print("Final model training failed. Model checkpoint may not be saved.")


# --- End of copied code for functions ---


# Load the tokenized training dataset with labels
try:
    tokenized_train_dataset_path = os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels', 'train')
    if os.path.exists(tokenized_train_dataset_path):
        train_dataset_with_labels = load_from_disk(tokenized_train_dataset_path)
        print(f"Loaded training dataset from {tokenized_train_dataset_path}")
    else:
        print(f"Error: Tokenized training dataset not found at {tokenized_train_dataset_path}. Please ensure it exists.")
        train_dataset_with_labels = None

except Exception as e:
    print(f"Error loading training dataset: {e}")
    train_dataset_with_labels = None


# Load a fresh model instance for final training
model_for_final_training = None
try:
    model_for_final_training = AutoModelForQuestionAnswering.from_pretrained('nlpaueb/legal-bert-small-uncased')
    print("Loaded fresh model for final training.")
except Exception as e:
    print(f"Error loading base model for final training: {e}. Cannot proceed.")


# Run the final fine-tuning if datasets and model are loaded and tokenizer is available
if train_dataset_with_labels is not None and model_for_final_training is not None and 'tokenizer' in locals():
    run_final_finetuning_user_params_final( # Call the new function
        model=model_for_final_training,
        train_dataset=train_dataset_with_labels,
        tokenizer=tokenizer, # Pass tokenizer
        finetuning_output_dir=finetuning_path, # Base path for output
        learning_rate=user_best_learning_rate,
        num_train_epochs=user_best_num_epochs,
        max_length=user_best_max_length,
        stride=user_best_stride,
        train_batch_size=user_best_train_batch_size,
        eval_batch_size=user_best_eval_batch_size,
    )
elif 'tokenizer' not in locals():
     print("\nSkipping Final Fine-tuning because 'tokenizer' variable is not defined. Please run previous cells to define it.")
else:
    print("\nSkipping Final Fine-tuning because required training dataset or model could not be loaded.")

# After this cell successfully runs, the fine-tuned model will be saved
# in the 'legal-bert-small-uncased-qa-final-best-user-params/final_model' directory.
# The next step will be to evaluate this model on the dev set.

--- Starting Final Fine-tuning Run with User's Best Parameters ---
Parameters: LR=5e-05, Epochs=5, Max Length: 512, Stride: 128, Train Batch Size: 8, Eval Batch Size: 8, Weight Decay: Default
Loaded training dataset from C:/Users/user/PLUE/PLUE-main/data\tokenized_datasets_policyqa_finetuning_legalbert_small_with_labels\train


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at nlpaueb/legal-bert-small-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded fresh model for final training.

--- Starting Final Fine-tuning Run with User's Best Parameters ---
Parameters: LR=5e-05, Epochs=5, Max Length: 512, Stride: 128, Train Batch Size: 8, Eval Batch Size: 8, Weight Decay: Default
Training model...


Step,Training Loss
100,4.0094
200,3.6749
300,3.5704
400,3.5282
500,3.5403
600,3.4776
700,3.4954
800,3.4602
900,3.3966
1000,3.3675


Training completed.
Final model checkpoint saved to C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-final-best-user-params
Final model explicitly saved to C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-final-best-user-params\final_model


### Evaluate Final Model on Dev Set (User's Best Parameters)

**Objective:** Load the model trained with the user-specified "best" parameters (LR=5e-05, Epochs=5, Max Length=512, Stride=128, Train/Eval Batch Size=8, Default Weight Decay/Optimizer/Scheduler) and evaluate its performance on the **dev set** using the specified post-processing parameters (Max Answer Length=50, N-Best Size=20).

**Model to Load:** The model saved from the training cell (`legal-bert-small-uncased-qa-final-best-user-params/final_model`).

**Evaluation Process:**
1.  Attempt to load the saved model and tokenizer.
2.  If successful, load the original dev dataset and prepare the dev dataset features.
3.  Run prediction on the dev features.
4.  Apply post-processing using **Max Answer Length = 50** and **N-Best Size = 20**.
5.  Compute and display the SQuAD metrics (Exact Match and F1) on the dev set.

**Note:** This cell requires the training cell (cell 1217ed22) to have successfully completed and saved the model to the defined path. If the model is not found, the evaluation cannot proceed.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
import collections.abc # Import collections.abc for Iterable
from tqdm.auto import tqdm
import pandas as pd

# Ensure finetuning_path, dataset, and tokenizer are available from previous cells
# Example:
# finetuning_path = '/content/drive/MyDrive/policyqa_finetuning' # Adjust if necessary
# dataset = load_from_disk(os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small')) # Adjust path if necessary
# tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-small-uncased') # Adjust model if necessary


# --- Start of copied code for functions - Including all known fixes ---
# Note: These functions are included here for self-containment for evaluation.

# Define postprocess_qa_predictions function - Using specified post-processing params
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None): # Defaults are placeholders
    # Ensure raw_predictions is a tuple/list of logits, handle Trainer output structure
    all_start_logits, all_end_logits = None, None
    if hasattr(raw_predictions, 'predictions') and isinstance(raw_predictions.predictions, (tuple, list)) and len(raw_predictions.predictions) == 2:
        all_start_logits, all_end_logits = raw_predictions.predictions
    elif isinstance(raw_predictions, (tuple, list)) and len(raw_predictions) == 2:
         all_start_logits, all_end_logits = raw_predictions
    else:
         print(f"Error: Unexpected raw_predictions structure for unpacking. Expected tuple/list of length 2.")
         return {}


    qas_by_id = {}
    for example in examples:
        if "paragraphs" in example:
            for paragraph in example["paragraphs"]:
                if "qas" in paragraph:
                    for qa in paragraph["qas"]:
                         if "id" in qa:
                            qas_by_id[qa["id"]] = {"question": qa.get("question", ""), "context": paragraph.get("context", ""), "answers": qa.get("answers", [])}


    features_per_example = collections.defaultdict(list)
    if isinstance(features, collections.abc.Iterable):
        if hasattr(features, 'features') and 'example_id' in features.features:
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        elif all(hasattr(f, "example_id") for f in features):
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        else:
             print("Error: Features dataset does not contain 'example_id' in expected format. Cannot proceed with post-processing.")
             return {}
    else:
        print("Error: Features object is not iterable. Cannot proceed with post-processing.")
        return {}


    predictions = collections.OrderedDict()

    for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"):
        if qid not in features_per_example:
             continue

        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            if feature_index < 0 or feature_index >= len(all_start_logits) or feature_index >= len(all_end_logits) or feature_index >= len(features):
                 print(f"Warning: Feature index {feature_index} out of bounds for logits or features. Skipping.")
                 continue

            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]


            if "offset_mapping" not in feature or feature["offset_mapping"] is None:
                 continue

            offset_mapping = feature["offset_mapping"]

            input_ids = feature.get("input_ids")
            if input_ids is None:
                 cls_index = -1
                 feature_null_score = -float('inf')

            else:
                 if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and len(input_ids) > 0 and tokenizer.cls_token_id in input_ids:
                     try:
                         cls_index = input_ids.index(tokenizer.cls_token_id)
                         feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                         if min_null_score is None or feature_null_score < min_null_score:
                             min_null_score = feature_null_score
                     except ValueError:
                         cls_index = -1
                         feature_null_score = -float('inf')
                 else:
                     cls_index = -1
                     feature_null_score = -float('inf')
                     if min_null_score is None or feature_null_score < min_null_score:
                         min_null_score = feature_null_score


            if not isinstance(start_logits, np.ndarray) or not isinstance(end_logits, np.ndarray) or start_logits.shape != end_logits.shape or start_logits.shape[0] != len(offset_mapping):
                 print(f"Warning: Logits or offset_mapping have inconsistent shapes for feature index {feature_index}. Skipping span predictions for this feature.")
                 continue


            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index < 0 or start_index >= len(offset_mapping)
                        or end_index < 0 or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if cls_index != -1 and (start_index == cls_index or end_index == cls_index):
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    if start_char < 0 or end_char < 0 or start_char > end_char or end_char > len(context):
                         continue


                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        final_min_null_score = min_null_score if min_null_score is not None else -float('inf')


        if final_min_null_score > -float('inf') and (len(valid_answers) == 0 or final_min_null_score > max(ans["score"] for ans in valid_answers)):
             best_answer = {"text": "", "score": final_min_null_score}
        elif len(valid_answers) > 0:
             best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
             best_answer = {"text": "", "score": 0.0}


        predictions[qid] = best_answer["text"]


    return predictions


# Define prepare_validation_features function
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    if 'paragraphs' not in examples or not isinstance(examples['paragraphs'], list):
        print("Error: Examples object does not contain 'paragraphs' list in expected format.")
        return {}

    for i in range(len(examples['paragraphs'])):
        paragraph_group = examples['paragraphs'][i]
        if not isinstance(paragraph_group, list):
             print(f"Warning: Item {i} in examples['paragraphs'] is not a list of paragraphs. Skipping.")
             continue

        for paragraph in paragraph_group:
            if 'context' not in paragraph or 'qas' not in paragraph or not isinstance(paragraph['qas'], list):
                 print(f"Warning: Paragraph is missing 'context' or 'qas' list. Skipping.")
                 continue

            context = paragraph['context']
            for qa in paragraph['qas']:
                 if 'question' not in qa or 'id' not in qa:
                     print(f"Warning: QA object is missing 'question' or 'id'. Skipping.")
                     continue
                 question = qa['question']
                 questions.append(question)
                 contexts.append(context)
                 question_ids.append(qa['id'])


    if not questions or not contexts:
        print("Warning: No valid questions/contexts found in examples.")
        return {
             'input_ids': [],
             'token_type_ids': [],
             'attention_mask': [],
             'overflow_to_sample_mapping': [],
             'offset_mapping': [],
             'example_id': []
        }


    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping", None)
    offset_mapping = tokenized_examples.pop("offset_mapping", None)

    if sample_mapping is None or offset_mapping is None:
         print("Error: Tokenization did not return required keys (overflow_to_sample_mapping or offset_mapping). Cannot proceed.")
         return {}


    tokenized_examples["offset_mapping"] = offset_mapping

    if len(sample_mapping) != len(tokenized_examples['input_ids']):
         print(f"Error: Mismatch between sample_mapping size ({len(sample_mapping)}) and tokenized features size ({len(tokenized_examples['input_ids'])}). Cannot map example_ids.")
         return {}

    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    return tokenized_examples

# --- End of copied code for functions ---


# Define the path where the final model with user's best params should be saved
final_model_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-final-best-user-params')
final_model_path = os.path.join(final_model_output_dir, 'final_model') # Assuming it's saved in 'final_model' subdirectory


print(f"Attempting to load model from: {final_model_path}")

# Check if the model path exists before attempting to load
if not os.path.exists(final_model_path):
    print(f"Error: Model path not found at {final_model_path}. Please ensure the training cell (cell 1217ed22) ran successfully and saved the model to this location.")
else:
    try:
        # Load the fine-tuned model and tokenizer
        model = AutoModelForQuestionAnswering.from_pretrained(final_model_path)
        print("Model loaded successfully.")

        # Assuming tokenizer is available from previous cells as 'tokenizer'
        if 'tokenizer' not in locals():
             print("Error: 'tokenizer' variable not found. Cannot proceed with evaluation.")
        else:
             # Load the original dev dataset (since user requested dev set evaluation)
             # Ensure 'dataset' variable is available from previous cells
             if 'dataset' in locals() and dataset is not None and 'dev' in dataset and dataset['dev'] is not None:
                  original_dev_dataset = dataset['dev'] # Use dev dataset
                  print("Loaded original dev dataset.")

                  # Prepare the dev dataset features for evaluation
                  # Using the user-specified Max Length and Stride for feature prep (512, 128)
                  max_len_eval = 512
                  stride_eval = 128


                  dev_features_dataset = original_dev_dataset.map(
                      lambda examples: prepare_validation_features(examples, tokenizer, max_len_eval, stride_eval),
                      batched=True,
                      remove_columns=original_dev_dataset.column_names
                  )
                  print("Prepared dev features dataset.")

                  # Set up a dummy Trainer for prediction (evaluation logic is outside Trainer)
                  # Need TrainingArguments even for prediction
                  # Using a temporary directory for prediction output
                  prediction_output_dir = os.path.join(finetuning_path, 'prediction_output_temp_final_eval_dev') # Use a unique temp dir name
                  os.makedirs(prediction_output_dir, exist_ok=True)

                  # Dummy TrainingArguments for prediction context
                  # Use the user-specified eval batch size (8)
                  prediction_args = TrainingArguments(
                      output_dir=prediction_output_dir,
                      per_device_eval_batch_size=8, # Use user-specified eval batch size (8)
                      report_to="none",
                      push_to_hub=False,
                  )


                  trainer = Trainer(
                      model=model,
                      args=prediction_args,
                      # tokenizer=tokenizer, # Removed this argument
                  )

                  print("Running prediction on dev dataset...")
                  # Run prediction
                  raw_predictions = trainer.predict(dev_features_dataset)
                  print("Prediction completed.")

                  # Apply post-processing using user-specified parameters
                  print("Applying post-processing with Max Answer Length=50, N-Best Size=20...")
                  user_max_ans_len = 50 # User specified
                  user_n_best = 20      # User specified


                  predictions = postprocess_qa_predictions(
                      original_dev_dataset, # Pass the original dataset (dev set)
                      dev_features_dataset,    # Pass the tokenized features (dev features)
                      raw_predictions,
                      n_best_size=user_n_best, # Use user-specified N-Best Size
                      max_answer_length=user_max_ans_len, # Use user-specified Max Answer Length
                      tokenizer=tokenizer # Pass tokenizer here as it's needed by postprocess_qa_predictions
                  )
                  print("Post-processing completed.")

                  # Compute metrics
                  print("Computing metrics...")
                  metric = evaluate.load("squad")
                  formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
                  references = []

                  # Prepare references (ensure original_dev_dataset structure is handled)
                  if not isinstance(original_dev_dataset, collections.abc.Iterable):
                      print(f"Error: Original dev dataset is not iterable. Cannot prepare references.")
                  else:
                      for example in original_dev_dataset:
                          if "paragraphs" in example:
                              for paragraph in example["paragraphs"]:
                                  if "qas" in paragraph:
                                      for qa in paragraph["qas"]:
                                          if "id" in qa and "answers" in qa:
                                              answers = qa["answers"]
                                              if isinstance(answers, list):
                                                  answer_texts = [ans.get("text", "") for ans in answers if isinstance(ans, dict)]
                                                  answer_starts = [ans.get("answer_start", -1) for ans in answers if isinstance(ans, dict)]
                                                  references.append({
                                                      "id": qa["id"],
                                                      "answers": {
                                                          "text": answer_texts,
                                                          "answer_start": answer_starts
                                                      }
                                                  })
                                              else:
                                                   print(f"Warning: Answers not in expected list format for QA ID {qa.get('id', 'N/A')}. Skipping.")
                                          else:
                                              print(f"Warning: QA object missing 'id' or 'answers'. Skipping QA.")
                                  else:
                                      print(f"Warning: Paragraph missing 'qas'. Skipping Paragraph.")
                          else:
                              print(f"Warning: Example missing 'paragraphs'. Skipping Example.")


                  # Ensure predictions and references are not empty before computing metrics
                  if formatted_predictions and references:
                      metrics = metric.compute(predictions=formatted_predictions, references=references)
                      print("\n--- Evaluation Results on Dev Set (Final User's Best Params) ---")
                      print(f"Exact Match: {metrics['exact_match']:.2f}")
                      print(f"F1 Score: {metrics['f1']:.2f}")
                      print("-" * 50)
                  else:
                      print("\nWarning: No predictions or references available for metric computation.")

    except Exception as e:
        print(f"An error occurred during evaluation: {e}")
        print("Evaluation failed. Please check the error message and ensure the model was trained and saved correctly.")

# Clean up temporary prediction output directory
# import shutil
# if os.path.exists(prediction_output_dir):
#      shutil.rmtree(prediction_output_dir)
#      print(f"Cleaned up temporary prediction output directory: {prediction_output_dir}")

Attempting to load model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-final-best-user-params\final_model
Model loaded successfully.
Loaded original dev dataset.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Prepared dev features dataset.
Running prediction on dev dataset...


Prediction completed.
Applying post-processing with Max Answer Length=50, N-Best Size=20...


Post-processing:   0%|          | 0/3809 [00:00<?, ?it/s]

Post-processing completed.
Computing metrics...

--- Evaluation Results on Dev Set (Final User's Best Params) ---
Exact Match: 30.01
F1 Score: 58.25
--------------------------------------------------


### Evaluate Final Model on Test Set (User's Best Parameters)

**Objective:** Load the model trained with the user-specified "best" parameters (LR=5e-05, Epochs=5, Max Length=512, Stride=128, Train/Eval Batch Size=8, Default Weight Decay/Optimizer/Scheduler) and evaluate its performance on the **test set** using the specified post-processing parameters (Max Answer Length=50, N-Best Size=20).

**Model to Load:** The model saved from the training cell (`legal-bert-small-uncased-qa-final-best-user-params/final_model`).

**Evaluation Process:**
1.  Attempt to load the saved model and tokenizer.
2.  If successful, load the original test dataset and prepare the test dataset features.
3.  Run prediction on the test features.
4.  Apply post-processing using **Max Answer Length = 50** and **N-Best Size = 20**.
5.  Compute and display the SQuAD metrics (Exact Match and F1) on the **test set**.

**Note:** This cell requires the training cell (cell 1217ed22) to have successfully completed and saved the model to the defined path. If the model is not found, the evaluation cannot proceed.

In [None]:
import os
import evaluate
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer
import collections
import collections.abc # Import collections.abc for Iterable
from tqdm.auto import tqdm
import pandas as pd

# Ensure finetuning_path, dataset, and tokenizer are available from previous cells
# Example:
# finetuning_path = '/content/drive/MyDrive/policyqa_finetuning' # Adjust if necessary
# dataset = load_from_disk(os.path.join(finetuning_path, 'tokenized_datasets_policyqa_finetuning_legalbert_small')) # Adjust path if necessary
# tokenizer = AutoTokenizer.from_pretrained('nlpaueb/legal-bert-small-uncased') # Adjust model if necessary


# --- Start of copied code for functions - Including all known fixes ---
# Note: These functions are included here for self-containment for evaluation.

# Define postprocess_qa_predictions function - Using specified post-processing params
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30, tokenizer=None): # Defaults are placeholders
    # Ensure raw_predictions is a tuple/list of logits, handle Trainer output structure
    all_start_logits, all_end_logits = None, None
    if hasattr(raw_predictions, 'predictions') and isinstance(raw_predictions.predictions, (tuple, list)) and len(raw_predictions.predictions) == 2:
        all_start_logits, all_end_logits = raw_predictions.predictions
    elif isinstance(raw_predictions, (tuple, list)) and len(raw_predictions) == 2:
         all_start_logits, all_end_logits = raw_predictions
    else:
         print(f"Error: Unexpected raw_predictions structure for unpacking. Expected tuple/list of length 2.")
         return {}


    qas_by_id = {}
    for example in examples:
        if "paragraphs" in example:
            for paragraph in example["paragraphs"]:
                if "qas" in paragraph:
                    for qa in paragraph["qas"]:
                         if "id" in qa:
                            qas_by_id[qa["id"]] = {"question": qa.get("question", ""), "context": paragraph.get("context", ""), "answers": qa.get("answers", [])}


    features_per_example = collections.defaultdict(list)
    if isinstance(features, collections.abc.Iterable):
        if hasattr(features, 'features') and 'example_id' in features.features:
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        elif all(hasattr(f, "example_id") for f in features):
             for i, feature in enumerate(features):
                 features_per_example[feature["example_id"]].append(i)
        else:
             print("Error: Features dataset does not contain 'example_id' in expected format. Cannot proceed with post-processing.")
             return {}
    else:
        print("Error: Features object is not iterable. Cannot proceed with post-processing.")
        return {}


    predictions = collections.OrderedDict()

    for qid, qa_info in tqdm(qas_by_id.items(), desc="Post-processing"):
        if qid not in features_per_example:
             continue

        feature_indices = features_per_example[qid]
        min_null_score = None
        valid_answers = []
        context = qa_info["context"]

        for feature_index in feature_indices:
            if feature_index < 0 or feature_index >= len(all_start_logits) or feature_index >= len(all_end_logits) or feature_index >= len(features):
                 print(f"Warning: Feature index {feature_index} out of bounds for logits or features. Skipping.")
                 continue

            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            feature = features[feature_index]


            if "offset_mapping" not in feature or feature["offset_mapping"] is None:
                 continue

            offset_mapping = feature["offset_mapping"]

            input_ids = feature.get("input_ids")
            if input_ids is None:
                 cls_index = -1
                 feature_null_score = -float('inf')

            else:
                 if tokenizer is not None and hasattr(tokenizer, 'cls_token_id') and tokenizer.cls_token_id is not None and len(input_ids) > 0 and tokenizer.cls_token_id in input_ids:
                     try:
                         cls_index = input_ids.index(tokenizer.cls_token_id)
                         feature_null_score = start_logits[cls_index] + end_logits[cls_index]
                         if min_null_score is None or feature_null_score < min_null_score:
                             min_null_score = feature_null_score
                     except ValueError:
                         cls_index = -1
                         feature_null_score = -float('inf')
                 else:
                     cls_index = -1
                     feature_null_score = -float('inf')
                     if min_null_score is None or feature_null_score < min_null_score:
                         min_null_score = feature_null_score


            if not isinstance(start_logits, np.ndarray) or not isinstance(end_logits, np.ndarray) or start_logits.shape != end_logits.shape or start_logits.shape[0] != len(offset_mapping):
                 print(f"Warning: Logits or offset_mapping have inconsistent shapes for feature index {feature_index}. Skipping span predictions for this feature.")
                 continue


            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index < 0 or start_index >= len(offset_mapping)
                        or end_index < 0 or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    if cls_index != -1 and (start_index == cls_index or end_index == cls_index):
                         continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]

                    if start_char < 0 or end_char < 0 or start_char > end_char or end_char > len(context):
                         continue


                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        final_min_null_score = min_null_score if min_null_score is not None else -float('inf')


        if final_min_null_score > -float('inf') and (len(valid_answers) == 0 or final_min_null_score > max(ans["score"] for ans in valid_answers)):
             best_answer = {"text": "", "score": final_min_null_score}
        elif len(valid_answers) > 0:
             best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
             best_answer = {"text": "", "score": 0.0}


        predictions[qid] = best_answer["text"]


    return predictions


# Define prepare_validation_features function
def prepare_validation_features(examples, tokenizer, max_length, stride):
    questions = []
    contexts = []
    question_ids = []

    if 'paragraphs' not in examples or not isinstance(examples['paragraphs'], list):
        print("Error: Examples object does not contain 'paragraphs' list in expected format.")
        return {}

    for i in range(len(examples['paragraphs'])):
        paragraph_group = examples['paragraphs'][i]
        if not isinstance(paragraph_group, list):
             print(f"Warning: Item {i} in examples['paragraphs'] is not a list of paragraphs. Skipping.")
             continue

        for paragraph in paragraph_group:
            if 'context' not in paragraph or 'qas' not in paragraph or not isinstance(paragraph['qas'], list):
                 print(f"Warning: Paragraph is missing 'context' or 'qas' list. Skipping.")
                 continue

            context = paragraph['context']
            for qa in paragraph['qas']:
                 if 'question' not in qa or 'id' not in qa:
                     print(f"Warning: QA object is missing 'question' or 'id'. Skipping.")
                     continue
                 question = qa['question']
                 questions.append(question)
                 contexts.append(context)
                 question_ids.append(qa['id'])


    if not questions or not contexts:
        print("Warning: No valid questions/contexts found in examples.")
        return {
             'input_ids': [],
             'token_type_ids': [],
             'attention_mask': [],
             'overflow_to_sample_mapping': [],
             'offset_mapping': [],
             'example_id': []
        }


    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping", None)
    offset_mapping = tokenized_examples.pop("offset_mapping", None)

    if sample_mapping is None or offset_mapping is None:
         print("Error: Tokenization did not return required keys (overflow_to_sample_mapping or offset_mapping). Cannot proceed.")
         return {}


    tokenized_examples["offset_mapping"] = offset_mapping

    if len(sample_mapping) != len(tokenized_examples['input_ids']):
         print(f"Error: Mismatch between sample_mapping size ({len(sample_mapping)}) and tokenized features size ({len(tokenized_examples['input_ids'])}). Cannot map example_ids.")
         return {}

    tokenized_examples["example_id"] = [question_ids[i] for i in sample_mapping]


    return tokenized_examples

# --- End of copied code for functions ---


# Define the path where the final model with user's best params should be saved
final_model_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-final-best-user-params')
final_model_path = os.path.join(final_model_output_dir, 'final_model') # Assuming it's saved in 'final_model' subdirectory


print(f"Attempting to load model from: {final_model_path}")

# Check if the model path exists before attempting to load
if not os.path.exists(final_model_path):
    print(f"Error: Model path not found at {final_model_path}. Please ensure the training cell (cell 1217ed22) ran successfully and saved the model to this location.")
else:
    try:
        # Load the fine-tuned model and tokenizer
        model = AutoModelForQuestionAnswering.from_pretrained(final_model_path)
        print("Model loaded successfully.")

        # Assuming tokenizer is available from previous cells as 'tokenizer'
        if 'tokenizer' not in locals():
             print("Error: 'tokenizer' variable not found. Cannot proceed with evaluation.")
        else:
             # Load the original test dataset (since user requested test set evaluation)
             # Ensure 'dataset' variable is available from previous cells
             if 'dataset' in locals() and dataset is not None and 'test' in dataset and dataset['test'] is not None:
                  original_test_dataset = dataset['test'] # Use test dataset
                  print("Loaded original test dataset.")

                  # Prepare the test dataset features for evaluation
                  # Using the user-specified Max Length and Stride for feature prep (512, 128)
                  max_len_eval = 512
                  stride_eval = 128


                  test_features_dataset = original_test_dataset.map(
                      lambda examples: prepare_validation_features(examples, tokenizer, max_len_eval, stride_eval),
                      batched=True,
                      remove_columns=original_test_dataset.column_names
                  )
                  print("Prepared test features dataset.")

                  # Set up a dummy Trainer for prediction (evaluation logic is outside Trainer)
                  # Need TrainingArguments even for prediction
                  # Using a temporary directory for prediction output
                  prediction_output_dir = os.path.join(finetuning_path, 'prediction_output_temp_final_eval_test') # Use a unique temp dir name
                  os.makedirs(prediction_output_dir, exist_ok=True)

                  # Dummy TrainingArguments for prediction context
                  # Use the user-specified eval batch size (8)
                  prediction_args = TrainingArguments(
                      output_dir=prediction_output_dir,
                      per_device_eval_batch_size=8, # Use user-specified eval batch size (8)
                      report_to="none",
                      push_to_hub=False,
                  )


                  trainer = Trainer(
                      model=model,
                      args=prediction_args,
                      # tokenizer=tokenizer, # Removed this argument
                  )

                  print("Running prediction on test dataset...")
                  # Run prediction
                  raw_predictions = trainer.predict(test_features_dataset)
                  print("Prediction completed.")

                  # Apply post-processing using user-specified parameters
                  print("Applying post-processing with Max Answer Length=50, N-Best Size=20...")
                  user_max_ans_len = 50 # User specified
                  user_n_best = 20      # User specified


                  predictions = postprocess_qa_predictions(
                      original_test_dataset, # Pass the original dataset (test set)
                      test_features_dataset,    # Pass the tokenized features (test features)
                      raw_predictions,
                      n_best_size=user_n_best, # Use user-specified N-Best Size
                      max_answer_length=user_max_ans_len, # Use user-specified Max Answer Length
                      tokenizer=tokenizer # Pass tokenizer here as it's needed by postprocess_qa_predictions
                  )
                  print("Post-processing completed.")

                  # Compute metrics
                  print("Computing metrics...")
                  metric = evaluate.load("squad")
                  formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
                  references = []

                  # Prepare references (ensure original_test_dataset structure is handled)
                  if not isinstance(original_test_dataset, collections.abc.Iterable):
                      print(f"Error: Original test dataset is not iterable. Cannot prepare references.")
                  else:
                      for example in original_test_dataset:
                          if "paragraphs" in example:
                              for paragraph in example["paragraphs"]:
                                  if "qas" in paragraph:
                                      for qa in paragraph["qas"]:
                                          if "id" in qa and "answers" in qa:
                                              answers = qa["answers"]
                                              if isinstance(answers, list):
                                                  answer_texts = [ans.get("text", "") for ans in answers if isinstance(ans, dict)]
                                                  answer_starts = [ans.get("answer_start", -1) for ans in answers if isinstance(ans, dict)]
                                                  references.append({
                                                      "id": qa["id"],
                                                      "answers": {
                                                          "text": answer_texts,
                                                          "answer_start": answer_starts
                                                      }
                                                  })
                                              else:
                                                   print(f"Warning: Answers not in expected list format for QA ID {qa.get('id', 'N/A')}. Skipping.")
                                          else:
                                              print(f"Warning: QA object missing 'id' or 'answers'. Skipping QA.")
                                  else:
                                      print(f"Warning: Paragraph missing 'qas'. Skipping Paragraph.")
                          else:
                              print(f"Warning: Example missing 'paragraphs'. Skipping Example.")


                  # Ensure predictions and references are not empty before computing metrics
                  if formatted_predictions and references:
                      metrics = metric.compute(predictions=formatted_predictions, references=references)
                      print("\n--- Evaluation Results on Test Set (Final User's Best Params) ---")
                      print(f"Exact Match: {metrics['exact_match']:.2f}")
                      print(f"F1 Score: {metrics['f1']:.2f}")
                      print("-" * 50)
                  else:
                      print("\nWarning: No predictions or references available for metric computation.")

    except Exception as e:
        print(f"An error occurred during evaluation: {e}")
        print("Evaluation failed. Please check the error message and ensure the model was trained and saved correctly.")

# Clean up temporary prediction output directory
# import shutil
# if os.path.exists(prediction_output_dir):
#      shutil.rmtree(prediction_output_dir)
#      print(f"Cleaned up temporary prediction output directory: {prediction_output_dir}")

Attempting to load model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-final-best-user-params\final_model
Model loaded successfully.
Loaded original test dataset.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Prepared test features dataset.
Running prediction on test dataset...


Prediction completed.
Applying post-processing with Max Answer Length=50, N-Best Size=20...


Post-processing:   0%|          | 0/4152 [00:00<?, ?it/s]

Post-processing completed.
Computing metrics...

--- Evaluation Results on Test Set (Final User's Best Params) ---
Exact Match: 27.26
F1 Score: 54.90
--------------------------------------------------


### Error Analysis - Step 2: Match Predictions to Ground Truth

**Objective:** Load the original test dataset and the model's predictions from the final test set evaluation (cell `b1051eab`) and match them by question ID to prepare a structured list for analysis.

This step will create a list where each item contains the question, context, ground truth answers, and the model's prediction for a specific example.

In [None]:
import collections
import collections.abc # Ensure collections.abc is imported
import pandas as pd
from datasets import load_from_disk # Needed if reloading dataset

# Ensure dataset (containing the original test set) and predictions (from cell b1051eab) are available
# Example:
# dataset = load_from_disk(...) # Load your tokenized dataset containing original splits
# predictions = { ... } # This should be the dictionary output by cell b1051eab


print("--- Error Analysis - Matching Predictions to Ground Truth ---")

# Check if 'dataset' and 'predictions' are available
if 'dataset' not in locals() or dataset is None or 'test' not in dataset or dataset['test'] is None:
    print("Error: The 'dataset' variable (containing the test set) is not available or correctly loaded.")
    print("Please ensure you have run the cells that load and prepare the original dataset.")
elif 'predictions' not in locals() or not isinstance(predictions, dict):
    print("Error: The 'predictions' variable from the test set evaluation (cell b1051eab) is not available or is not a dictionary.")
    print("Please ensure cell b1051eab has run successfully and that the 'predictions' variable is accessible.")
else:
    try:
        original_test_dataset = dataset['test']
        print("Loaded original test dataset.")
        print(f"Number of examples in original test dataset: {len(original_test_dataset)}")
        print(f"Number of predictions available: {len(predictions)}")


        # Create a list to store matched data for analysis
        matched_data = []

        # Build a dictionary for easy lookup of ground truth answers by qid
        # Need to handle the structure of the original dataset ['paragraphs'][i]['qas'][j]['id'] and ['answers']
        ground_truth_answers_by_qid = {}
        if not isinstance(original_test_dataset, collections.abc.Iterable):
             print(f"Error: Original test dataset is not iterable. Cannot build ground truth mapping.")
        else:
             for example in original_test_dataset:
                 if "paragraphs" in example:
                     for paragraph in example["paragraphs"]:
                         if "qas" in paragraph:
                             for qa in paragraph["qas"]:
                                 if "id" in qa and "answers" in qa:
                                     # Ensure answers is a list of dictionaries
                                     answers = qa["answers"]
                                     if isinstance(answers, list):
                                         # Extract text and start for each answer
                                         ground_truth_answers = [{"text": ans.get("text", ""), "answer_start": ans.get("answer_start", -1)} for ans in answers if isinstance(ans, dict)]
                                         ground_truth_answers_by_qid[qa["id"]] = ground_truth_answers
                                     else:
                                          print(f"Warning: Answers not in expected list format for QA ID {qa.get('id', 'N/A')}. Skipping ground truth for this QA.")
                                 else:
                                     print(f"Warning: QA object missing 'id' or 'answers'. Skipping QA.")
                         else:
                             print(f"Warning: Paragraph missing 'qas'. Skipping Paragraph.")
                 else:
                     print(f"Warning: Example missing 'paragraphs'. Skipping Example.")

        print(f"Built ground truth mapping for {len(ground_truth_answers_by_qid)} question IDs.")


        # Match predictions with ground truth and original data
        # Iterate through the predictions dictionary as it's keyed by qid
        for qid, prediction_text in tqdm(predictions.items(), desc="Matching predictions and ground truth"):
            # Find the corresponding ground truth answers
            ground_truth_answers = ground_truth_answers_by_qid.get(qid, [])

            # We need the original question and context as well for analysis
            # This requires iterating through the original dataset again or having a qid -> example/paragraph map
            # Let's build a map for faster lookup
            original_qa_info = None
            # This part is inefficient, ideally the dataset would be indexed differently or we use the features
            # For simplicity here, we'll search, but a more robust approach would preprocess this.
            # A faster way is to use the features dataset's example_id mapping back to original dataset,
            # but the original dataset structure needs careful handling.

            # Let's try to find the question and context by iterating through the original dataset
            question_text = "N/A"
            context_text = "N/A"
            found_qa = False

            if isinstance(original_test_dataset, collections.abc.Iterable):
                 for example in original_test_dataset:
                     if found_qa: break
                     if "paragraphs" in example:
                         for paragraph in example["paragraphs"]:
                             if "qas" in paragraph:
                                 for qa in paragraph["qas"]:
                                     if "id" in qa and qa["id"] == qid:
                                         question_text = qa.get("question", "N/A")
                                         context_text = paragraph.get("context", "N/A")
                                         found_qa = True
                                         break # Found the QA

            if not found_qa:
                 print(f"Warning: Could not find original question/context for QA ID {qid}. Skipping.")
                 continue


            matched_data.append({
                "qid": qid,
                "question": question_text,
                "context": context_text,
                "ground_truth_answers": ground_truth_answers, # List of dicts {"text": ..., "answer_start": ...}
                "prediction": prediction_text
            })

        print(f"Successfully matched data for {len(matched_data)} examples.")

        # You can now inspect 'matched_data' for error analysis
        # For example, display the first few entries:
        # display(pd.DataFrame(matched_data).head()) # Requires pandas

        # Or print the first few for inspection
        # print("\nFirst 5 matched examples:")
        # for i, item in enumerate(matched_data[:5]):
        #      print(f"\nQID: {item['qid']}")
        #      print(f"Question: {item['question']}")
        #      print(f"Context: {item['context'][:200]}...") # Print first 200 chars of context
        #      print(f"Ground Truth Answers: {item['ground_truth_answers']}")
        #      print(f"Prediction: {item['prediction']}")


    except Exception as e:
        print(f"An error occurred during matching predictions and ground truth: {e}")
        print("Matching process failed.")

# The 'matched_data' variable now contains the structured data for error analysis (Step 2 complete).
# Next steps involve analyzing this data (Step 3 onwards).

--- Error Analysis - Matching Predictions to Ground Truth ---
Loaded original test dataset.
Number of examples in original test dataset: 20
Number of predictions available: 4152
Built ground truth mapping for 4152 question IDs.


Matching predictions and ground truth:   0%|          | 0/4152 [00:00<?, ?it/s]

Successfully matched data for 4152 examples.


### Error Analysis - Step 3: Identify Correct and Incorrect Predictions

**Objective:** Iterate through the `matched_data` prepared in Step 2, compare each model prediction to the ground truth answer(s), and determine if the prediction is exactly correct (Exact Match) or has partial overlap (contributing to F1). Add this correctness status to each entry.

This step makes it easy to filter and analyze correct vs. incorrect predictions.

In [None]:
import collections
import string
import re
from tqdm.auto import tqdm

# Ensure matched_data is available from Step 2 (cell 045de0df)

print("--- Error Analysis - Identifying Correct and Incorrect Predictions ---")

# Check if 'matched_data' is available
if 'matched_data' not in locals() or not isinstance(matched_data, list):
    print("Error: The 'matched_data' variable from Step 2 (cell 045de0df) is not available or is not a list.")
    print("Please ensure cell 045de0df has run successfully.")
else:
    try:
        # Helper functions for SQuAD-like comparison (Exact Match and F1)
        def normalize_answer(s):
            """Lower text and remove punctuation, articles and extra whitespace."""
            def remove_articles(text):
                return re.sub(r'\b(a|an|the)\b', ' ', text)
            def white_space_fix(text):
                return ' '.join(text.split())
            def remove_punc(text):
                exclude = set(string.punctuation)
                return ''.join(ch for ch in text if ch not in exclude)
            def lower(text):
                return text.lower()
            return white_space_fix(remove_articles(remove_punc(lower(s))))

        def get_tokens(s):
            if not s: return []
            return normalize_answer(s).split()

        def compute_exact(a_gold, a_pred):
            return int(normalize_answer(a_gold) == normalize_answer(a_pred))

        def compute_f1(a_gold, a_pred):
            gold_toks = get_tokens(a_gold)
            pred_toks = get_tokens(a_pred)
            common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
            num_common = sum(common.values())
            if len(gold_toks) == 0 and len(pred_toks) == 0:
                return 1.0
            if len(gold_toks) == 0 or len(pred_toks) == 0:
                return 0.0
            precision = num_common / len(pred_toks)
            recall = num_common / len(gold_toks)
            return 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0


        # Iterate through matched_data and determine correctness
        for item in tqdm(matched_data, desc="Identifying correctness"):
            prediction = item["prediction"]
            ground_truth_answers = item["ground_truth_answers"] # This is a list of {"text": ..., "answer_start": ...}

            # Determine Exact Match (EM) - prediction must exactly match *one* of the ground truth answers
            is_exact_match = False
            if ground_truth_answers:
                is_exact_match = max(compute_exact(gt["text"], prediction) for gt in ground_truth_answers) == 1

            # Determine F1 Score (using the best F1 score across all ground truth answers)
            f1_score = 0.0
            if ground_truth_answers:
                 f1_score = max(compute_f1(gt["text"], prediction) for gt in ground_truth_answers)

            # Add correctness status to the item
            item["is_exact_match"] = is_exact_match
            item["f1_score"] = f1_score # Store the F1 score for potential partial credit analysis

            # Optional: Add a simple status like 'correct', 'partial', 'incorrect'
            if is_exact_match:
                 item["correctness_status"] = "exact_match"
            elif f1_score > 0: # Consider any overlap as partial credit
                 item["correctness_status"] = "partial_match"
            else:
                 item["correctness_status"] = "incorrect"


        print(f"Correctness identification complete for {len(matched_data)} examples.")

        # You can now filter matched_data based on "is_exact_match", "f1_score", or "correctness_status"
        # For example, to see incorrect predictions:
        # incorrect_predictions = [item for item in matched_data if item["correctness_status"] == "incorrect"]
        # print(f"\nNumber of incorrect predictions: {len(incorrect_predictions)}")

        # Or to see partial matches:
        # partial_matches = [item for item in matched_data if item["correctness_status"] == "partial_match"]
        # print(f"Number of partial matches: {len(partial_matches)}")


    except Exception as e:
        print(f"An error occurred during correctness identification: {e}")
        print("Correctness identification failed.")

# The 'matched_data' list is now updated with correctness information (Step 3 complete).
# Next steps involve analyzing these results and categorizing errors (Step 4 onwards).

--- Error Analysis - Identifying Correct and Incorrect Predictions ---


Identifying correctness:   0%|          | 0/4152 [00:00<?, ?it/s]

Correctness identification complete for 4152 examples.


### Error Analysis - Step 4: Categorize Errors (Inspection)

**Objective:** Filter the `matched_data` based on correctness status and inspect samples of incorrect and partial predictions to identify common error categories. This is a manual analysis step aided by the code below.

The code will display counts and samples of predictions that were not exact matches.

In [None]:
import pandas as pd

# Ensure matched_data is available and has been processed in Step 3 (cell 18c7766b)

print("--- Error Analysis - Inspecting Incorrect and Partial Predictions ---")

# Check if 'matched_data' is available and has correctness status
if ('matched_data' not in locals() or not isinstance(matched_data, list) or
    not all('correctness_status' in item for item in matched_data)):
    print("Error: 'matched_data' is not available or missing correctness status.")
    print("Please ensure Step 2 (cell 045de0df) and Step 3 (cell 18c7766b) have run successfully.")
else:
    try:
        # Filter data based on correctness status
        incorrect_predictions = [item for item in matched_data if item["correctness_status"] == "incorrect"]
        partial_matches = [item for item in matched_data if item["correctness_status"] == "partial_match"]
        exact_matches = [item for item in matched_data if item["correctness_status"] == "exact_match"]

        print(f"Total examples analyzed: {len(matched_data)}")
        print(f"Number of Exact Matches: {len(exact_matches)}")
        print(f"Number of Partial Matches (F1 > 0, EM = 0): {len(partial_matches)}")
        print(f"Number of Incorrect Predictions (F1 = 0, EM = 0): {len(incorrect_predictions)}")


        # --- Display Samples for Manual Inspection ---

        sample_size = 15 # Number of samples to display for each category

        print(f"\n--- Sample of {sample_size} Partial Matches ---")
        if partial_matches:
            # Create a list of dictionaries with relevant info for display
            partial_display_data = []
            for item in partial_matches[:sample_size]:
                partial_display_data.append({
                    "QID": item["qid"],
                    "Question": item["question"],
                    "Context Snippet": item["context"][:150] + "..." if len(item["context"]) > 150 else item["context"],
                    "Ground Truth Answers": [gt["text"] for gt in item["ground_truth_answers"]],
                    "Prediction": item["prediction"],
                    "F1 Score": item["f1_score"],
                    "Status": item["correctness_status"]
                })
            # Display as a Pandas DataFrame for readability
            display(pd.DataFrame(partial_display_data))
        else:
            print("No partial matches to display.")

        print(f"\n--- Sample of {sample_size} Incorrect Predictions ---")
        if incorrect_predictions:
            # Create a list of dictionaries with relevant info for display
            incorrect_display_data = []
            for item in incorrect_predictions[:sample_size]:
                 incorrect_display_data.append({
                     "QID": item["qid"],
                     "Question": item["question"],
                     "Context Snippet": item["context"][:150] + "..." if len(item["context"]) > 150 else item["context"],
                     "Ground Truth Answers": [gt["text"] for gt in item["ground_truth_answers"]],
                     "Prediction": item["prediction"],
                     "F1 Score": item["f1_score"], # Should be 0 for incorrect
                     "Status": item["correctness_status"]
                 })
            # Display as a Pandas DataFrame for readability
            display(pd.DataFrame(incorrect_display_data))
        else:
            print("No incorrect predictions to display.")


        print("\n--- Instructions for Manual Categorization ---")
        print("Review the samples above (and if needed, inspect more examples from the 'incorrect_predictions' and 'partial_matches' lists directly).")
        print("Look for recurring themes in the questions, contexts, ground truth answers, and the model's predictions.")
        print("Common error categories in QA might include:")
        print(" - Predicting only part of the answer.")
        print(" - Predicting extra text not part of the answer.")
        print(" - Off-by-one errors (start/end index slightly wrong).")
        print(" - Misinterpreting negation or conditional statements.")
        print(" - Struggling with complex sentence structures.")
        print(" - Difficulty with specific types of entities (dates, numbers, legal terms).")
        print(" - Predicting plausible but incorrect answers.")
        print(" - Failure to answer when an answer exists (or predicting an answer when none exists).")
        print("\nBased on your review, identify the most prominent error types.")

    except Exception as e:
        print(f"An error occurred while preparing or displaying samples for error analysis: {e}")
        print("Sample display failed.")

# After manually reviewing the samples, you will categorize the errors (Manual Step 4 complete).
# Step 5 (Optional) is quantifying these categories, and Step 6 is summarizing findings.

--- Error Analysis - Inspecting Incorrect and Partial Predictions ---
Total examples analyzed: 4152
Number of Exact Matches: 1132
Number of Partial Matches (F1 > 0, EM = 0): 2514
Number of Incorrect Predictions (F1 = 0, EM = 0): 506

--- Sample of 15 Partial Matches ---


Unnamed: 0,QID,Question,Context Snippet,Ground Truth Answers,Prediction,F1 Score,Status
0,knyp7n1i9r35ci82,Does the company collect user's information di...,"New Orleans Tourism Marketing Corporation (""NO...",[access and use of the NOTMC Websites (as defi...,access and use of the NOTMC Websites (as defin...,0.764706,partial_match
1,6isrs6pl65f7ueuf,Will they use the data collected from me?,"New Orleans Tourism Marketing Corporation (""NO...",[our personal information collection practices],The Privacy Policy applies to your access and ...,0.064516,partial_match
2,qnabo06neuot52m1,Do you receive information from other sources?,"New Orleans Tourism Marketing Corporation (""NO...","[other online programs (""Online Services"")]",access and use of the NOTMC Websites (as defin...,0.384615,partial_match
3,5aas5r299xuwlzvx,Do they have provisions for audiences from cou...,NOTMC controls and operates its business and O...,[NOTMC controls and operates its business and ...,By submitting your personal information to us ...,0.666667,partial_match
4,9lazykcok4u5zm9a,What types of audiences the policy segment ref...,NOTMC controls and operates its business and O...,[NOTMC controls and operates its business and ...,By submitting your personal information to us ...,0.666667,partial_match
5,52rtzy27zze3ks7w,Do you notify users about policy changes?,"We may change our Privacy Policy, so please ch...","[We may change our Privacy Policy, so please c...",You can tell when the Privacy Policy was last ...,0.153846,partial_match
6,mj7f4m2r0onkqq2r,What do California residents notice?,You can jump to specific areas of our Privacy ...,[California Users'],California,0.666667,partial_match
7,aamvuhgdcero7qr9,Do they have provisions for audiences from cou...,You can jump to specific areas of our Privacy ...,[International],Children's Online Privacy International Contac...,0.285714,partial_match
8,2hpsrs4iw2flwg0h,Does the privacy policy mention anything about...,You can jump to specific areas of our Privacy ...,[Children's],Children's Online Privacy International Contac...,0.285714,partial_match
9,9wo0x1erkfe981ea,Do you provide choices to the users with regar...,Information We Collect Whether you access our ...,[Whether you access our Online Services from y...,you consent to providing it,0.181818,partial_match



--- Sample of 15 Incorrect Predictions ---


Unnamed: 0,QID,Question,Context Snippet,Ground Truth Answers,Prediction,F1 Score,Status
0,kyowf12dqvesnsrq,What scope does the user choice or control app...,Information We Collect Whether you access our ...,"[(for account administration, administration o...",you consent to providing it,0,incorrect
1,2380ylr0y3mnjp0n,Does the company collect user's information di...,Information We Collect Whether you access our ...,[Whether you access our Online Services from y...,NOTMC and its agents may collect,0,incorrect
2,ns2ozhnummwucojj,"Do you collect or use my information? If yes, ...",Information We Collect Whether you access our ...,"[name, mailing address, telephone number, e-ma...",geolocation,0,incorrect
3,ue9mlyeepok2nm5n,What type of information about me does the web...,Information We Collect Whether you access our ...,"[additional personal information, that identif...","device ID, including IP address",0,incorrect
4,vwkj3wo0bolsz46w,What types of user profile information does th...,Information We Collect Whether you access our ...,[user name and password],geolocation,0,incorrect
5,khfm33jif9e1eq4r,How do you collect my information for use?,"In addition, we may collect other types of dat...","[provided by you, information provided by you]",automatically,0,incorrect
6,uglo32dkucyzyrjq,Does the company collect user's information di...,"In addition, we may collect other types of dat...","[users of our Online Services, when users requ...",we may collect,0,incorrect
7,qge9m89pnqb6455p,"Will they collect my personal info? If yes, wh...",allow you to send communications or gifts (inc...,[(conditioned on your representation to us tha...,communications or gifts,0,incorrect
8,lc8gxnzhcwsov2fr,Do I have choice for my data collected or used...,To Perform Services You Request We may disclos...,"[services you request, such as fulfillment of ...",disclose,0,incorrect
9,0ftgtxs6cso575t1,Do other third-party companies see my exact lo...,To Perform Services You Request We may disclos...,[Personal Information],We may disclose,0,incorrect



--- Instructions for Manual Categorization ---
Review the samples above (and if needed, inspect more examples from the 'incorrect_predictions' and 'partial_matches' lists directly).
Look for recurring themes in the questions, contexts, ground truth answers, and the model's predictions.
Common error categories in QA might include:
 - Predicting only part of the answer.
 - Predicting extra text not part of the answer.
 - Off-by-one errors (start/end index slightly wrong).
 - Misinterpreting negation or conditional statements.
 - Struggling with complex sentence structures.
 - Difficulty with specific types of entities (dates, numbers, legal terms).
 - Predicting plausible but incorrect answers.
 - Failure to answer when an answer exists (or predicting an answer when none exists).

Based on your review, identify the most prominent error types.


In [None]:
import pandas as pd

# Assuming 'comparison_df' is available from cell f42b6067 and dd480c26
# This DataFrame should contain 'id', 'prediction_text', 'reference_answers',
# 'exact_match', 'f1', 'prediction_length', 'first_reference_length', and 'length_difference'

if 'comparison_df' in locals() and not comparison_df.empty:
    print("--- Quantifying Errors ---")

    # Filter for incorrect predictions (Exact Match = 0)
    incorrect_predictions_df = comparison_df[comparison_df['exact_match'] == 0].copy()
    incorrect_predictions_df['length_difference'] = incorrect_predictions_df['prediction_length'] - incorrect_predictions_df['first_reference_length']
    # --- Error Type 1: Answer Span Not Long Enough ---
    # We can use the 'length_difference' column (reference_length - prediction_length)
    # A positive length_difference indicates the prediction is shorter than the reference.

    # Quantify predictions that are shorter than the reference
    predictions_shorter_than_reference = incorrect_predictions_df[incorrect_predictions_df['length_difference'] > 0]
    num_predictions_shorter = len(predictions_shorter_than_reference)
    percentage_predictions_shorter = (num_predictions_shorter / len(incorrect_predictions_df)) * 100 if len(incorrect_predictions_df) > 0 else 0

    print(f"\nError Type: Predicted Answer Span Too Short")
    print(f"Number of incorrect predictions where the predicted answer is shorter than the first reference answer: {num_predictions_shorter}")
    print(f"Percentage of incorrect predictions where the predicted answer is shorter than the first reference answer: {percentage_predictions_shorter:.2f}%")

    # You can also look at the average length difference for these cases
    avg_length_difference_shorter = predictions_shorter_than_reference['length_difference'].mean() if num_predictions_shorter > 0 else 0
    print(f"Average length difference (Reference - Prediction) for these cases: {avg_length_difference_shorter:.2f} tokens")

    # Optionally, look at predictions that are significantly shorter (e.g., > 5 tokens shorter)
    min_length_difference_threshold = 5
    predictions_significantly_shorter = incorrect_predictions_df[incorrect_predictions_df['length_difference'] > min_length_difference_threshold]
    num_predictions_significantly_shorter = len(predictions_significantly_shorter)
    percentage_predictions_significantly_shorter = (num_predictions_significantly_shorter / len(incorrect_predictions_df)) * 100 if len(incorrect_predictions_df) > 0 else 0

    print(f"\nNumber of incorrect predictions where the predicted answer is more than {min_length_difference_threshold} tokens shorter than the first reference answer: {num_predictions_significantly_shorter}")
    print(f"Percentage of incorrect predictions where the predicted answer is more than {min_length_difference_threshold} tokens shorter than the first reference answer: {percentage_predictions_significantly_shorter:.2f}%")


    # --- Error Type 2: Dealing with Complex Sentence Structures ---
    # This is harder to quantify automatically without more sophisticated analysis of the text.
    # Based on manual inspection, we can discuss observations.

    print(f"\nError Type: Difficulty with Complex Sentence Structures")
    print("Quantifying this error type automatically is challenging and requires deeper linguistic analysis.")
    print("Based on manual inspection of incorrect and partial matches, common patterns observed might include:")
    print("- Misinterpreting questions with negation (e.g., 'Do you NOT share data with...').")
    print("- Struggling with questions requiring reasoning across multiple sentences or clauses within the context.")
    print("- Failing to identify the correct answer span in long, convoluted sentences.")
    print("- Incorrectly identifying the scope of conditions or exceptions mentioned in the text.")
    print("Further manual analysis of a larger sample of errors, potentially categorized by question type or sentence complexity metrics, would be needed for more precise quantification.")


else:
    print("The 'comparison_df' DataFrame is not available or is empty. Cannot perform error quantification.")

--- Quantifying Errors ---

Error Type: Predicted Answer Span Too Short
Number of incorrect predictions where the predicted answer is shorter than the first reference answer: 1282
Percentage of incorrect predictions where the predicted answer is shorter than the first reference answer: 48.23%
Average length difference (Reference - Prediction) for these cases: 6.98 tokens

Number of incorrect predictions where the predicted answer is more than 5 tokens shorter than the first reference answer: 608
Percentage of incorrect predictions where the predicted answer is more than 5 tokens shorter than the first reference answer: 22.87%

Error Type: Difficulty with Complex Sentence Structures
Quantifying this error type automatically is challenging and requires deeper linguistic analysis.
Based on manual inspection of incorrect and partial matches, common patterns observed might include:
- Misinterpreting questions with negation (e.g., 'Do you NOT share data with...').
- Struggling with questions

## Identify test set performance

### Subtask:
Extract the Exact Match and F1 scores from the output of the last test set evaluation (cell `b1051eab`).


## Load the best model

### Subtask:
Load the fine-tuned model from the saved path (`legal-bert-small-uncased-qa-final-best-user-params/final_model`).


In [None]:
import os
from transformers import AutoModelForQuestionAnswering

# Define the path where the final model with user's best params should be saved
final_model_output_dir = os.path.join(finetuning_path, 'legal-bert-small-uncased-qa-final-best-user-params')
final_model_path = os.path.join(final_model_output_dir, 'final_model') # Assuming it's saved in 'final_model' subdirectory

print(f"Attempting to load model from: {final_model_path}")

# Check if the model path exists before attempting to load
if not os.path.exists(final_model_path):
    print(f"Error: Model path not found at {final_model_path}. Please ensure the training cell (cell 1217ed22) ran successfully and saved the model to this location.")
    best_model = None # Set model to None if not found
else:
    try:
        # Load the fine-tuned model
        best_model = AutoModelForQuestionAnswering.from_pretrained(final_model_path)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"An error occurred during model loading: {e}")
        best_model = None # Set model to None if loading fails

Attempting to load model from: C:/Users/user/PLUE/PLUE-main/data\legal-bert-small-uncased-qa-final-best-user-params\final_model
Model loaded successfully.


## Prepare data for embedding

### Subtask:
Load the data you want to embed (e.g., the contexts from your dataset).


In [None]:
contexts = []
if 'dataset' in locals() and dataset is not None and 'test' in dataset and dataset['test'] is not None:
    original_test_dataset = dataset['test']
    print("Accessing original test dataset to extract contexts.")
    # The original dataset has a nested structure: dataset['split']['paragraphs'][i]['context']
    # We need to iterate through this structure to get all unique contexts.
    seen_contexts = set()
    for example in original_test_dataset:
        if "paragraphs" in example and isinstance(example["paragraphs"], list):
            for paragraph in example["paragraphs"]:
                if "context" in paragraph and isinstance(paragraph["context"], str):
                    context = paragraph["context"]
                    # Add context only if not already added to avoid duplicates
                    if context not in seen_contexts:
                        contexts.append(context)
                        seen_contexts.add(context)

    print(f"Extracted {len(contexts)} unique contexts from the test dataset.")
else:
    print("Error: Original dataset variable 'dataset' or 'dataset['test']' not found or is None. Cannot extract contexts.")

# The 'contexts' list now contains the unique contexts from the test set.
# This data is ready to be embedded.

Accessing original test dataset to extract contexts.
Extracted 497 unique contexts from the test dataset.


## Prepare Dev Data for Embedding

### Subtask:
Load the contexts from the dev split of your dataset.

In [None]:
contexts = []
if 'dataset' in locals() and dataset is not None and 'dev' in dataset and dataset['dev'] is not None:
    original_dev_dataset = dataset['dev']
    print("Accessing original dev dataset to extract contexts.")
    # The original dataset has a nested structure: dataset['split']['paragraphs'][i]['context']
    # We need to iterate through this structure to get all unique contexts.
    seen_contexts = set()
    for example in original_dev_dataset:
        if "paragraphs" in example and isinstance(example["paragraphs"], list):
            for paragraph in example["paragraphs"]:
                if "context" in paragraph and isinstance(paragraph["context"], str):
                    context = paragraph["context"]
                    # Add context only if not already added to avoid duplicates
                    if context not in seen_contexts:
                        contexts.append(context)
                        seen_contexts.add(context)

    print(f"Extracted {len(contexts)} unique contexts from the dev dataset.")
else:
    print("Error: Original dataset variable 'dataset' or 'dataset['dev']' not found or is None. Cannot extract contexts.")

# The 'contexts' list now contains the unique contexts from the dev set.
# This data is ready to be embedded.

Accessing original dev dataset to extract contexts.
Extracted 574 unique contexts from the dev dataset.


In [None]:
import torch

if 'best_model' in locals() and best_model is not None and 'tokenizer' in locals() and tokenizer is not None and 'contexts' in locals() and contexts:
    print(f"Generating embeddings for {len(contexts)} contexts using the fine-tuned model...")

    # Use the fine-tuned model directly for generating embeddings.
    # We will extract the last hidden states before the QA head.
    # Set the model to evaluation mode
    best_model.eval()

    embeddings = []
    # Process contexts in batches to manage memory
    batch_size = 16 # You can adjust this based on your GPU memory
    for i in tqdm(range(0, len(contexts), batch_size), desc="Generating Embeddings"):
        batch_contexts = contexts[i:i + batch_size]

        # Tokenize the batch of contexts
        encoded_input = tokenizer(batch_contexts, padding=True, truncation=True, return_tensors='pt', max_length=512) # Using max_length 512 as in training

        # Move tensors to the same device as the model
        device = best_model.device # Get the device the model is on
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        # Generate outputs from the fine-tuned model without computing gradients
        with torch.no_grad():
            # Pass the encoded input through the model.
            # For AutoModelForQuestionAnswering, the output is typically a tuple or dictionary
            # containing logits, and potentially hidden states if output_hidden_states=True
            # We need the hidden states from the final layer of the transformer encoder.
            # Let's call the model directly.
            outputs = best_model(**encoded_input, output_hidden_states=True, return_dict=True)

            # Extract the last hidden states (output of the final transformer layer)
            # outputs.hidden_states is a tuple of hidden states for each layer + embedding layer
            # The last element is the output of the final layer
            last_hidden_states = outputs.hidden_states[-1]


        # Perform pooling (e.g., mean pooling) to get sentence embeddings from the last hidden states
        # Mean pooling: average token embeddings, masking out padding tokens
        input_mask = encoded_input['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_hidden_states * input_mask, 1)
        sum_mask = torch.clamp(input_mask.sum(1), min=1e-9)
        mean_pooled_embeddings = sum_embeddings / sum_mask

        embeddings.extend(mean_pooled_embeddings.cpu().numpy()) # Move to CPU and convert to numpy

    print(f"Generated {len(embeddings)} embeddings, each with dimension {embeddings[0].shape[0]}.")

    # The 'embeddings' list now contains numpy arrays representing the embeddings for each context.
    # You can store these embeddings and their corresponding contexts for the RAG system.

elif 'best_model' not in locals() or best_model is None:
    print("Error: The 'best_model' is not loaded. Please ensure cell ef60067e ran successfully.")
elif 'tokenizer' not in locals() or tokenizer is None:
    print("Error: The 'tokenizer' is not available. Please ensure it was loaded in a previous cell.")
elif 'contexts' not in locals() or not contexts:
     print("Error: No contexts available for embedding. Please ensure cell 349fa9e6 ran successfully and extracted contexts.")

Generating embeddings for 574 contexts using the fine-tuned model...


Generating Embeddings:   0%|          | 0/36 [00:00<?, ?it/s]

Generated 574 embeddings, each with dimension 512.


#### Step 1: Set up FAISS Index

We will use the `faiss-cpu` library to create an in-memory vector index from the context embeddings.

In [None]:
# Install faiss-cpu if you haven't already
%pip install faiss-cpu

import faiss
import numpy as np

# Assuming 'embeddings' are available from cell 715db90c
# Assuming 'contexts' are available from cell 349fa9e6

if 'embeddings' in locals() and embeddings is not None and len(embeddings) > 0 and 'contexts' in locals() and contexts is not None:
    # Ensure embeddings are a numpy array and in the correct format (float32)
    embeddings_np = np.array(embeddings).astype('float32')
    dimension = embeddings_np.shape[1] # Dimension of the embeddings

    print(f"\nCreating FAISS index with {len(embeddings_np)} vectors of dimension {dimension}...")

    # Create a FAISS index
    # Using IndexFlatL2 for L2 distance (Euclidean distance) similarity search
    index = faiss.IndexFlatL2(dimension)

    # Add the embeddings to the index
    index.add(embeddings_np)

    print(f"FAISS index created and populated with {index.ntotal} vectors.")
    # The 'index' object now contains the indexed context embeddings.

elif 'embeddings' not in locals() or embeddings is None or len(embeddings) == 0:
    print("Error: Context embeddings are not available or are empty. Please ensure cell 715db90c ran successfully.")
elif 'contexts' not in locals() or contexts is None:
    print("Error: Contexts list is not available. Please ensure cell 349fa9e6 ran successfully.")

Note: you may need to restart the kernel to use updated packages.

Creating FAISS index with 574 vectors of dimension 512...
FAISS index created and populated with 574 vectors.


#### Step 3: Perform Similarity Search (Regenerated)

Use the embedded user question to query the FAISS index and find the indices of the most similar context embeddings.

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Assuming 'best_model' and 'tokenizer' are loaded from previous cells (e.g., ef60067e)
# Assuming 'index' is the FAISS index created in cell c9e1dc6d
# Assuming 'contexts' are available from a previous cell (e.g., 349fa9e6)

if 'best_model' in locals() and best_model is not None and 'tokenizer' in locals() and tokenizer is not None and 'index' in locals() and index is not None and 'contexts' in locals() and contexts:
    user_question = "What data do you collect about me?" # Example user question - can be replaced with actual user input

    print(f"\nEmbedding the user question for similarity search: '{user_question}'...")

    # Ensure model is in evaluation mode
    best_model.eval()

    # Use the fine-tuned model directly for question embedding.
    # We will extract the last hidden states and perform mean pooling.

    # Tokenize the question
    encoded_question = tokenizer(user_question, padding=True, truncation=True, return_tensors='pt', max_length=512).to(best_model.device) # Move to device


    # Generate embedding
    with torch.no_grad():
        # Pass the encoded input through the model.
        # We need the hidden states from the final layer of the transformer encoder.
        outputs = best_model(**encoded_question, output_hidden_states=True, return_dict=True)

        # Extract the last hidden states (output of the final transformer layer)
        last_hidden_states = outputs.hidden_states[-1]

        # Perform pooling (e.g., mean pooling) to get sentence embeddings
        # Mean pooling: average token embeddings, masking out padding tokens
        input_mask = encoded_question['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_hidden_states * input_mask, 1)
        sum_mask = torch.clamp(input_mask.sum(1), min=1e-9)
        question_embedding = (sum_embeddings / sum_mask).cpu().numpy() # Get numpy array


    print("User question embedded.")

    # Perform similarity search
    k = 5 # Number of top contexts to retrieve
    print(f"\nSearching for the top {k} most similar contexts...")

    # Reshape query embedding for FAISS (needs to be a 2D array)
    question_embedding = question_embedding.reshape(1, -1)

    # Perform the search
    distances, indices = index.search(question_embedding, k)

    print("Search complete. Retrieved indices:")
    print(indices)

    # Retrieve the actual contexts based on the indices
    retrieved_contexts = [contexts[i] for i in indices[0]] # indices is a 2D array, take the first row


    print("\nRetrieved Contexts:")
    for i, context in enumerate(retrieved_contexts):
        print(f"--- Context {i+1} ---")
        print(context[:500] + "...") # Print first 500 characters as a snippet
        print("-" * 20)

elif 'best_model' not in locals() or best_model is None:
    print("Error: The 'best_model' is not loaded. Cannot embed the question or perform search.")
elif 'tokenizer' not in locals() or tokenizer is None:
    print("Error: The 'tokenizer' is not available. Cannot embed the question or perform search.")
elif 'index' not in locals() or index is None:
     print("Error: The FAISS 'index' is not available. Please ensure cell c9e1dc6d ran successfully.")
elif 'contexts' not in locals() or not contexts:
     print("Error: No contexts available. Please ensure cell 349fa9e6 ran successfully.")


Embedding the user question for similarity search: 'What data do you collect about me?'...
User question embedded.

Searching for the top 5 most similar contexts...
Search complete. Retrieved indices:
[[384 112  32 312 296]]

Retrieved Contexts:
--- Context 1 ---
We may share information about you with Internet Brands' subsidiaries and affiliates....
--------------------
--- Context 2 ---
Last Updated: September 25, 2014...
--------------------
--- Context 3 ---
We are TRUSTe certified....
--------------------
--- Context 4 ---
Last Updated: September 1, 2013...
--------------------
--- Context 5 ---
Last Updated: December 1st, 2014...
--------------------


#### Step 4: Implement Generation with GPT-4.1-nano

Use the retrieved contexts and the user's question to generate an answer using OpenAI's `gpt-4.1-nano-2025-04-14`.

**Reasoning**:
Format the retrieved contexts and the user question into a prompt and use the `openai` library to interact with the `gpt-4.1-nano-2025-04-14` model to generate a response.

In [None]:
# Install the openai library if you haven't already

import os
from openai import OpenAI

# Assuming 'retrieved_contexts' are available from cell 0c96c6f8
# Assuming 'user_question' is available from cell 0c96c6f8

# Access your OpenAI API key securely from Colab secrets
# You need to add your OpenAI API key to Colab secrets named 'OPENAI_API_KEY'
openai_api_key = ''

if openai_api_key is None:
    print("Error: OpenAI API key not found in Colab secrets. Please add it as 'OPENAI_API_KEY'.")
elif 'retrieved_contexts' not in locals() or not retrieved_contexts:
    print("Error: Retrieved contexts are not available. Please ensure cell 0c96c6f8 ran successfully.")
elif 'user_question' not in locals() or not user_question:
    print("Error: User question is not available. Please ensure cell 0c96c6f8 ran successfully.")
else:
    try:
        # Initialize the OpenAI client
        client = OpenAI(api_key=openai_api_key)

        # Combine the retrieved contexts into a single string
        contexts_text = "\n---\n".join(retrieved_contexts)

        # Create the prompt for the language model
        # Instruct the model to answer based *only* on the provided contexts
        prompt = f"""Answer the following question based only on the provided context. If you cannot find the answer in the context, say "I cannot find the answer in the provided information."

Context:
{contexts_text}

Question:
{user_question}

Answer:
"""

        print("Sending prompt to GPT-4.1-nano...")
        # Call the OpenAI API
        response = client.chat.completions.create(
            model="gpt-4.1-nano-2025-04-14", # Specify the model
            messages=[
                {"role": "system", "content": "You are an expert in legal domain that answers questions based on provided context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0, # Use a low temperature for more deterministic answers based on context
        )

        # Extract and display the generated answer
        generated_answer = response.choices[0].message.content

        print("\n--- Generated Answer ---")
        print(generated_answer)
        print("-" * 20)

    except Exception as e:
        print(f"An error occurred during the OpenAI API call: {e}")
        print("Please check your API key, model name, and network connection.")

Sending prompt to GPT-4.1-nano...

--- Generated Answer ---
I cannot find the answer in the provided information.
--------------------


# Task
Evaluate the RAG system with the fine-tuned legal-bert-small embedding model, FAISS index, and gpt-4.1-nano-2025-04-14 generative model on the dev set, then explore fine-tuning the generative model, and finally evaluate the system on the test set.

## Evaluate rag on dev set

### Subtask:
Iterate through the questions in the dev set. For each question, perform retrieval using the fine-tuned Legal-BERT model and the FAISS index to get relevant contexts. Use these contexts and the question to prompt the GPT-4.1-nano model and generate an answer. Collect the generated answers.


In [None]:
# Assuming dataset is loaded from a previous cell, if not, load it.
# Example: dataset = load_from_disk('/path/to/your/dataset')

if 'dataset' not in locals() or dataset is None or 'dev' not in dataset or dataset['dev'] is None:
    print("Error: The 'dataset' variable (containing the dev set) is not available or correctly loaded.")
    original_dev_dataset = None
else:
    original_dev_dataset = dataset['dev']
    print("Loaded original dev dataset for RAG evaluation.")

# List to store generated answers and references for evaluation
rag_predictions_and_references = []

Loaded original dev dataset for RAG evaluation.


**Reasoning**:
Iterate through the dev dataset, extract questions, embed them, retrieve contexts using the FAISS index, generate answers using GPT-4.1-nano, and store the results.



## Evaluate RAG on Dev Set

### Subtask:
Iterate through the questions in the dev set. For each question, perform retrieval using the fine-tuned Legal-BERT model and the FAISS index to get relevant contexts. Use these contexts and the question to prompt the GPT-4.1-nano model and generate an answer. Collect the generated answers.

In [None]:
# Assuming dataset is loaded from a previous cell, if not, load it.
# Example: dataset = load_from_disk('/path/to/your/dataset')

if 'dataset' not in locals() or dataset is None or 'dev' not in dataset or dataset['dev'] is None:
    print("Error: The 'dataset' variable (containing the dev set) is not available or correctly loaded.")
    original_dev_dataset = None
else:
    original_dev_dataset = dataset['dev']
    print("Loaded original dev dataset for RAG evaluation.")

# List to store generated answers and references for evaluation
rag_predictions_and_references = []

Loaded original dev dataset for RAG evaluation.


In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import faiss
from openai import OpenAI
import collections # For handling original dataset structure
from tqdm.auto import tqdm
import time # To add delays between API calls if needed

# Ensure necessary variables are available from previous cells:
# - best_model (loaded fine-tuned Legal-BERT)
# - tokenizer (loaded tokenizer)
# - index (FAISS index created from dev contexts)
# - contexts (list of dev contexts used to build the index)
# - original_dev_dataset (loaded original dev dataset)
# - openai_api_key (from Colab secrets or defined directly)


if ('best_model' not in locals() or best_model is None or
    'tokenizer' not in locals() or tokenizer is None or
    'index' not in locals() or index is None or
    'contexts' not in locals() or not contexts or
    'original_dev_dataset' not in locals() or original_dev_dataset is None):
    print("Error: Required variables for RAG evaluation are not available.")
    print("Please ensure the following cells have run successfully:")
    print("- Cell loading the best_model and tokenizer (e.g., ef60067e)")
    print("- Cell generating context embeddings (e.g., 715db90c)")
    print("- Cell creating the FAISS index (e.g., c9e1dc6d)")
    print("- Cell preparing dev contexts (e.g., 349fa9e6)")
    print("- Cell loading original dev dataset (e.g., 7ca21051)")

else:
    print("Starting RAG evaluation on the development set...")

    # Ensure OpenAI API key is available
    # In a non-Colab environment, load this from environment variables or a config file
    if 'openai_api_key' not in locals():
        # Attempt to get from userdata if not already defined (e.g., if cell 2f8a88f2 wasn't run)
        try:
            openai_api_key = ''
        except Exception:
            openai_api_key = os.environ.get('OPENAI_API_KEY') # Try environment variable

    if openai_api_key is None:
        print("Error: OpenAI API key not found. Please define 'openai_api_key' or add it to Colab secrets.")
    else:
        try:
            # Initialize the OpenAI client
            client = OpenAI(api_key=openai_api_key)
            print("OpenAI client initialized.")

            # List to store generated answers and references for evaluation
            rag_predictions_and_references = []

            # Ensure model is in evaluation mode
            best_model.eval()

            # Access the base encoder for question embedding
            if hasattr(best_model, 'bert'):
                base_encoder = best_model.bert
            elif hasattr(best_model, 'base_model'):
                base_encoder = best_model.base_model
            else:
                base_encoder = list(best_model.children())[0] # Fallback


            if base_encoder is None:
                 print("Error: Could not access the base encoder for question embedding. Cannot proceed with evaluation.")
            else:
                # Iterate through each example in the original dev dataset
                # The structure is examples['paragraphs'][i]['qas'][j]
                print(f"Processing {len(original_dev_dataset)} examples in the dev set...")
                # We need to iterate through the nested structure to get each individual question and its ID/answers
                all_qas = []
                for example in original_dev_dataset:
                    if "paragraphs" in example and isinstance(example["paragraphs"], list):
                        for paragraph in example["paragraphs"]:
                            if "qas" in paragraph and isinstance(paragraph["qas"], list):
                                for qa in paragraph["qas"]:
                                    if "id" in qa and "question" in qa and "answers" in qa:
                                        all_qas.append({
                                            "id": qa["id"],
                                            "question": qa["question"],
                                            "answers": qa["answers"] # Ground truth answers
                                        })
                                    else:
                                        print(f"Warning: Skipping QA due to missing 'id', 'question', or 'answers'.")
                            else:
                                print(f"Warning: Skipping paragraph due to missing or invalid 'qas' list.")
                    else:
                        print(f"Warning: Skipping example due to missing or invalid 'paragraphs' list.")


                print(f"Extracted {len(all_qas)} individual questions from the dev set for RAG evaluation.")


                # Process questions in batches for embedding
                question_batch_size = 32 # Adjust based on memory
                retrieval_k = 5 # Number of contexts to retrieve for each question
                llm_model_name = "gpt-4.1-nano-2025-04-14" # Specify the LLM model name


                # Process questions in batches
                for i in tqdm(range(0, len(all_qas), question_batch_size), desc="Evaluating RAG on Dev Set"):
                    batch_qas = all_qas[i:i + question_batch_size]
                    batch_questions = [qa["question"] for qa in batch_qas]
                    batch_qids = [qa["id"] for qa in batch_qas]
                    batch_ground_truth_answers = [qa["answers"] for qa in batch_qas]


                    # Embed the batch of questions
                    encoded_questions = tokenizer(batch_questions, padding=True, truncation=True, return_tensors='pt', max_length=512).to(best_model.device)

                    with torch.no_grad():
                        encoder_outputs = base_encoder(**encoded_questions, return_dict=True)
                        last_hidden_states = encoder_outputs.last_hidden_state

                        input_mask = encoded_questions['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
                        sum_embeddings = torch.sum(last_hidden_states * input_mask, 1)
                        sum_mask = torch.clamp(input_mask.sum(1), min=1e-9)
                        question_embeddings = (sum_embeddings / sum_mask).cpu().numpy() # Get numpy array


                    # Perform similarity search for each question in the batch
                    batch_retrieved_contexts = []
                    distances, indices = index.search(question_embeddings, retrieval_k) # Search for the batch

                    for j in range(len(batch_qas)):
                        # Retrieve the actual contexts based on the indices for this question
                        retrieved_context_indices = indices[j]
                        retrieved_context_list = [contexts[idx] for idx in retrieved_context_indices]
                        batch_retrieved_contexts.append(retrieved_context_list)


                    # Use the generative model to answer each question based on retrieved contexts
                    # Process questions one by one for LLM call as batching might be different or not supported by API
                    for j in range(len(batch_qas)):
                        qid = batch_qids[j]
                        question = batch_questions[j]
                        ground_truth_answers = batch_ground_truth_answers[j]
                        retrieved_context_for_question = batch_retrieved_contexts[j]

                        # Combine retrieved contexts into a single string for the prompt
                        contexts_text = "\n---\n".join(retrieved_context_for_question)

                        # Create the prompt
                        prompt = f"""Answer the following question based only on the provided context. If you cannot find the answer in the context, say "I cannot find the answer in the provided information."

Context:
{contexts_text}

Question:
{question}

Answer:
"""
                        generated_answer = "Error: API call failed" # Default in case of API error

                        try:
                            # Call the OpenAI API
                            response = client.chat.completions.create(
                                model=llm_model_name, # Specify the model
                                messages=[
                                    {"role": "system", "content": "You are a legal expert/judge that answers questions based on provided context."},
                                    {"role": "user", "content": prompt}
                                ],
                                temperature=0.0,
                            )
                            generated_answer = response.choices[0].message.content

                        except Exception as e:
                            print(f"Error calling OpenAI API for QID {qid}: {e}")
                            # Continue with default error message


                        # Store the generated answer and ground truth for this question
                        rag_predictions_and_references.append({
                            "id": qid,
                            "prediction_text": generated_answer,
                            "references": ground_truth_answers # Store the list of ground truth answer dicts
                        })

                        # Optional: Add a small delay to avoid hitting API rate limits
                        # time.sleep(0.1) # Adjust delay as needed


                print(f"Processed batch {i // question_batch_size + 1}. Collected {len(rag_predictions_and_references)} results so far.")


            print("\nFinished RAG evaluation on the development set.")
            print(f"Collected generated answers for {len(rag_predictions_and_references)} questions.")

            # The 'rag_predictions_and_references' list now contains the results needed for evaluation.

        except Exception as e:
            print(f"An error occurred during the RAG evaluation loop: {e}")
            print("RAG evaluation on dev set failed.")

Starting RAG evaluation on the development set...
OpenAI client initialized.
Processing 20 examples in the dev set...
Extracted 3809 individual questions from the dev set for RAG evaluation.


Evaluating RAG on Dev Set:   0%|          | 0/120 [00:00<?, ?it/s]

Processed batch 120. Collected 3809 results so far.

Finished RAG evaluation on the development set.
Collected generated answers for 3809 questions.


## Evaluate Generation Quality on Dev Set

### Subtask:
Compare the generated answers from the RAG system on the dev set against the ground truth answers using F1 and Exact Match metrics.

In [None]:
import evaluate
import collections # Needed for normalize_answer function if not already imported
import string    # Needed for normalize_answer function if not already imported
import re        # Needed for normalize_answer function if not already imported
from tqdm.auto import tqdm # Needed for progress bar

# Assuming 'rag_predictions_and_references' is available from cell 15c5cec4

if 'rag_predictions_and_references' not in locals() or not isinstance(rag_predictions_and_references, list) or not rag_predictions_and_references:
    print("Error: 'rag_predictions_and_references' is not available or is empty.")
    print("Please ensure cell 15c5cec4 ran successfully and collected the results.")
else:
    print("Evaluating generated answers using SQuAD metrics (Exact Match and F1)...")

    try:
        # We need to prepare the predictions and references in the format expected by evaluate.load("squad")
        # Predictions should be a list of dictionaries like [{"id": "qid", "prediction_text": "answer"}]
        # References should be a list of dictionaries like [{"id": "qid", "answers": {"text": ["ans1", "ans2"], "answer_start": [start1, start2]}}]

        formatted_predictions = []
        formatted_references = []

        # Create a dictionary for quick lookup of references by ID,
        # as evaluate.compute expects parallel lists of predictions and references with matching IDs.
        # Although the input list is already structured by question, creating a dict is safer
        # if the evaluation library's compute function reorders internally or expects random access.
        references_by_id = {}
        for item in rag_predictions_and_references:
             # Each item has 'id', 'prediction_text', and 'references' (which is the list of ground truth answer dicts)
             qid = item['id']
             # Format ground truth answers for the reference structure
             ground_truth_answer_texts = [ans.get("text", "") for ans in item['references'] if isinstance(ans, dict)]
             ground_truth_answer_starts = [ans.get("answer_start", -1) for ans in item['references'] if isinstance(ans, dict)]

             # Store the reference
             references_by_id[qid] = {
                 "id": qid,
                 "answers": {
                     "text": ground_truth_answer_texts,
                     "answer_start": ground_truth_answer_starts
                 }
             }

             # Add the prediction to the formatted list
             formatted_predictions.append({
                 "id": qid,
                 "prediction_text": item['prediction_text']
             })

        # Now, create the formatted_references list ensuring the order matches formatted_predictions
        # Iterate through formatted_predictions to maintain order
        for pred in formatted_predictions:
             qid = pred['id']
             if qid in references_by_id:
                  formatted_references.append(references_by_id[qid])
             else:
                  print(f"Warning: No reference found for prediction ID {qid}. Skipping.")
                  # Optionally, remove the prediction if no reference exists
                  # formatted_predictions.remove(pred) # This modifies list while iterating, not safe

        # A safer approach if order needs to be strictly matched is to build both lists in the same loop:
        # Rebuild both lists from the collected results
        formatted_predictions_rebuilt = []
        formatted_references_rebuilt = []
        for item in rag_predictions_and_references:
            qid = item['id']
            # Ensure ground truth answers are in the correct format
            ground_truth_answer_texts = [ans.get("text", "") for ans in item['references'] if isinstance(ans, dict)]
            ground_truth_answer_starts = [ans.get("answer_start", -1) for ans in item['references'] if isinstance(ans, dict)]

            formatted_predictions_rebuilt.append({
                "id": qid,
                "prediction_text": item['prediction_text']
            })
            formatted_references_rebuilt.append({
                "id": qid,
                "answers": {
                    "text": ground_truth_answer_texts,
                    "answer_start": ground_truth_answer_starts
                }
            })

        formatted_predictions = formatted_predictions_rebuilt
        formatted_references = formatted_references_rebuilt


        # Load the SQuAD evaluation metric
        metric = evaluate.load("squad")

        # Compute the metrics
        # Ensure both lists have the same length, although the 'squad' metric handles mismatched IDs
        if len(formatted_predictions) != len(formatted_references):
             print(f"Warning: Mismatch in number of predictions ({len(formatted_predictions)}) and references ({len(formatted_references)}).")


        # Compute metrics, handling potential issues with empty lists
        if formatted_predictions and formatted_references:
            metrics = metric.compute(predictions=formatted_predictions, references=formatted_references)

            print("\n--- RAG System Evaluation Results on Dev Set ---")
            print(f"Exact Match: {metrics['exact_match']:.2f}")
            print(f"F1 Score: {metrics['f1']:.2f}")
            print("-" * 50)
        else:
            print("\nNo valid predictions or references available for metric computation.")


    except Exception as e:
        print(f"An error occurred during metric computation: {e}")
        print("Evaluation on Dev Set failed.")

# The RAG system's performance on the Dev Set is now evaluated (Step 2 complete).
# The next steps involve exploring fine-tuning the generative model and evaluating on the test set.

Evaluating generated answers using SQuAD metrics (Exact Match and F1)...

--- RAG System Evaluation Results on Dev Set ---
Exact Match: 0.03
F1 Score: 6.89
--------------------------------------------------


## Summary of Research Process

This document details a series of experiments conducted to fine-tune and evaluate a Legal-BERT small uncased model for the PolicyQA question answering task, followed by its integration into a Retrieval Augmented Generation (RAG) system. The methodology employed involved iterative hyperparameter tuning and rigorous evaluation to identify optimal configurations and analyze model performance.

**1. Data Preparation:**
The process began with loading and preparing the PolicyQA dataset. This involved tokenizing the dataset and structuring it into a format suitable for training a Question Answering model, including the identification of answer spans within the provided contexts.

**2. Baseline Model Fine-tuning and Evaluation:**
An initial fine-tuning of the `nlpaueb/legal-bert-small-uncased` model was performed on the training split of the PolicyQA dataset. The model was evaluated on the development set to establish a baseline performance in terms of Exact Match (EM) and F1 score. This initial run utilized default or commonly used hyperparameters.

**3. Hyperparameter Tuning Experiments:**
A series of experiments were conducted to investigate the impact of key hyperparameters on the fine-tuned model's performance. For each experiment, the model was trained and evaluated multiple times (typically 3 runs) with different values for a specific hyperparameter, while keeping others fixed to the best values found in preceding experiments or to a sensible baseline. The average EM and F1 scores were calculated across runs for each hyperparameter value to mitigate the impact of random initialization.

The hyperparameters explored included:
- **Learning Rate:** Different learning rates (e.g., 1e-5, 3e-5, 5e-5) were tested to find the optimal rate for convergence and performance.
- **Number of Training Epochs:** The impact of training duration (e.g., 2, 5, 10 epochs) on mitigating underfitting and overfitting was investigated.
- **Maximum Sequence Length and Stride:** The configuration of the tokenizer's `max_length` and `stride` parameters was explored to understand their effect on handling long contexts and answer spans that might cross tokenization boundaries.
- **Batch Size:** Different training and evaluation batch sizes were tested to assess their influence on training efficiency and model performance.
- **Weight Decay:** Various weight decay values were examined to determine their effectiveness in regularizing the model and preventing overfitting.

**4. Post-processing Parameter Tuning:**
Beyond model training hyperparameters, the influence of post-processing parameters (`max_answer_length` and `n_best_size`) on the final answer extraction and evaluation metrics was investigated. Different combinations of these parameters were applied to the raw logits predicted by the model to identify the configuration yielding the best results on the development set.

**5. Identification of Best Model and Configuration:**
Based on the results of the hyperparameter tuning experiments, the model configuration (including training hyperparameters and post-processing parameters) that achieved the highest average F1 score on the development set was identified as the "best" performing configuration.

**6. Training and Evaluation of Final Model:**
A new instance of the Legal-BERT model was trained from scratch using the identified "best" training hyperparameters (Learning Rate, Epochs, Max Length, Stride, Batch Size, Weight Decay). This final fine-tuned model was then explicitly saved to a designated path. The performance of this final model was evaluated on both the development set and the unseen test set using the identified "best" post-processing parameters to obtain its final performance metrics.

**7. Error Analysis:**
A detailed error analysis was conducted on the predictions of the final model on the test set. This involved matching model predictions to ground truth answers, identifying incorrect and partial matches, and qualitatively and quantitatively analyzing common error patterns. Specific focus was placed on understanding instances where the model predicted answer spans that were significantly shorter than the ground truth.

**8. Retrieval Augmented Generation (RAG) System Implementation:**
The fine-tuned Legal-BERT model was integrated into a RAG system. This involved:
- Generating embeddings for the policy contexts in the dataset using the fine-tuned Legal-BERT model's encoder.
- Building an efficient vector index (using FAISS) from these context embeddings for fast similarity search.
- Implementing a retrieval mechanism to find the most relevant contexts for a given user question by embedding the question and querying the FAISS index.
- Utilizing a large language model (specifically, OpenAI's `gpt-4.1-nano-2025-04-14`) to generate a final answer based on the user's question and the retrieved contexts.

**9. RAG System Evaluation:**
The implemented RAG system was evaluated on the development set by iterating through dev questions, performing retrieval, and generating answers using GPT-4.1-nano. The generated answers were then compared against the ground truth answers using SQuAD metrics to assess the overall performance of the RAG pipeline.

This systematic approach, encompassing fine-tuning, hyperparameter optimization, error analysis, and RAG system integration, provides a comprehensive understanding of the Legal-BERT model's capabilities for PolicyQA and the performance of a RAG system built upon it.