In [None]:
import numpy as np

In [None]:
import openai
import os

# Chat model used for every few-shot classification request in this notebook.
gpt_qa_model = "gpt-4.1-nano-2025-04-14"

# Read the key from the environment instead of hard-coding it in the notebook,
# so it never ends up in version control or in shared copies of this file.
openai.api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not openai.api_key:
    # Fail here rather than on every single request later on, where the
    # prediction loop would swallow the error and record "Error" predictions.
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")


In [None]:
from datasets import load_dataset

# Seed for the train/validation split so the split (and therefore the pool the
# few-shot examples are drawn from) is identical on every run.
SPLIT_SEED = 42

# The files are tab-separated despite the .csv extension, hence the explicit delimiter.
# load_dataset puts a single data file under the "train" key of the returned DatasetDict.
raw_train_dataset = load_dataset('csv', data_files='PLUE/PLUE-main/data/privacyqa/policy_train_data.csv', delimiter='\t')
test_dataset = load_dataset('csv', data_files='PLUE/PLUE-main/data/privacyqa/policy_test_data.csv', delimiter='\t')

# Split the training data into training (90%) and validation (10%) sets
train_validation_split = raw_train_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
validation_dataset = train_validation_split["test"]
train_dataset = train_validation_split["train"]

print("Training dataset:", train_dataset)
print("Validation dataset:", validation_dataset)
print("Test dataset:", test_dataset)

# Partition the training examples by label in a single pass over the dataset.
# Rows whose label is neither "Relevant" nor "Irrelevant" are ignored.
relevant_examples = []
irrelevant_examples = []
for example in train_dataset:
    if example["Label"] == "Relevant":
        relevant_examples.append(example)
    elif example["Label"] == "Irrelevant":
        irrelevant_examples.append(example)

print("\nNumber of relevant examples in training data:", len(relevant_examples))
print("Number of irrelevant examples in training data:", len(irrelevant_examples))

Training dataset: Dataset({
    features: ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment', 'Label'],
    num_rows: 166680
})
Validation dataset: Dataset({
    features: ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment', 'Label'],
    num_rows: 18520
})
Test dataset: DatasetDict({
    train: Dataset({
        features: ['Folder', 'DocID', 'QueryID', 'SentID', 'Split', 'Query', 'Segment', 'Any_Relevant', 'Ann1', 'Ann2', 'Ann3', 'Ann4', 'Ann5', 'Ann6'],
        num_rows: 62150
    })
})

Number of relevant examples in training data: 6434
Number of irrelevant examples in training data: 160246


# Task
Set up and run a 1, 2, and 3-shot learning experiment using a loaded model on a test set, including information about class imbalance in the prompts, and evaluate the performance for each shot number.

In [None]:
# Randomly select up to 3 "Relevant" and 3 "Irrelevant" examples to serve as
# the pool of few-shot demonstrations.

# Seeded generator so the same demonstrations are drawn on every run.
FEW_SHOT_SEED = 42
NUM_EXAMPLES_PER_CLASS = 3
few_shot_rng = np.random.default_rng(FEW_SHOT_SEED)


def _sample_few_shot_examples(pool, label_name, sample_size, rng):
    """Draw `sample_size` distinct examples from `pool` without replacement.

    Args:
        pool (list): Candidate examples; each must provide "Segment" and "Label".
        label_name (str): Lower-case class name, used only in the warning text.
        sample_size (int): Number of examples to draw.
        rng (numpy.random.Generator): Random generator used for sampling.

    Returns:
        tuple: (indices, examples) where `indices` are the positions chosen in
        `pool` and `examples` are dicts holding only "Segment" and "Label".
    """
    pool_size = len(pool)
    if pool_size < sample_size:
        # Not enough examples to sample from: fall back to using the whole pool.
        print(f"Warning: Only {pool_size} {label_name} examples available. Selecting all available {label_name} examples.")
        indices = list(range(pool_size))
    else:
        # .tolist() converts numpy integers to plain Python ints.
        indices = rng.choice(pool_size, size=sample_size, replace=False).tolist()
    examples = [
        {"Segment": pool[i]["Segment"], "Label": pool[i]["Label"]}
        for i in indices
    ]
    return indices, examples


num_relevant_examples = len(relevant_examples)
relevant_indices, selected_relevant_examples = _sample_few_shot_examples(
    relevant_examples, "relevant", NUM_EXAMPLES_PER_CLASS, few_shot_rng
)

num_irrelevant_examples = len(irrelevant_examples)
irrelevant_indices, selected_irrelevant_examples = _sample_few_shot_examples(
    irrelevant_examples, "irrelevant", NUM_EXAMPLES_PER_CLASS, few_shot_rng
)

# Store the selected examples as a dictionary keyed by class label.
few_shot_examples = {
    "Relevant": selected_relevant_examples,
    "Irrelevant": selected_irrelevant_examples,
}

print("Selected few-shot examples:")
print(few_shot_examples)


Selected few-shot examples:
{'Relevant': [{'Segment': 'Reddit allows other websites to embed public Reddit content via our embed tools.', 'Label': 'Relevant'}, {'Segment': 'We will learn and collect your own wording from your input text messages (SMS or MMS), to help us provide you a faster and more precise prediction (including the popup smiley prediction).', 'Label': 'Relevant'}, {'Segment': 'Deleting Your Account', 'Label': 'Relevant'}], 'Irrelevant': [{'Segment': 'In addition, we are required to list the data access right available to all users in a specific way.', 'Label': 'Irrelevant'}, {'Segment': 'Provide the Service: Your information will be used primarily to provide you with the Service.', 'Label': 'Irrelevant'}, {'Segment': 'maintain appropriate records for internal administrative purposes;', 'Label': 'Irrelevant'}]}


## Construct prompts for few-shot learning

### Subtask:
Create prompts for each test data point, including the selected few-shot examples and the test sentence. Add a note about the class imbalance in the prompt.


In [None]:
def construct_few_shot_prompt(example, few_shot_examples, num_shots, system_instruction=None):
    """
    Build the text prompt used to classify one test example.

    The prompt consists of, in order: the optional instruction, the few-shot
    demonstrations ('Relevant' first, then 'Irrelevant'), a note about the
    class imbalance, and the sentence to classify followed by an open
    "Label:" slot.

    Args:
        example (dict): The test example to classify; its 'Segment' is used.
        few_shot_examples (dict): Maps "Relevant" / "Irrelevant" to lists of
            demonstrations, each with 'Segment' and 'Label'.
        num_shots (int): Total number of demonstrations requested.
        system_instruction (str, optional): Text placed at the very top of the
            prompt. Omitted when None or empty.

    Returns:
        str: The constructed prompt string.
    """
    pieces = []
    if system_instruction:
        pieces.append(f"{system_instruction}\n\n")

    # Split the shots evenly between the classes; an odd count gives the
    # extra demonstration to the 'Relevant' class.
    half, extra = divmod(num_shots, 2)
    quotas = (("Relevant", half + extra), ("Irrelevant", half))

    for class_name, wanted in quotas:
        pool = few_shot_examples.get(class_name, [])
        # Never ask for more demonstrations than the pool holds.
        for position in range(min(wanted, len(pool))):
            demo = pool[position]
            pieces.append(f"Sentence: {demo['Segment']}\nLabel: {demo['Label']}\n\n")

    # Tell the model about the skewed label distribution.
    pieces.append("Note: The dataset has a significant class imbalance, with many more 'Irrelevant' examples than 'Relevant' ones.\n\n")

    # The sentence to classify, leaving the label for the model to fill in.
    pieces.append(f"Sentence: {example['Segment']}\nLabel:")

    return "".join(pieces)

# Demonstration: build and show the 1-, 2- and 3-shot prompts for one test example.
# The instruction below opens every prompt.
system_instruction = "Act as a legal expert and classify the following sentences as 'Relevant' or 'Irrelevant' to privacy policies."

# Relies on 'test_dataset' and 'few_shot_examples' created in earlier cells.
# The test data sits under the 'train' key of the loaded DatasetDict; take its first row.
first_test_example = test_dataset['train'][0]

prompt_1_shot, prompt_2_shots, prompt_3_shots = (
    construct_few_shot_prompt(
        first_test_example,
        few_shot_examples,
        num_shots=shot_count,
        system_instruction=system_instruction,
    )
    for shot_count in (1, 2, 3)
)

# Print each prompt under its banner, in increasing shot order.
demo_prompts = [
    ("--- 1-Shot Prompt Example ---", prompt_1_shot),
    ("\n--- 2-Shot Prompt Example ---", prompt_2_shots),
    ("\n--- 3-Shot Prompt Example ---", prompt_3_shots),
]
for banner, demo_prompt in demo_prompts:
    print(banner)
    print(demo_prompt)

--- 1-Shot Prompt Example ---
Act as a legal expert and classify the following sentences as 'Relevant' or 'Irrelevant' to privacy policies.

Sentence: Reddit allows other websites to embed public Reddit content via our embed tools.
Label: Relevant

Note: The dataset has a significant class imbalance, with many more 'Irrelevant' examples than 'Relevant' ones.

Sentence:   At Fiverr we care about your privacy.
Label:

--- 2-Shot Prompt Example ---
Act as a legal expert and classify the following sentences as 'Relevant' or 'Irrelevant' to privacy policies.

Sentence: Reddit allows other websites to embed public Reddit content via our embed tools.
Label: Relevant

Sentence: In addition, we are required to list the data access right available to all users in a specific way.
Label: Irrelevant

Note: The dataset has a significant class imbalance, with many more 'Irrelevant' examples than 'Relevant' ones.

Sentence:   At Fiverr we care about your privacy.
Label:

--- 3-Shot Prompt Example ---


# Task
Perform 1, 2, and 3-shot learning experiments on the test dataset using the `gpt-4.1-nano` model, construct prompts without explicit tokenization, and evaluate the results using Precision, Recall, and F1-score, considering the class imbalance in the training data.

In [None]:
# Get 1-, 2- and 3-shot predictions for a subset of the test dataset.
# Only a fixed-size prefix of the test data is used, which bounds the number of API calls.
subset_size = 1000
test_subset = test_dataset['train'].select(range(min(subset_size, len(test_dataset['train']))))


def _clean_model_reply(reply_text):
    """Reduce a raw model reply to a bare label when it clearly names one.

    Replies such as "Label: Relevant" (the model echoing the prompt's
    "Label:" slot) or a quoted / punctuated label would otherwise fail the
    exact-match lookup during evaluation and be dropped.

    Args:
        reply_text (str): Text content of the model's reply.

    Returns:
        str: "Relevant" or "Irrelevant" when the reply is one of those labels
        (ignoring case, a leading "Label:" and surrounding quotes or
        punctuation); otherwise the whitespace-stripped reply unchanged.
    """
    stripped = reply_text.strip()
    candidate = stripped
    if candidate.lower().startswith("label:"):
        candidate = candidate[len("label:"):]
    candidate = candidate.strip(" \t\r\n'\"`*.").lower()
    return {"relevant": "Relevant", "irrelevant": "Irrelevant"}.get(candidate, stripped)


def _collect_few_shot_predictions(dataset, num_shots, banner_prefix=""):
    """Query the chat model once per example and return the predicted labels.

    Args:
        dataset: Indexable collection of examples, each a dict with 'Segment'.
        num_shots (int): Number of few-shot demonstrations per prompt.
        banner_prefix (str): Text printed in front of the opening progress line
            (used to separate consecutive runs with a blank line).

    Returns:
        list[str]: One entry per example, in order. Failed requests are
        recorded as "Error" so positions stay aligned with the dataset.
    """
    predictions = []
    total = len(dataset)

    print(f"{banner_prefix}Starting {num_shots}-shot predictions for a subset of {total} examples...")
    for i in range(total):
        if (i + 1) % 100 == 0:  # Progress message every 100 examples
            print(f"Processing {num_shots}-shot example {i + 1}/{total}...")

        example = dataset[i]
        prompt = construct_few_shot_prompt(example, few_shot_examples, num_shots=num_shots, system_instruction=system_instruction)

        try:
            response = openai.chat.completions.create(
                model=gpt_qa_model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                # Minimise sampling randomness so repeated runs are comparable.
                temperature=0,
            )
            predictions.append(_clean_model_reply(response.choices[0].message.content))

        except Exception as e:
            # Best effort: log the failure and keep going so one bad request
            # does not abort the whole run.
            sent_id = example.get('SentID', 'UnknownSentID') if isinstance(example, dict) else 'UnknownSentID'
            print(f"Error getting {num_shots}-shot prediction for example with SentID: {sent_id}. Error: {e}")
            predictions.append("Error")

    print(f"Finished getting {num_shots}-shot predictions for the subset.")
    print(f"Number of {num_shots}-shot predictions:", len(predictions))
    return predictions


one_shot_predictions_subset = _collect_few_shot_predictions(test_subset, num_shots=1)
two_shot_predictions_subset = _collect_few_shot_predictions(test_subset, num_shots=2, banner_prefix="\n")
three_shot_predictions_subset = _collect_few_shot_predictions(test_subset, num_shots=3, banner_prefix="\n")

Starting 1-shot predictions for a subset of 1000 examples...
Processing 1-shot example 100/1000...
Processing 1-shot example 200/1000...
Processing 1-shot example 300/1000...
Processing 1-shot example 400/1000...
Processing 1-shot example 500/1000...
Processing 1-shot example 600/1000...
Processing 1-shot example 700/1000...
Processing 1-shot example 800/1000...
Processing 1-shot example 900/1000...
Processing 1-shot example 1000/1000...
Finished getting 1-shot predictions for the subset.
Number of 1-shot predictions: 1000

Starting 2-shot predictions for a subset of 1000 examples...
Processing 2-shot example 100/1000...
Processing 2-shot example 200/1000...
Processing 2-shot example 300/1000...
Processing 2-shot example 400/1000...
Processing 2-shot example 500/1000...
Processing 2-shot example 600/1000...
Processing 2-shot example 700/1000...
Processing 2-shot example 800/1000...
Processing 2-shot example 900/1000...
Processing 2-shot example 1000/1000...
Finished getting 2-shot pred

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Prepare the ground truth labels for the test subset and score the predictions.
# Ground truth comes from the 'Any_Relevant' column of the test subset and is
# converted to integers (0 = Irrelevant, 1 = Relevant) for scikit-learn.

# Check unique values in 'Any_Relevant' of the subset to be sure they are consistent
unique_any_relevant_subset_values = set(test_subset['Any_Relevant'])
print("Unique values in 'Any_Relevant' of the subset:", unique_any_relevant_subset_values)


def _canonical_label(raw_prediction):
    """Map a raw prediction onto 'Relevant'/'Irrelevant' when it clearly names one.

    Accepts a leading "Label:" prefix, surrounding quotes or punctuation, and
    any letter case. Anything else (including the "Error" marker) is
    returned unchanged so the caller can report and skip it.
    """
    if not isinstance(raw_prediction, str):
        return raw_prediction
    candidate = raw_prediction.strip()
    if candidate.lower().startswith("label:"):
        candidate = candidate[len("label:"):]
    candidate = candidate.strip(" \t\r\n'\"`*.").lower()
    return {"relevant": "Relevant", "irrelevant": "Irrelevant"}.get(candidate, raw_prediction)


def convert_predictions_to_int(predictions, label_map, ground_truth=None):
    """Convert text predictions to integers, keeping ground truth aligned.

    Args:
        predictions (list): Predicted labels, one per evaluated example.
        label_map (dict): Maps label text to its integer id.
        ground_truth (sequence, optional): Integer ground-truth labels aligned
            with `predictions`. Defaults to the module-level
            `ground_truth_labels_subset`.

    Returns:
        tuple: (numerical_predictions, valid_ground_truth_labels), containing
        only the positions whose prediction could be mapped to a label.
    """
    if ground_truth is None:
        ground_truth = ground_truth_labels_subset

    numerical_predictions = []
    valid_ground_truth_labels = []
    for i, pred in enumerate(predictions):
        label = _canonical_label(pred)
        if label in label_map:
            numerical_predictions.append(label_map[label])
            valid_ground_truth_labels.append(ground_truth[i])
        else:
            # 'Error' markers and unrecognised replies are excluded from the metrics.
            print(f"Skipping prediction '{pred}' at index {i} due to unexpected value.")
    return numerical_predictions, valid_ground_truth_labels


def _report_weighted_metrics(shot_name, eval_labels, numerical_predictions):
    """Print weighted precision/recall/F1 and accuracy for one shot setting.

    Returns:
        tuple | None: (precision, recall, f1, accuracy), or None when there
        are no valid predictions to evaluate.
    """
    print(f"\n--- {shot_name} Evaluation Results ---")
    if not numerical_predictions:
        print("No valid predictions to evaluate.")
        return None

    # zero_division=0 avoids warnings when a class is never predicted.
    precision = precision_score(eval_labels, numerical_predictions, average='weighted', zero_division=0)
    recall = recall_score(eval_labels, numerical_predictions, average='weighted', zero_division=0)
    f1 = f1_score(eval_labels, numerical_predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(eval_labels, numerical_predictions)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    return precision, recall, f1, accuracy


if unique_any_relevant_subset_values.issubset({"Irrelevant", "Relevant"}):
    # Expose the ground truth under the column name 'Label' for the mapping below.
    test_subset_with_label = test_subset.rename_column("Any_Relevant", "Label")

    def convert_labels_to_int_subset(examples):
        """Batched map function adding an integer 'labels' column."""
        label_map = {"Irrelevant": 0, "Relevant": 1}
        return {"labels": [label_map[label] for label in examples["Label"]]}

    test_subset_with_numerical_labels = test_subset_with_label.map(convert_labels_to_int_subset, batched=True)

    # Extract the numerical ground truth labels as a plain list
    ground_truth_labels_subset = list(test_subset_with_numerical_labels['labels'])

    print("\nGround truth labels prepared for the subset.")
    print("First 10 ground truth labels:", ground_truth_labels_subset[:10])

    # Convert predicted labels to numerical format (0 and 1)
    label_map = {"Irrelevant": 0, "Relevant": 1}

    # Convert predictions for each shot number
    one_shot_numerical_predictions, one_shot_eval_labels = convert_predictions_to_int(one_shot_predictions_subset, label_map, ground_truth_labels_subset)
    two_shot_numerical_predictions, two_shot_eval_labels = convert_predictions_to_int(two_shot_predictions_subset, label_map, ground_truth_labels_subset)
    three_shot_numerical_predictions, three_shot_eval_labels = convert_predictions_to_int(three_shot_predictions_subset, label_map, ground_truth_labels_subset)

    # --- Evaluate 1-Shot Performance ---
    one_shot_weighted_metrics = _report_weighted_metrics("1-Shot", one_shot_eval_labels, one_shot_numerical_predictions)
    if one_shot_weighted_metrics is not None:
        one_shot_precision, one_shot_recall, one_shot_f1, one_shot_accuracy = one_shot_weighted_metrics

    # --- Evaluate 2-Shot Performance ---
    two_shot_weighted_metrics = _report_weighted_metrics("2-Shot", two_shot_eval_labels, two_shot_numerical_predictions)
    if two_shot_weighted_metrics is not None:
        two_shot_precision, two_shot_recall, two_shot_f1, two_shot_accuracy = two_shot_weighted_metrics

    # --- Evaluate 3-Shot Performance ---
    three_shot_weighted_metrics = _report_weighted_metrics("3-Shot", three_shot_eval_labels, three_shot_numerical_predictions)
    if three_shot_weighted_metrics is not None:
        three_shot_precision, three_shot_recall, three_shot_f1, three_shot_accuracy = three_shot_weighted_metrics

else:
    print("\n'Any_Relevant' column in the subset contains unexpected values. Cannot convert to numerical labels for evaluation.")

Unique values in 'Any_Relevant' of the subset: {'Irrelevant', 'Relevant'}

Ground truth labels prepared for the subset.
First 10 ground truth labels: [0, 0, 0, 1, 0, 1, 0, 0, 1, 0]
Skipping prediction 'Label: Relevant' at index 901 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 29 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 148 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 150 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 166 due to unexpected value.
Skipping prediction 'Label: Irrelevant' at index 230 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 239 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 276 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 341 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 364 due to unexpected value.
Skipping prediction 'Label: Relevant' at index 394 due to 

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Class-wise (unaveraged) precision, recall and F1 for each shot setting.
# Relies on 'one_shot_numerical_predictions', 'two_shot_numerical_predictions',
# 'three_shot_numerical_predictions' and their corresponding
# 'one_shot_eval_labels', 'two_shot_eval_labels', 'three_shot_eval_labels'
# from the previous cell, which contain only valid predictions/labels.

label_names = ["Irrelevant", "Relevant"]
# Integer ids in the same order as label_names (0 = Irrelevant, 1 = Relevant,
# matching the label_map used when the predictions were converted).
class_ids = list(range(len(label_names)))


def _report_classwise_metrics(shot_name, eval_labels, numerical_predictions):
    """Print per-class precision, recall and F1 for one shot setting.

    Passing `labels=class_ids` fixes the length and order of the score
    arrays, so each row is printed under the right class name even when a
    class is absent from both the labels and the predictions.

    Returns:
        tuple | None: (precision, recall, f1) arrays indexed by class id, or
        None when there are no valid predictions to evaluate.
    """
    print(f"\n--- {shot_name} Class-wise Evaluation Results ---")
    if not numerical_predictions:
        print("No valid predictions to evaluate class-wise.")
        return None

    # zero_division=0 avoids warnings when a class has no predicted or true samples.
    precision = precision_score(eval_labels, numerical_predictions, labels=class_ids, average=None, zero_division=0)
    recall = recall_score(eval_labels, numerical_predictions, labels=class_ids, average=None, zero_division=0)
    f1 = f1_score(eval_labels, numerical_predictions, labels=class_ids, average=None, zero_division=0)

    for class_id, label_name in zip(class_ids, label_names):
        print(f"  {label_name}:")
        print(f"    Precision: {precision[class_id]:.4f}")
        print(f"    Recall: {recall[class_id]:.4f}")
        print(f"    F1-score: {f1[class_id]:.4f}")
    return precision, recall, f1


# --- Evaluate 1-Shot Class-wise Performance ---
one_shot_classwise_metrics = _report_classwise_metrics("1-Shot", one_shot_eval_labels, one_shot_numerical_predictions)
if one_shot_classwise_metrics is not None:
    one_shot_precision_classwise, one_shot_recall_classwise, one_shot_f1_classwise = one_shot_classwise_metrics

# --- Evaluate 2-Shot Class-wise Performance ---
two_shot_classwise_metrics = _report_classwise_metrics("2-Shot", two_shot_eval_labels, two_shot_numerical_predictions)
if two_shot_classwise_metrics is not None:
    two_shot_precision_classwise, two_shot_recall_classwise, two_shot_f1_classwise = two_shot_classwise_metrics

# --- Evaluate 3-Shot Class-wise Performance ---
three_shot_classwise_metrics = _report_classwise_metrics("3-Shot", three_shot_eval_labels, three_shot_numerical_predictions)
if three_shot_classwise_metrics is not None:
    three_shot_precision_classwise, three_shot_recall_classwise, three_shot_f1_classwise = three_shot_classwise_metrics


--- 1-Shot Class-wise Evaluation Results ---
  Irrelevant:
    Precision: 0.9947
    Recall: 0.1958
    F1-score: 0.3272
  Relevant:
    Precision: 0.0469
    Recall: 0.9744
    F1-score: 0.0895

--- 2-Shot Class-wise Evaluation Results ---
  Irrelevant:
    Precision: 1.0000
    Recall: 0.2367
    F1-score: 0.3828
  Relevant:
    Precision: 0.0515
    Recall: 1.0000
    F1-score: 0.0979

--- 3-Shot Class-wise Evaluation Results ---
  Irrelevant:
    Precision: 0.9908
    Recall: 0.2240
    F1-score: 0.3653
  Relevant:
    Precision: 0.0473
    Recall: 0.9487
    F1-score: 0.0901


## Experiment Details

This section outlines the methodology employed for conducting a few-shot learning experiment to classify sentences from privacy policies as 'Relevant' or 'Irrelevant'.

**Task:** The primary objective was to set up and execute a 1, 2, and 3-shot learning experiment using a pre-loaded language model (`gpt-4.1-nano-2025-04-14`) on a test dataset. A crucial aspect of this experiment was the inclusion of information regarding class imbalance within the prompts to potentially mitigate its impact on model performance. The performance for each shot number was subsequently evaluated using standard classification metrics.

**Model:** The experiment utilized the `gpt-4.1-nano-2025-04-14` model accessed via the OpenAI API.

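**Dataset:** The experiment was conducted on the test dataset loaded from `policy_test_data.csv` (referred to as `test_dataset`), in which each sentence (`Segment`) carries an `Any_Relevant` label of either 'Relevant' or 'Irrelevant'. The few-shot examples were drawn from the training data loaded from `policy_train_data.csv`.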

**Few-Shot Setup:**
A single pool of few-shot examples was randomly selected once from the labeled training examples (`relevant_examples` and `irrelevant_examples`): three 'Relevant' and three 'Irrelevant' sentences. For each shot number (1, 2, and 3), the prompt used the first examples from this pool, aiming to include an equal number of 'Relevant' and 'Irrelevant' examples where possible; for odd shot numbers the extra example is always 'Relevant' (so the 1-shot prompt contains a single 'Relevant' example and no 'Irrelevant' one). These selected examples were used to guide the model's predictions for unseen test sentences.

**Prompt Construction:**
Prompts were constructed for each test data point. Each prompt included:
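1.  An instruction placed at the start of the prompt (sent as part of the user message rather than as a separate system message): "Act as a legal expert and classify the following sentences as 'Relevant' or 'Irrelevant' to privacy policies."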
2.  The selected few-shot examples, formatted as "Sentence: [sentence]\nLabel: [label]".
3.  A note explicitly stating the class imbalance in the dataset: "Note: The dataset has a significant class imbalance, with many more 'Irrelevant' examples than 'Relevant' ones."
4.  The test sentence to be classified, formatted as "Sentence: [test sentence]\nLabel:".

The prompts were constructed without explicit tokenization, relying on the model's inherent processing capabilities.

**Experiment Procedure:**
For each shot number (1, 2, and 3), the following steps were performed on a subset of the test dataset (1000 examples for evaluation purposes):
1.  A prompt was constructed for each example in the test subset using the `construct_few_shot_prompt` function, incorporating the selected few-shot examples and the class imbalance note.
2.  The constructed prompt was sent to the `gpt-4.1-nano-2025-04-14` model via the OpenAI API (`openai.chat.completions.create`).
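3.  The predicted label was extracted from the model's response by stripping surrounding whitespace. The expected response format was the predicted label word alone ('Relevant' or 'Irrelevant'), although some responses were observed to include a 'Label:' prefix (e.g., 'Label: Relevant').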
4.  The predicted labels for all examples in the subset were collected for each shot number.

**Evaluation:**
The performance of the model for each shot number was evaluated using the following metrics:
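-   **Precision:** For a given class, the ratio of correctly predicted instances of that class to the total instances predicted as that class (e.g., correctly predicted 'Relevant' instances over all predicted 'Relevant' instances).
-   **Recall:** For a given class, the ratio of correctly predicted instances of that class to the total actual instances of that class (e.g., correctly predicted 'Relevant' instances over all actual 'Relevant' instances).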
-   **F1-score:** The harmonic mean of Precision and Recall, providing a balanced measure of the model's performance.
-   **Accuracy:** The ratio of correctly predicted instances (both 'Relevant' and 'Irrelevant') to the total number of instances.

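The ground truth labels from the test subset were converted to a numerical format (0 for 'Irrelevant', 1 for 'Relevant') for metric computation. Predicted labels were also converted to the same numerical format, with unexpected prediction formats being noted and excluded from the evaluation. Weighted averaging (each class's score weighted by its number of true instances) was used for the overall Precision, Recall, and F1-score to account for the class imbalance in the evaluation subset. In addition, unaveraged class-wise Precision, Recall, and F1-score were reported separately for the 'Irrelevant' and 'Relevant' classes.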

The evaluation results for each shot number (1, 2, and 3) were computed and reported.