In [None]:
import json
import os

data_dir = 'PLUE/PLUE-main/data/policyqa' # Ensure this path is correct and accessible

train_file_path = os.path.join(data_dir, 'train.json')

try:
    with open(train_file_path, 'r') as f:
        data = json.load(f)

    # Assuming the relevant data is in a list under the key 'data'
    nested_data = data.get('data', [])

    print(f"Successfully loaded data from {train_file_path}")
    print(f"Total items found in 'data' field: {len(nested_data)}")

    if nested_data:
        print("\nStructure and sample of the first item in 'data':")
        # Print the full first item to see its structure
        print(json.dumps(nested_data[0], indent=2))

        # Optionally, print a few more items
        # print("\nSample of the next 2 items in 'data':")
        # for i in range(1, min(3, len(nested_data))):
        #     print(f"\nItem {i+1}:")
        #     print(json.dumps(nested_data[i], indent=2))
    else:
        print("The 'data' field is empty or not found in the JSON.")

except FileNotFoundError:
    print(f"Error: File not found at {train_file_path}. Please check the data_dir path.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {train_file_path}. Please check the file content.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded data from PLUE/PLUE-main/data/policyqa\train.json
Total items found in 'data' field: 75

Structure and sample of the first item in 'data':
{
  "title": "sidearmsports.com",
  "paragraphs": [
    {
      "qas": [
        {
          "question": "How do they collect information about users?",
          "type": "First Party Collection/Use|||Collection Mode|||Explicit",
          "id": "qr0541estkuqixbp",
          "answers": [
            {
              "text": "personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and",
              "answer_start": 90
            },
            {
              "text": "you supply",
              "answer_start": 111
            },
            {
              "text": "personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email

In [None]:
import json
import os

data_dir = 'PLUE/PLUE-main/data/policyqa' # Ensure this path is correct and accessible
output_dir = '/content/openai_finetuning_data'
os.makedirs(output_dir, exist_ok=True)

def process_json_file(file_path, output_path):
    """Loads data from a nested JSON file and formats it for OpenAI chat model fine-tuning."""
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Assuming the relevant data is in a list under the key 'data'
    nested_data = data.get('data', [])
    print(f"Processing {len(nested_data)} items from {file_path}")

    with open(output_path, 'w') as outfile:
        for item in nested_data:
            # Each item in nested_data corresponds to one policy document (or similar)
            title = item.get('title', 'N/A')
            paragraphs = item.get('paragraphs', [])

            for paragraph in paragraphs:
                context = paragraph.get('context', '')
                qas = paragraph.get('qas', [])

                for qa in qas:
                    question = qa.get('question', '')
                    answers = qa.get('answers', [])

                    # We will create one training example per question-answer pair.
                    # Format: User asks question based on context, Assistant provides answer(s).

                    messages = []

                    # Add a system message to set the behavior of the model
                    messages.append({"role": "system", "content": "You are a helpful assistant that answers questions accurately based on the provided legal policy context."})

                    # User message: Combine context and question
                    user_content = f"Context: {context}\n\nQuestion: {question}"
                    messages.append({"role": "user", "content": user_content})

                    # Assistant message: Combine answers. Assuming answers is a list of dicts with 'text' key.
                    # If there are multiple answers, you might want to format them clearly.
                    assistant_answers = [ans.get('text', '') for ans in answers if ans.get('text')]
                    assistant_content = " ".join(assistant_answers) # Join multiple answers

                    # Add the assistant message only if there are answers
                    if assistant_content:
                         messages.append({"role": "assistant", "content": assistant_content})
                    else:
                        # If there are no answers, you might choose to skip this example
                        # or provide a default assistant response. Skipping for now.
                        print(f"Skipping QA pair with no answers in {file_path}: Question - {question}")
                        continue # Skip to the next QA pair

                    # Each line in the JSONL must be a JSON object with a "messages" key
                    openai_format = {"messages": messages}

                    # Write the formatted example to the output file
                    outfile.write(json.dumps(openai_format) + '\n')

    print(f"Formatted data saved to {output_dir}. JSONL files now include a system message.")

# Process train, test, and dev files
# Ensure the data_dir is correct and accessible in your environment
process_json_file(f'{data_dir}/train.json', f'{output_dir}/train.jsonl')
process_json_file(f'{data_dir}/test.json', f'{output_dir}/test.jsonl')
process_json_file(f'{data_dir}/dev.json', f'{output_dir}/dev.jsonl')

Processing 75 items from PLUE/PLUE-main/data/policyqa/train.json
Formatted data saved to /content/openai_finetuning_data. JSONL files now include a system message.
Processing 20 items from PLUE/PLUE-main/data/policyqa/test.json
Formatted data saved to /content/openai_finetuning_data. JSONL files now include a system message.
Processing 20 items from PLUE/PLUE-main/data/policyqa/dev.json
Formatted data saved to /content/openai_finetuning_data. JSONL files now include a system message.


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Define the paths to your newly prepared data files
train_file_path = '/content/openai_finetuning_data/train.jsonl'
test_file_path = '/content/openai_finetuning_data/test.jsonl'
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Upload the training file
print(f"Uploading training file: {train_file_path}")
train_upload_response = openai.files.create(
  file=open(train_file_path, "rb"),
  purpose="fine-tune"
)
train_file_id = train_upload_response.id
print(f"Training file uploaded successfully with ID: {train_file_id}")

# Upload the test file
print(f"Uploading test file: {test_file_path}")
test_upload_response = openai.files.create(
  file=open(test_file_path, "rb"),
  purpose="fine-tune" # Can also be 'eval' if supported for the model
)
test_file_id = test_upload_response.id
print(f"Test file uploaded successfully with ID: {test_file_id}")

# Upload the dev file
print(f"Uploading dev file: {dev_file_path}")
dev_upload_response = openai.files.create(
  file=open(dev_file_path, "rb"),
  purpose="fine-tune" # Can also be 'eval' if supported for the model
)
dev_file_id = dev_upload_response.id
print(f"Dev file uploaded successfully with ID: {dev_file_id}")

# You can now use the new file_id(s) in the next step to initiate fine-tuning
print("\nNew File IDs for fine-tuning:")
print(f"Train file ID: {train_file_id}")
print(f"Test file ID: {test_file_id}")
print(f"Dev file ID: {dev_file_id}")

Uploading training file: /content/openai_finetuning_data/train.jsonl
Training file uploaded successfully with ID: file-AzfmMjKEQ6CRapZMEYWtfg
Uploading test file: /content/openai_finetuning_data/test.jsonl
Test file uploaded successfully with ID: file-9AFd7KumY1pz13ve18gqMJ
Uploading dev file: /content/openai_finetuning_data/dev.jsonl
Dev file uploaded successfully with ID: file-Qex4nqL8JbPrb5zmL3Zta7

New File IDs for fine-tuning:
Train file ID: file-AzfmMjKEQ6CRapZMEYWtfg
Test file ID: file-9AFd7KumY1pz13ve18gqMJ
Dev file ID: file-Qex4nqL8JbPrb5zmL3Zta7


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Your NEW file IDs from the previous successful upload step
train_file_id = "file-AzfmMjKEQ6CRapZMEYWtfg"
dev_file_id = "file-Qex4nqL8JbPrb5zmL3Zta7" # Optional, remove if not using dev set for validation

print(f"Initiating fine-tuning job with training file ID: {train_file_id}")
if dev_file_id: # Check if dev_file_id is not empty or None
    print(f"Using validation file ID: {dev_file_id}")

try:
    fine_tuning_job = openai.fine_tuning.jobs.create(
      # Choose a supported base model. gpt-3.5-turbo is a common choice for fine-tuning.
      # Replace with your desired supported model name if different
      model="gpt-4.1-nano-2025-04-14",
      training_file=train_file_id,
      validation_file=dev_file_id # Optional: include if you uploaded a validation file
    )

    print(f"Fine-tuning job created successfully!")
    print(f"Job ID: {fine_tuning_job.id}")
    print(f"Status: {fine_tuning_job.status}")

    # You can use the Job ID to monitor the status or cancel the job later.
    print("\nTo monitor the job status, you can use the following code (make sure API key is set):")
    print(f"# job_id = \"{fine_tuning_job.id}\"")
    print("# # Then use the monitoring code from earlier")

except Exception as e:
    print(f"Error initiating fine-tuning job: {e}")

Initiating fine-tuning job with training file ID: file-AzfmMjKEQ6CRapZMEYWtfg
Using validation file ID: file-Qex4nqL8JbPrb5zmL3Zta7
Fine-tuning job created successfully!
Job ID: ftjob-9p328X67hE32QJPsDuIHMA1y
Status: validating_files

To monitor the job status, you can use the following code (make sure API key is set):
# job_id = "ftjob-9p328X67hE32QJPsDuIHMA1y"
# # Then use the monitoring code from earlier


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Replace with your fine-tuning job ID
job_id = "ftjob-9p328X67hE32QJPsDuIHMA1y"

print(f"Checking status for fine-tuning job: {job_id}")

try:
    fine_tuning_job_status = openai.fine_tuning.jobs.retrieve(job_id)

    print(f"Job ID: {fine_tuning_job_status.id}")
    print(f"Status: {fine_tuning_job_status.status}")
    if fine_tuning_job_status.fine_tuned_model:
         print(f"Fine-tuned model (if succeeded): {fine_tuning_job_status.fine_tuned_model}")
    if fine_tuning_job_status.error:
        print("\nError Details (if failed):")
        print(f"  Code: {fine_tuning_job_status.error.code}")
        print(f"  Message: {fine_tuning_job_status.error.message}")
        # print(f"  Param: {fine_tuning_job_status.error.param}") # Uncomment if needed

except Exception as e:
    print(f"Error retrieving job status: {e}")

Checking status for fine-tuning job: ftjob-9p328X67hE32QJPsDuIHMA1y
Job ID: ftjob-9p328X67hE32QJPsDuIHMA1y
Status: succeeded
Fine-tuned model (if succeeded): ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW

Error Details (if failed):
  Code: None
  Message: None


In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY')

# Replace with the name of your fine-tuned QA model from the successful job output (cell ea3c9a41)
fine_tuned_qa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW"

# Path to the development dataset (ensure this path is correct)
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Function to normalize text for F1 and EM calculation
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Load the development data
dev_data = []
try:
    with open(dev_file_path, 'r') as f:
        for line in f:
            dev_data.append(json.loads(line))
    print(f"Loaded {len(dev_data)} examples from {dev_file_path}")
except FileNotFoundError:
    print(f"Error: Dev file not found at {dev_file_path}. Please check the path.")
    dev_data = None
except Exception as e:
    print(f"An error occurred loading dev data: {e}")
    dev_data = None


if dev_data:
    total_f1 = 0
    total_em = 0
    evaluation_count = 0
    progress_interval = 100 # Print progress every 100 examples

    print(f"\nEvaluating fine-tuned model '{fine_tuned_qa_model_name}' on the dev set...")

    # Note: Evaluating large datasets can be time-consuming and incur API costs.
    # Consider evaluating on a smaller subset first if needed.
    # For demonstration, let's evaluate on the first 10 examples:
    # dev_data_subset = dev_data[:10]
    # print(f"Evaluating on a subset of {len(dev_data_subset)} examples.")
    # data_to_evaluate = dev_data_subset
    data_to_evaluate = dev_data

    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            # print(f"Skipping example with no messages: {example}") # Optional: uncomment for detailed skipping
            continue

        # Find the user message and the assistant message (ground truth)
        user_message_content = ""
        ground_truth_answer = ""
        conversation_for_inference = []

        for msg in messages:
            if msg.get('role') == 'user':
                user_message_content = msg.get('content', '')
                conversation_for_inference.append({"role": "user", "content": user_message_content})
            elif msg.get('role') == 'assistant':
                ground_truth_answer = msg.get('content', '')
                # Only include messages up to the user turn for inference
                # If you include assistant turn, the model just sees the answer in prompt
                # conversation_for_inference.append({"role": "assistant", "content": ground_truth_answer})


        if user_message_content and ground_truth_answer:
            try:
                # Use the fine-tuned model for inference
                response = openai.chat.completions.create(
                    model=fine_tuned_qa_model_name,
                    messages=conversation_for_inference,
                    max_tokens=150 # Adjust max_tokens as needed for typical answer length
                )

                # Extract the model's predicted answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_answer = response.choices[0].message.content.strip()

                    # Calculate metrics
                    f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                    total_f1 += f1
                    total_em += em
                    evaluation_count += 1

                    # Print progress
                    if (i + 1) % progress_interval == 0:
                        print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")

                    # Optional: Print individual results
                    # print(f"\nUser: {user_message_content[:100]}...")
                    # print(f"Ground Truth: {ground_truth_answer[:100]}...")
                    # print(f"Prediction: {predicted_answer[:100]}...")
                    # print(f"F1: {f1:.4f}, EM: {em:.4f}")

                else:
                    # print(f"Skipping example due to empty model response: {conversation_for_inference}") # Optional: uncomment for details
                    pass # Skip examples with no valid prediction

            except Exception as e:
                print(f"Error during inference or evaluation for example {i + 1}: {e}")
                # print(f"Example data: {conversation_for_inference}") # Optional: uncomment for details
                continue # Continue to the next example even if one fails

    # Calculate average metrics
    if evaluation_count > 0:
        average_f1 = total_f1 / evaluation_count
        average_em = total_em / evaluation_count
        print(f"\n--- Evaluation Results ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Average F1 Score: {average_f1:.4f}")
        print(f"Average Exact Match Score: {average_em:.4f}")
        print(f"-------------------------")
    else:
        print("\nNo examples were successfully evaluated.")

Loaded 3809 examples from /content/openai_finetuning_data/dev.jsonl

Evaluating fine-tuned model 'ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW' on the dev set...
Processed 100/3809 examples...
Processed 200/3809 examples...
Processed 300/3809 examples...
Processed 400/3809 examples...
Processed 500/3809 examples...
Processed 600/3809 examples...
Processed 700/3809 examples...
Processed 800/3809 examples...
Processed 900/3809 examples...
Processed 1000/3809 examples...
Processed 1100/3809 examples...
Processed 1200/3809 examples...
Processed 1300/3809 examples...
Processed 1400/3809 examples...
Processed 1500/3809 examples...
Processed 1600/3809 examples...
Processed 1700/3809 examples...
Processed 1800/3809 examples...
Processed 1900/3809 examples...
Processed 2000/3809 examples...
Processed 2100/3809 examples...
Processed 2200/3809 examples...
Processed 2300/3809 examples...
Processed 2400/3809 examples...
Processed 2500/3809 examples...
Processed 2600/3809 examples...
Processed 2700

In [None]:
import json
import os

input_dir = '/content/openai_finetuning_data_classification'
output_dir = '/content/openai_finetuning_data_classification_chat_format' # New directory for chat formatted data
os.makedirs(output_dir, exist_ok=True)

train_input_path = os.path.join(input_dir, 'train_classification.jsonl')
test_input_path = os.path.join(input_dir, 'test_classification.jsonl')

train_output_path = os.path.join(output_dir, 'train_classification_chat_format.jsonl')
test_output_path = os.path.join(output_dir, 'test_classification_chat_format.jsonl')

def reformat_to_chat_format(input_path, output_path):
    """Reads prompt/completion JSONL and reformats to chat message JSONL."""
    print(f"Reading from {input_path} and writing to {output_path}")
    processed_count = 0
    skipped_count = 0

    with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
        for line in infile:
            try:
                example = json.loads(line)
                prompt = example.get('prompt', '')
                completion = example.get('completion', '').strip() # Get completion and strip leading/trailing whitespace

                if not prompt or not completion:
                    print(f"Skipping line due to missing prompt or completion: {line.strip()}")
                    skipped_count += 1
                    continue # Skip examples without both prompt and completion

                # Create messages list in chat format
                messages = []

                # Add a system message relevant to classification of legal text
                messages.append({"role": "system", "content": "Classify the following legal text as relevant or irrelevant."})

                # User message: Present the text to be classified
                messages.append({"role": "user", "content": prompt})

                # Assistant message: Provide the classification label as the response
                messages.append({"role": "assistant", "content": completion}) # Use the stripped completion as the label

                # Write the new chat formatted example
                openai_format = {"messages": messages}
                outfile.write(json.dumps(openai_format) + '\n')
                processed_count += 1

            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line.strip()}")
                skipped_count += 1
            except Exception as e:
                print(f"Skipping line due to error: {e} - Line: {line.strip()}")
                skipped_count += 1

    print(f"Finished processing {input_path}. Processed {processed_count} examples, skipped {skipped_count}.")

# Reformat training and testing data
reformat_to_chat_format(train_input_path, train_output_path)
reformat_to_chat_format(test_input_path, test_output_path)

print(f"\nClassification data reformatted to chat format and saved to {output_dir}")

Reading from /content/openai_finetuning_data_classification\train_classification.jsonl and writing to /content/openai_finetuning_data_classification_chat_format\train_classification_chat_format.jsonl
Finished processing /content/openai_finetuning_data_classification\train_classification.jsonl. Processed 185200 examples, skipped 0.
Reading from /content/openai_finetuning_data_classification\test_classification.jsonl and writing to /content/openai_finetuning_data_classification_chat_format\test_classification_chat_format.jsonl
Finished processing /content/openai_finetuning_data_classification\test_classification.jsonl. Processed 62150 examples, skipped 0.

Classification data reformatted to chat format and saved to /content/openai_finetuning_data_classification_chat_format


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Define the paths to your newly prepared chat-formatted classification data files
train_classification_chat_file_path = '/content/openai_finetuning_data_classification_chat_format/train_classification_chat_format.jsonl'
test_classification_chat_file_path = '/content/openai_finetuning_data_classification_chat_format/test_classification_chat_format.jsonl'

# Upload the training file for classification (chat format)
print(f"Uploading chat-formatted classification training file: {train_classification_chat_file_path}")
train_classification_chat_upload_response = openai.files.create(
  file=open(train_classification_chat_file_path, "rb"),
  purpose="fine-tune"
)
train_classification_chat_file_id = train_classification_chat_upload_response.id
print(f"Chat-formatted classification training file uploaded successfully with ID: {train_classification_chat_file_id}")

# Upload the test file for classification (chat format - optional, good for validation)
print(f"Uploading chat-formatted classification test file: {test_classification_chat_file_path}")
test_classification_chat_upload_response = openai.files.create(
  file=open(test_classification_chat_file_path, "rb"),
  purpose="fine-tune" # Can also be 'eval' if supported for the model
)
test_classification_chat_file_id = test_classification_chat_upload_response.id
print(f"Chat-formatted classification test file uploaded successfully with ID: {test_classification_chat_file_id}")


# You can now use the new file_id(s) in the next step to initiate fine-tuning
print("\nNew File IDs for chat-formatted classification fine-tuning:")
print(f"Train chat-formatted classification file ID: {train_classification_chat_file_id}")
print(f"Test chat-formatted classification file ID: {test_classification_chat_file_id}")

Uploading chat-formatted classification training file: /content/openai_finetuning_data_classification_chat_format/train_classification_chat_format.jsonl
Chat-formatted classification training file uploaded successfully with ID: file-871meGrcx4jZ4MTSBQvqfX
Uploading chat-formatted classification test file: /content/openai_finetuning_data_classification_chat_format/test_classification_chat_format.jsonl
Chat-formatted classification test file uploaded successfully with ID: file-LqkijrjdCYKm8Dofjkfhaz

New File IDs for chat-formatted classification fine-tuning:
Train chat-formatted classification file ID: file-871meGrcx4jZ4MTSBQvqfX
Test chat-formatted classification file ID: file-LqkijrjdCYKm8Dofjkfhaz


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Your NEW file IDs for chat-formatted classification fine-tuning
train_classification_chat_file_id = "file-871meGrcx4jZ4MTSBQvqfX"
test_classification_chat_file_id = "file-LqkijrjdCYKm8Dofjkfhaz" # Optional, remove if not using test set for validation

print(f"Initiating chat-formatted classification fine-tuning job with training file ID: {train_classification_chat_file_id}")
if test_classification_chat_file_id: # Check if test_classification_chat_file_id is not empty or None
    print(f"Using validation file ID: {test_classification_chat_file_id}")

try:
    fine_tuning_job_classification_chat = openai.fine_tuning.jobs.create(
      # Choose a supported chat model (e.g., gpt-3.5-turbo, gpt-4.1-nano-2025-04-14 if supported for FT)
      model="gpt-4.1-nano-2025-04-14",
      training_file=train_classification_chat_file_id,
      validation_file=test_classification_chat_file_id # Optional: include if you uploaded a validation file
    )

    print(f"Chat-formatted classification fine-tuning job created successfully!")
    print(f"Job ID: {fine_tuning_job_classification_chat.id}")
    print(f"Status: {fine_tuning_job_classification_chat.status}")

    # You can use the Job ID to monitor the status or cancel the job later.
    print("\nTo monitor the chat-formatted classification job status, you can use the following code (make sure API key is set):")
    print(f"# job_id_classification_chat = \"{fine_tuning_job_classification_chat.id}\"")
    print("# # Then use the monitoring code from earlier")

except Exception as e:
    print(f"Error initiating chat-formatted classification fine-tuning job: {e}")

Initiating chat-formatted classification fine-tuning job with training file ID: file-871meGrcx4jZ4MTSBQvqfX
Using validation file ID: file-LqkijrjdCYKm8Dofjkfhaz
Chat-formatted classification fine-tuning job created successfully!
Job ID: ftjob-W2hRPXlkXD6cXRWssqgF4UVs
Status: validating_files

To monitor the chat-formatted classification job status, you can use the following code (make sure API key is set):
# job_id_classification_chat = "ftjob-W2hRPXlkXD6cXRWssqgF4UVs"
# # Then use the monitoring code from earlier


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Replace with your chat-formatted classification fine-tuning job ID
job_id_classification_chat = "ftjob-W2hRPXlkXD6cXRWssqgF4UVs"

print(f"Checking status for chat-formatted classification fine-tuning job: {job_id_classification_chat}")

try:
    fine_tuning_job_status = openai.fine_tuning.jobs.retrieve(job_id_classification_chat)

    print(f"Job ID: {fine_tuning_job_status.id}")
    print(f"Status: {fine_tuning_job_status.status}")
    if fine_tuning_job_status.fine_tuned_model:
         print(f"Fine-tuned model (if succeeded): {fine_tuning_job_status.fine_tuned_model}")
    if fine_tuning_job_status.error:
        print("\nError Details (if failed):")
        print(f"  Code: {fine_tuning_job_status.error.code}")
        print(f"  Message: {fine_tuning_job_status.error.message}")
        # print(f"  Param: {fine_tuning_job_status.error.param}") # Uncomment if needed

except Exception as e:
    print(f"Error retrieving job status: {e}")

Checking status for chat-formatted classification fine-tuning job: ftjob-W2hRPXlkXD6cXRWssqgF4UVs
Job ID: ftjob-W2hRPXlkXD6cXRWssqgF4UVs
Status: succeeded
Fine-tuned model (if succeeded): ft:gpt-4.1-nano-2025-04-14:personal::BzIXr0Db

Error Details (if failed):
  Code: None
  Message: None


In [None]:
import openai
import os
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY')

# Replace with the name of your fine-tuned classification model from the successful job output
# (Check the status of job ftjob-W2hRPXlkXD6cXRWssqgF4UVs using cell 088a70e1)
fine_tuned_classification_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzIXr0Db"

# Path to the development dataset for classification (chat format)
# Use the path to the test set JSONL as it was uploaded as validation/test data
dev_classification_file_path = '/content/openai_finetuning_data_classification_chat_format/test_classification_chat_format.jsonl'


# Load the development data
dev_data_classification = []
try:
    with open(dev_classification_file_path, 'r') as f:
        for line in f:
            dev_data_classification.append(json.loads(line))
    print(f"Loaded {len(dev_data_classification)} examples from {dev_classification_file_path}")
except FileNotFoundError:
    print(f"Error: Dev classification file not found at {dev_classification_file_path}. Cannot perform evaluation.")
    dev_data_classification = None
except Exception as e:
    print(f"An error occurred loading dev classification data: {e}")
    dev_data_classification = None


if dev_data_classification and fine_tuned_classification_model_name != "":
    ground_truth_labels = []
    predicted_labels = []
    evaluation_count = 0
    progress_interval = 100 # Print progress every 100 examples

    print(f"\nEvaluating fine-tuned classification model '{fine_tuned_classification_model_name}' on the test set...")

    # Evaluate on all examples in the dev set
    data_to_evaluate = dev_data_classification

    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            continue

        # The last message in the training data is the assistant's response (the label)
        # For inference, we only send messages up to the user's turn
        conversation_for_inference = [msg for msg in messages if msg.get('role') != 'assistant']

        # Extract the ground truth label from the assistant's message
        ground_truth_label = ""
        for msg in messages:
            if msg.get('role') == 'assistant':
                ground_truth_label = msg.get('content', '').strip()
                break # Assuming only one assistant turn at the end


        if conversation_for_inference and ground_truth_label:
            try:
                # Use the fine-tuned model for inference
                response = openai.chat.completions.create(
                    model=fine_tuned_classification_model_name,
                    messages=conversation_for_inference,
                    max_tokens=10, # Classification labels are short
                    temperature=0.0 # Use deterministic sampling for classification
                )

                # Extract the model's predicted label
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_label = response.choices[0].message.content.strip()

                    ground_truth_labels.append(ground_truth_label)
                    predicted_labels.append(predicted_label)
                    evaluation_count += 1

                    # Print progress
                    if (i + 1) % progress_interval == 0:
                        print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")

                else:
                    # print(f"Skipping example due to empty model response: {conversation_for_inference}") # Optional: uncomment for details
                    pass # Skip examples with no valid prediction

            except Exception as e:
                print(f"Error during inference or evaluation for example {i + 1}: {e}")
                # print(f"Example data: {conversation_for_inference}") # Optional: uncomment for details
                continue # Continue to the next example even if one fails

    # Calculate classification metrics
    if evaluation_count > 0:
        # Ensure labels are consistent strings before calculating metrics
        # You might need to map labels if they are not exactly "Relevant" and "Irrelevant" or similar
        # For example, if your labels are '0' and '1'
        # ground_truth_labels = [str(label) for label in ground_truth_labels]
        # predicted_labels = [str(label) for label in predicted_labels]

        # Simple Accuracy
        accuracy = accuracy_score(ground_truth_labels, predicted_labels)

        # Precision, Recall, F1 (requires specifying positive label for binary)
        # Assuming 'Relevant' is the positive class, adjust if needed
        # You might need error_on_invalid = 'ignore' or similar depending on labels present
        try:
            precision = precision_score(ground_truth_labels, predicted_labels, pos_label='Relevant', average='binary', zero_division=0)
            recall = recall_score(ground_truth_labels, predicted_labels, pos_label='Relevant', average='binary', zero_division=0)
            f1 = f1_score(ground_truth_labels, predicted_labels, pos_label='Relevant', average='binary', zero_division=0)
        except ValueError as e:
            print(f"Could not calculate precision, recall, F1. Check labels: {e}")
            print(f"Sample Ground Truth Labels: {ground_truth_labels[:10]}")
            print(f"Sample Predicted Labels: {predicted_labels[:10]}")
            precision, recall, f1 = None, None, None


        print(f"\n--- Classification Evaluation Results ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Accuracy: {accuracy:.4f}")
        if precision is not None:
            print(f"Precision (Relevant): {precision:.4f}")
            print(f"Recall (Relevant): {recall:.4f}")
            print(f"F1 Score (Relevant): {f1:.4f}")
        print(f"---------------------------------------")
    else:
        print("\nNo examples were successfully evaluated for classification.")


Loaded 62150 examples from /content/openai_finetuning_data_classification_chat_format/test_classification_chat_format.jsonl

Evaluating fine-tuned classification model 'ft:gpt-4.1-nano-2025-04-14:personal::BzIXr0Db' on the test set...
Processed 100/62150 examples...
Processed 200/62150 examples...
Processed 300/62150 examples...
Processed 400/62150 examples...
Processed 500/62150 examples...
Processed 600/62150 examples...
Processed 700/62150 examples...
Processed 800/62150 examples...
Processed 900/62150 examples...
Processed 1000/62150 examples...
Processed 1100/62150 examples...
Processed 1200/62150 examples...
Processed 1300/62150 examples...
Processed 1400/62150 examples...
Processed 1500/62150 examples...
Processed 1600/62150 examples...
Processed 1700/62150 examples...
Processed 1800/62150 examples...
Processed 1900/62150 examples...
Processed 2000/62150 examples...
Processed 2100/62150 examples...
Processed 2200/62150 examples...
Processed 2300/62150 examples...
Processed 2400/

# Task
Implement a Retrieval Augmented Generation (RAG) system using the policy documents located in "PLUE/PLUE-main/data/policyqa" as the knowledge base. The system should be able to answer questions based on the content of these documents.

## Load and prepare documents

### Subtask:
Load the policy documents from the specified directory and extract the text content.


**Reasoning**:
The goal is to extract the text content from the policy documents. The data is already loaded as a nested dictionary in the `data` variable. I need to iterate through the nested structure to get the 'context' from each paragraph and store it in a list.



In [None]:
# Initialize a list to store the extracted document text
document_texts = []

# Iterate through the 'data' list in the loaded JSON
nested_data = data.get('data', [])
for item in nested_data:
    paragraphs = item.get('paragraphs', [])
    for paragraph in paragraphs:
        context = paragraph.get('context', '')
        if context: # Only add if context is not empty
            document_texts.append(context)

# Print some information to verify
print(f"Extracted text from {len(document_texts)} paragraphs.")
if document_texts:
    print("\nSample of the first extracted text:")
    print(document_texts[0][:500] + "...") # Print the first 500 characters of the first document

Extracted text from 2189 paragraphs.

Sample of the first extracted text:
INFORMATION WE COLLECT ABOUT YOU When you interact with SIDEARM Services, we collect: (1) personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and (2) non-personal information transmitted through technology, including tracking information, which is also collected by third parties....


## Index documents

### Subtask:
Index your policy document text for efficient retrieval. This involves splitting documents into smaller, manageable chunks and creating numerical representations (embeddings) for these chunks.


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# 1. Split the document texts into smaller chunks
# Using RecursiveCharacterTextSplitter with a chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Adjust based on what seems like a relevant chunk size
    chunk_overlap=100 # Adjust overlap to maintain context between chunks
)

document_chunks = text_splitter.create_documents(document_texts)

print(f"Created {len(document_chunks)} chunks from the policy documents.")
if document_chunks:
    print("\nSample of the first document chunk:")
    print(document_chunks[0].page_content[:500] + "...") # Print the first 500 characters of the first chunk

# 2. Generate embeddings for each document chunk
# Using a pre-trained sentence-transformer model
# Make sure to have 'sentence-transformers' installed
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # A good general-purpose model
    print(f"\nLoaded embedding model: {embedding_model.model_name}")

    # Generate embeddings for all chunks
    chunk_texts = [chunk.page_content for chunk in document_chunks]
    chunk_embeddings = embedding_model.encode(chunk_texts)

    print(f"Generated embeddings for {len(chunk_embeddings)} chunks. Embedding dimension: {chunk_embeddings.shape[1]}")

    # 3. Store the document chunks and their embeddings
    # Storing in a simple list of dictionaries for now
    # In a real application, a dedicated vector database would be used
    indexed_documents = []
    for i, chunk in enumerate(document_chunks):
        indexed_documents.append({
            "text": chunk.page_content,
            "embedding": chunk_embeddings[i].tolist() # Convert numpy array to list for storage
        })

    print(f"Stored {len(indexed_documents)} document chunks and their embeddings.")

    # 4. Confirm embeddings have been generated and stored
    if indexed_documents and 'embedding' in indexed_documents[0]:
        print("\nConfirmed: Embeddings generated and stored successfully.")
    else:
        print("\nWarning: Embeddings might not have been generated or stored correctly.")

except ImportError:
    print("\nError: 'sentence-transformers' library not found. Please install it ('pip install sentence-transformers').")
except Exception as e:
    print(f"\nAn error occurred during embedding or storing: {e}")


Created 3252 chunks from the policy documents.

Sample of the first document chunk:
INFORMATION WE COLLECT ABOUT YOU When you interact with SIDEARM Services, we collect: (1) personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and (2) non-personal information transmitted through technology, including tracking information, which is also collected by third parties....


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


An error occurred during embedding or storing: 'SentenceTransformer' object has no attribute 'model_name'


**Reasoning**:
The previous step failed because the `SentenceTransformer` object does not have a `model_name` attribute. I need to fix the code to correctly print the loaded model information and proceed with generating and storing the embeddings.



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# 1. Split the document texts into smaller chunks
# Using RecursiveCharacterTextSplitter with a chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Adjust based on what seems like a relevant chunk size
    chunk_overlap=100 # Adjust overlap to maintain context between chunks
)

document_chunks = text_splitter.create_documents(document_texts)

print(f"Created {len(document_chunks)} chunks from the policy documents.")
if document_chunks:
    print("\nSample of the first document chunk:")
    print(document_chunks[0].page_content[:500] + "...") # Print the first 500 characters of the first chunk

# 2. Generate embeddings for each document chunk
# Using a pre-trained sentence-transformer model
# Make sure to have 'sentence-transformers' installed
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2') # A good general-purpose model
    # Corrected: Access model name via ._load_model_name or similar if available, or just print a confirmation
    print("\nLoaded embedding model: all-MiniLM-L6-v2") # Directly print the model name

    # Generate embeddings for all chunks
    chunk_texts = [chunk.page_content for chunk in document_chunks]
    chunk_embeddings = embedding_model.encode(chunk_texts)

    print(f"Generated embeddings for {len(chunk_embeddings)} chunks. Embedding dimension: {chunk_embeddings.shape[1]}")

    # 3. Store the document chunks and their embeddings
    # Storing in a simple list of dictionaries for now
    # In a real application, a dedicated vector database would be used
    indexed_documents = []
    for i, chunk in enumerate(document_chunks):
        indexed_documents.append({
            "text": chunk.page_content,
            "embedding": chunk_embeddings[i].tolist() # Convert numpy array to list for storage
        })

    print(f"Stored {len(indexed_documents)} document chunks and their embeddings.")

    # 4. Confirm embeddings have been generated and stored
    if indexed_documents and 'embedding' in indexed_documents[0]:
        print("\nConfirmed: Embeddings generated and stored successfully.")
    else:
        print("\nWarning: Embeddings might not have been generated or stored correctly.")

except ImportError:
    print("\nError: 'sentence-transformers' library not found. Please install it ('pip install sentence-transformers').")
except Exception as e:
    print(f"\nAn error occurred during embedding or storing: {e}")

Created 3252 chunks from the policy documents.

Sample of the first document chunk:
INFORMATION WE COLLECT ABOUT YOU When you interact with SIDEARM Services, we collect: (1) personal information you supply when you subscribe, order, complete a form or survey, register for one of our Web Sites, enter a contest or provide your email address and (2) non-personal information transmitted through technology, including tracking information, which is also collected by third parties....

Loaded embedding model: all-MiniLM-L6-v2
Generated embeddings for 3252 chunks. Embedding dimension: 384
Stored 3252 document chunks and their embeddings.

Confirmed: Embeddings generated and stored successfully.


# Task
Implement a Retrieval Augmented Generation (RAG) system using the policy documents located in "PLUE/PLUE-main/data/policyqa" to answer questions. The system should load and index the documents, retrieve relevant information based on a user query, and use a language model to generate an answer. Evaluate the system's performance on the development set using F1 and Exact Match metrics.

## Load and prepare documents

### Subtask:
Load the policy documents from the specified directory and extract the text content.


## Evaluate rag system on QA

### Subtask:
Evaluate the performance of the RAG system on the development dataset using metrics like F1 and Exact Match.


**Reasoning**:
Load the development dataset, iterate through each example, use the RAG system to get predictions, calculate F1 and EM scores, and then compute and print the average scores.



In [None]:
import json
from collections import Counter
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import openai

# --- Re-defining functions from previous steps for completeness in this cell ---
# (Assuming necessary libraries like sentence_transformers and sklearn are already installed)

# Function to normalize text for F1 and EM calculation
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Function to retrieve relevant document chunks
def retrieve_relevant_chunks(query, indexed_documents, embedding_model, top_n=5):
    """
    Retrieves the most relevant document chunks for a given query.
    """
    query_embedding = embedding_model.encode([query])[0]
    document_embeddings = np.array([doc['embedding'] for doc in indexed_documents])
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    top_n_indices = similarities.argsort()[-top_n:][::-1]
    relevant_chunks = [indexed_documents[i] for i in top_n_indices]
    return relevant_chunks

# Function to generate an answer using the fine-tuned model
def generate_answer(query, retrieved_chunks, fine_tuned_qa_model_name):
    """
    Generates an answer to the user's query based on the retrieved document chunks
    using the fine-tuned OpenAI chat model.
    """
    context = "\n".join([chunk['text'] for chunk in retrieved_chunks])
    messages = []
    messages.append({"role": "system", "content": "You are a helpful assistant that answers questions accurately based ONLY on the provided legal policy context. If the answer is not found in the context, state that you cannot answer based on the provided information."})
    user_content = f"Context: {context}\n\nQuestion: {query}"
    messages.append({"role": "user", "content": user_content})

    try:
        response = openai.chat.completions.create(
            model=fine_tuned_qa_model_name,
            messages=messages,
            max_tokens=250,
            temperature=0.0
        )
        if response.choices and response.choices[0].message and response.choices[0].message.content:
            return response.choices[0].message.content.strip()
        else:
            return "Could not generate an answer from the model."
    except Exception as e:
        return f"An error occurred during answer generation: {e}"

# Function to run the end-to-end RAG system
def query_rag_system(user_query, indexed_documents, embedding_model, fine_tuned_qa_model_name, top_n=5):
    """
    Runs the end-to-end RAG system for a given user query.
    """
    retrieved_chunks = retrieve_relevant_chunks(user_query, indexed_documents, embedding_model, top_n=top_n)
    generated_answer = generate_answer(user_query, retrieved_chunks, fine_tuned_qa_model_name)
    return generated_answer

# --- Evaluation Logic ---

# Path to the development dataset
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Load the development data
dev_data = []
try:
    with open(dev_file_path, 'r') as f:
        for line in f:
            dev_data.append(json.loads(line))
    print(f"Loaded {len(dev_data)} examples from {dev_file_path}")
except FileNotFoundError:
    print(f"Error: Dev file not found at {dev_file_path}. Cannot perform evaluation.")
    dev_data = None
except Exception as e:
    print(f"An error occurred loading dev data: {e}")
    dev_data = None


if dev_data:
    total_f1 = 0
    total_em = 0
    evaluation_count = 0
    progress_interval = 100

    print(f"\nEvaluating RAG system on the dev set...")

    # Assuming indexed_documents, embedding_model, and fine_tuned_qa_model_name
    # are available from previous successful cells.
    # Make sure fine_tuned_qa_model_name is correctly set to your trained model name
    # fine_tuned_qa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW" # Replace with your actual model name

    # Evaluate on all examples in the dev set
    data_to_evaluate = dev_data

    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            continue

        user_query = ""
        ground_truth_answer = ""

        # Extract user query and ground truth answer
        for msg in messages:
            if msg.get('role') == 'user':
                # Assuming the user message contains both context and question,
                # we need to parse the question part.
                user_content = msg.get('content', '')
                question_match = re.search(r'Question: (.*)', user_content)
                if question_match:
                    user_query = question_match.group(1).strip()
            elif msg.get('role') == 'assistant':
                ground_truth_answer = msg.get('content', '').strip()

        if user_query and ground_truth_answer:
            try:
                # Get the generated answer from the RAG system
                # Use a lower top_n for retrieval during evaluation if needed for speed,
                # but keeping consistent with previous retrieval test (top_n=3 or 5) is better.
                generated_answer = query_rag_system(user_query, indexed_documents, embedding_model, fine_tuned_qa_model_name, top_n=3)

                # Calculate metrics
                f1, em = calculate_f1_em(generated_answer, ground_truth_answer)

                total_f1 += f1
                total_em += em
                evaluation_count += 1

                # Print progress
                if (i + 1) % progress_interval == 0:
                    print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")

            except Exception as e:
                print(f"Error during RAG processing or evaluation for example {i + 1}: {e}")
                continue # Continue to the next example

    # Calculate average metrics
    if evaluation_count > 0:
        average_f1 = total_f1 / evaluation_count
        average_em = total_em / evaluation_count
        print(f"\n--- RAG System Evaluation Results ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Average F1 Score: {average_f1:.4f}")
        print(f"Average Exact Match Score: {average_em:.4f}")
        print(f"------------------------------------")
    else:
        print("\nNo examples were successfully evaluated by the RAG system.")


Loaded 3809 examples from /content/openai_finetuning_data/dev.jsonl

Evaluating RAG system on the dev set...
Processed 100/3809 examples...
Processed 200/3809 examples...
Processed 300/3809 examples...
Processed 400/3809 examples...
Processed 500/3809 examples...
Processed 600/3809 examples...
Processed 700/3809 examples...
Processed 800/3809 examples...
Processed 900/3809 examples...
Processed 1000/3809 examples...
Processed 1100/3809 examples...
Processed 1200/3809 examples...
Processed 1300/3809 examples...
Processed 1400/3809 examples...
Processed 1500/3809 examples...
Processed 1600/3809 examples...
Processed 1700/3809 examples...
Processed 1800/3809 examples...
Processed 1900/3809 examples...
Processed 2000/3809 examples...
Processed 2100/3809 examples...
Processed 2200/3809 examples...
Processed 2300/3809 examples...
Processed 2400/3809 examples...
Processed 2500/3809 examples...
Processed 2600/3809 examples...
Processed 2700/3809 examples...
Processed 2800/3809 examples...
Proc

## Summary:

### Data Analysis Key Findings

*   The RAG system was evaluated on 3809 examples from the development dataset.
*   The evaluation resulted in an average F1 score of 0.1334.
*   The evaluation resulted in an average Exact Match score of 0.0081.

### Insights or Next Steps

*   The low F1 and Exact Match scores suggest that the RAG system's performance in accurately answering questions from the policy documents needs significant improvement.
*   Further analysis is required to identify the root causes of the low scores, potentially focusing on improving document retrieval relevance, enhancing the language model's ability to synthesize information from the retrieved context, or refining the prompt engineering for answer generation.


MAUDE

In [None]:
import json
import os

# Assuming the maud_squad files are in a directory like PLUE/PLUE-main/data/maud_squad
# Please adjust this path if your files are located elsewhere
data_dir = '.' # *** ADJUST THIS PATH IF NEEDED ***

train_file_path = os.path.join(data_dir, 'maud_squad_train.json')

try:
    with open(train_file_path, 'r') as f:
        data = json.load(f)

    print(f"Successfully loaded data from {train_file_path}")

    # Print the keys at the top level to understand the structure
    print("\nTop-level keys in the JSON:")
    print(list(data.keys()))

    # Assuming a SQuAD-like structure, the data is often under a key like 'data' or similar,
    # containing a list of articles, each with paragraphs and qas.
    # Let's try to access a common structure and print a sample.
    if 'data' in data and isinstance(data['data'], list) and len(data['data']) > 0:
        print("\nStructure and sample of the first item in 'data':")
        # Print the full first item to see its structure
        print(json.dumps(data['data'][0], indent=2))

        # Optionally, print a sample question/answer from within the structure
        # Assuming data -> [article] -> paragraphs -> [paragraph] -> qas -> [qa]
        if 'paragraphs' in data['data'][0] and isinstance(data['data'][0]['paragraphs'], list) and len(data['data'][0]['paragraphs']) > 0:
            if 'qas' in data['data'][0]['paragraphs'][0] and isinstance(data['data'][0]['paragraphs'][0]['qas'], list) and len(data['data'][0]['paragraphs'][0]['qas']) > 0:
                 print("\nSample of the first QA pair:")
                 print(json.dumps(data['data'][0]['paragraphs'][0]['qas'][0], indent=2))

    elif 'data' in data and not isinstance(data['data'], list):
         print("\n'data' key found, but its value is not a list. Structure might be different.")
         # Print a sample of the non-list data if it's not too large
         # print(json.dumps(data['data'], indent=2))

    else:
        print("\n'data' key not found or is empty. Top-level keys might be different or data is structured differently.")


except FileNotFoundError:
    print(f"Error: File not found at {train_file_path}. Please check the data_dir path.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {train_file_path}. Please check the file content.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded data from .\maud_squad_train.json

Top-level keys in the JSON:
['version', 'data']

Structure and sample of the first item in 'data':
{
  "title": "contract_60",
  "paragraphs": [
    {
      "qas": [
        {
          "question": "Highlight the parts of the text if any related to \"Absence of Litigation Closing Condition\" that should be reviewed by a lawyer.",
          "is_impossible": true,
          "answers": [],
          "id": "contract_60_Absence of Litigation Closing Condition"
        },
        {
          "question": "Highlight the parts of the text if any related to \"Accuracy of Target R&W Closing Condition\" that should be reviewed by a lawyer.",
          "is_impossible": false,
          "answers": [
            {
              "text": "representation and warranty speaks as of a particular date, in which case such representation and warranty shall be true and correct, subject only to de minimis inaccuracies, as of such earlier date), (ii) the rep

In [None]:
import json
import os

# Assuming the maud_squad files are in this directory
data_dir = '.' # Ensure this path is correct and accessible
output_dir = '/content/openai_fine-tuning_data_maud_squad_qa' # New directory for MAUD SQuAD QA data
os.makedirs(output_dir, exist_ok=True)

train_file_path = os.path.join(data_dir, 'maud_squad_train.json')
dev_file_path = os.path.join(data_dir, 'maud_squad_dev.json')
test_file_path = os.path.join(data_dir, 'maud_squad_test.json')


def format_maud_squad_for_openai_qa(input_path, output_path, max_examples=None):
    """Loads MAUD SQuAD JSON, formats it for OpenAI chat model fine-tuning (QA),
       and optionally limits the number of examples."""
    print(f"Loading data from: {input_path}")
    try:
        with open(input_path, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {input_path}. Skipping formatting.")
        return
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {input_path}. Please check the file content.")
        return
    except Exception as e:
        print(f"An unexpected error occurred loading {input_path}: {e}. Skipping formatting.")
        return

    nested_data = data.get('data', [])
    print(f"Processing {len(nested_data)} articles from {input_path}")

    processed_count = 0
    skipped_count = 0

    with open(output_path, 'w') as outfile:
        # Iterate through articles, paragraphs, and qas
        for article in nested_data:
            for paragraph in article.get('paragraphs', []):
                context = paragraph.get('context', '')
                for qa in paragraph.get('qas', []):
                    # Stop processing if max_examples is reached
                    if max_examples is not None and processed_count >= max_examples:
                        print(f"Max examples ({max_examples}) reached for {input_path}. Stopping processing.")
                        break # Break from qa loop

                    question = qa.get('question', '')
                    answers = qa.get('answers', [])
                    is_impossible = qa.get('is_impossible', False)

                    # Skip impossible questions for this fine-tuning approach
                    if is_impossible:
                        # print(f"Skipping impossible question: {question}") # Optional: uncomment for details
                        skipped_count += 1
                        continue

                    # Extract answer text(s) - assuming 'answers' is a list of dicts with 'text'
                    answer_texts = [ans.get('text', '') for ans in answers if ans.get('text')]

                    # Skip examples with no answers (should be covered by is_impossible, but good check)
                    if not answer_texts:
                        # print(f"Skipping question with no answers: {question}") # Optional: uncomment for details
                        skipped_count += 1
                        continue

                    # Combine answers if there are multiple (common in SQuAD format)
                    # You might adjust how multiple answers are presented
                    assistant_content = " ".join(answer_texts)

                    # Create messages list in chat format
                    messages = []

                    # Add a system message for the MAUD SQuAD QA task (customize as needed)
                    messages.append({"role": "system", "content": "You are an expert in legal documents and answer questions based on the provided context."})

                    # User message: Combine context and question
                    user_content = f"Context: {context}\n\nQuestion: {question}"
                    messages.append({"role": "user", "content": user_content})

                    # Assistant message: Provide the correct answer(s)
                    messages.append({"role": "assistant", "content": assistant_content})

                    # Write the chat formatted example
                    openai_format = {"messages": messages}
                    outfile.write(json.dumps(openai_format) + '\n')
                    processed_count += 1

                if max_examples is not None and processed_count >= max_examples:
                    break # Break from paragraph loop

            if max_examples is not None and processed_count >= max_examples:
                break # Break from article loop

    print(f"Finished formatting {output_path}. Processed {processed_count} QA pairs, skipped {skipped_count}.")


# Format train, dev, and test files with a limit on the number of examples
# You can adjust the max_examples value as needed to get a file size that is accepted by OpenAI
# For example, try 1000, 5000, or 10000 examples initially.
# You might need to experiment to find a suitable size.
# Setting max_examples=None will process all examples (original behavior).

max_train_examples = 100 # *** ADJUST THIS VALUE ***
max_dev_examples = 100 # *** ADJUST THIS VALUE ***
max_test_examples = 100 # *** ADJUST THIS VALUE ***


format_maud_squad_for_openai_qa(train_file_path, os.path.join(output_dir, 'train_maud_squad_qa_chat_format.jsonl'), max_examples=max_train_examples)
print("-" * 30)
format_maud_squad_for_openai_qa(dev_file_path, os.path.join(output_dir, 'dev_maud_squad_qa_chat_format.jsonl'), max_examples=max_dev_examples)
print("-" * 30)
format_maud_squad_for_openai_qa(test_file_path, os.path.join(output_dir, 'test_maud_squad_qa_chat_format.jsonl'), max_examples=max_test_examples)


print(f"\nMAUD SQuAD QA data processing complete. Files saved to {output_dir}")

Loading data from: .\maud_squad_train.json
Processing 120 articles from .\maud_squad_train.json
Max examples (100) reached for .\maud_squad_train.json. Stopping processing.
Finished formatting /content/openai_fine-tuning_data_maud_squad_qa\train_maud_squad_qa_chat_format.jsonl. Processed 100 QA pairs, skipped 23.
------------------------------
Loading data from: .\maud_squad_dev.json
Processing 16 articles from .\maud_squad_dev.json
Max examples (100) reached for .\maud_squad_dev.json. Stopping processing.
Finished formatting /content/openai_fine-tuning_data_maud_squad_qa\dev_maud_squad_qa_chat_format.jsonl. Processed 100 QA pairs, skipped 13.
------------------------------
Loading data from: .\maud_squad_test.json
Processing 16 articles from .\maud_squad_test.json
Max examples (100) reached for .\maud_squad_test.json. Stopping processing.
Finished formatting /content/openai_fine-tuning_data_maud_squad_qa\test_maud_squad_qa_chat_format.jsonl. Processed 100 QA pairs, skipped 18.

MAUD S

In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell
openai.api_key = api_key
# Define the paths to your newly prepared MAUD QA data files (chat format, subset)
train_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/train_maud_squad_qa_chat_format.jsonl'
dev_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl'
test_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/test_maud_squad_qa_chat_format.jsonl'


# Upload the training file for MAUD QA
print(f"Uploading MAUD QA training file: {train_maud_qa_file_path}")
train_maud_qa_upload_response = openai.files.create(
  file=open(train_maud_qa_file_path, "rb"),
  purpose="fine-tune"
)
train_maud_qa_file_id = train_maud_qa_upload_response.id
print(f"MAUD QA training file uploaded successfully with ID: {train_maud_qa_file_id}")


# Upload the dev file for MAUD QA (optional, good for validation)
print(f"Uploading MAUD QA dev file: {dev_maud_qa_file_path}")
dev_maud_qa_upload_response = openai.files.create(
  file=open(dev_maud_qa_file_path, "rb"),
  purpose="fine-tune" # Can also be 'eval' if supported for the model
)
dev_maud_qa_file_id = dev_maud_qa_upload_response.id
print(f"MAUD QA dev file uploaded successfully with ID: {dev_maud_qa_file_id}")


# Upload the test file for MAUD QA (optional)
print(f"Uploading MAUD QA test file: {test_maud_qa_file_path}")
test_maud_qa_upload_response = openai.files.create(
  file=open(test_maud_qa_file_path, "rb"),
  purpose="fine-tune" # Can also be 'eval' if supported for the model
)
test_maud_qa_file_id = test_maud_qa_upload_response.id
print(f"MAUD QA test file uploaded successfully with ID: {test_maud_qa_file_id}")


# You can now use the new file_id(s) in the next step to initiate fine-tuning
print("\nNew File IDs for MAUD QA fine-tuning (subset):")
print(f"Train MAUD QA file ID: {train_maud_qa_file_id}")
print(f"Dev MAUD QA file ID: {dev_maud_qa_file_id}")
print(f"Test MAUD QA file ID: {test_maud_qa_file_id}")

Uploading MAUD QA training file: /content/openai_fine-tuning_data_maud_squad_qa/train_maud_squad_qa_chat_format.jsonl
MAUD QA training file uploaded successfully with ID: file-HFW6R6DAMUEz3d6QW7Yy6U
Uploading MAUD QA dev file: /content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl
MAUD QA dev file uploaded successfully with ID: file-3fQfuiHJqzutD4w88pWuHN
Uploading MAUD QA test file: /content/openai_fine-tuning_data_maud_squad_qa/test_maud_squad_qa_chat_format.jsonl
MAUD QA test file uploaded successfully with ID: file-CevTTzAZciVbFFQHxx7gi5

New File IDs for MAUD QA fine-tuning (subset):
Train MAUD QA file ID: file-HFW6R6DAMUEz3d6QW7Yy6U
Dev MAUD QA file ID: file-3fQfuiHJqzutD4w88pWuHN
Test MAUD QA file ID: file-CevTTzAZciVbFFQHxx7gi5


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.g., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Your NEW file IDs for MAUD QA fine-tuning (subset)
train_maud_qa_file_id = "file-HFW6R6DAMUEz3d6QW7Yy6U"
dev_maud_qa_file_id = "file-3fQfuiHJqzutD4w88pWuHN" # Optional, remove if not using dev set for validation

print(f"Initiating MAUD QA fine-tuning job with training file ID: {train_maud_qa_file_id}")
if dev_maud_qa_file_id: # Check if dev_maud_qa_file_id is not empty or None
    print(f"Using validation file ID: {dev_maud_qa_file_id}")

try:
    fine_tuning_job_maud_qa = openai.fine_tuning.jobs.create(
      # Choose a supported chat model (e.g., gpt-3.5-turbo, gpt-4.1-nano-2025-04-14 if supported for FT)
      model="gpt-4.1-nano-2025-04-14",
      training_file=train_maud_qa_file_id,
      validation_file=dev_maud_qa_file_id # Optional: include if you uploaded a validation file
    )

    print(f"MAUD QA fine-tuning job created successfully!")
    print(f"Job ID: {fine_tuning_job_maud_qa.id}")
    print(f"Status: {fine_tuning_job_maud_qa.status}")

    # You can use the Job ID to monitor the status or cancel the job later.
    print("\nTo monitor the MAUD QA job status, you can use the following code (make sure API key is set):")
    print(f"# job_id_maud_qa = \"{fine_tuning_job_maud_qa.id}\"")
    print("# # Then use the monitoring code from earlier")

except Exception as e:
    print(f"Error initiating MAUD QA fine-tuning job: {e}")

Initiating MAUD QA fine-tuning job with training file ID: file-HFW6R6DAMUEz3d6QW7Yy6U
Using validation file ID: file-3fQfuiHJqzutD4w88pWuHN
MAUD QA fine-tuning job created successfully!
Job ID: ftjob-06O41WmqlFz4N0mKJieEN4CZ
Status: validating_files

To monitor the MAUD QA job status, you can use the following code (make sure API key is set):
# job_id_maud_qa = "ftjob-06O41WmqlFz4N0mKJieEN4CZ"
# # Then use the monitoring code from earlier


In [None]:
import openai
import os

# Ensure your API key is set before running this cell, e.e., using an environment variable
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Replace with your MAUD QA fine-tuning job ID
job_id_maud_qa = "ftjob-06O41WmqlFz4N0mKJieEN4CZ"

print(f"Checking status for MAUD QA fine-tuning job: {job_id_maud_qa}")

try:
    fine_tuning_job_status = openai.fine_tuning.jobs.retrieve(job_id_maud_qa)

    print(f"Job ID: {fine_tuning_job_status.id}")
    print(f"Status: {fine_tuning_job_status.status}")
    if fine_tuning_job_status.fine_tuned_model:
         print(f"Fine-tuned model (if succeeded): {fine_tuning_job_status.fine_tuned_model}")
    if fine_tuning_job_status.error:
        print("\nError Details (if failed):")
        print(f"  Code: {fine_tuning_job_status.error.code}")
        print(f"  Message: {fine_tuning_job_status.error.message}")
        # print(f"  Param: {fine_tuning_job_status.error.param}") # Uncomment if needed

except Exception as e:
    print(f"Error retrieving job status: {e}")

Checking status for MAUD QA fine-tuning job: ftjob-06O41WmqlFz4N0mKJieEN4CZ
Job ID: ftjob-06O41WmqlFz4N0mKJieEN4CZ
Status: validating_files

Error Details (if failed):
  Code: None
  Message: None


In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY')

# Path to the MAUD development dataset (ensure this path is correct)
dev_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl'

# Base model to use for baseline evaluation
base_model_name = "gpt-3.5-turbo" # Or another suitable base model

# Function to normalize text for F1 and EM calculation (reusing from previous evaluations)
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match (reusing from previous evaluations)
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Load the MAUD development data
dev_maud_qa_data = []
try:
    with open(dev_maud_qa_file_path, 'r') as f:
        for line in f:
            dev_maud_qa_data.append(json.loads(line))
    print(f"Loaded {len(dev_maud_qa_data)} examples from {dev_maud_qa_file_path}")
except FileNotFoundError:
    print(f"Error: MAUD dev file not found at {dev_maud_qa_file_path}. Cannot perform evaluation.")
    dev_maud_qa_data = None
except Exception as e:
    print(f"An error occurred loading MAUD dev data: {e}")
    dev_maud_qa_data = None


if dev_maud_qa_data:
    total_f1 = 0
    total_em = 0
    evaluation_count = 0
    progress_interval = 100 # Print progress every 100 examples


    print(f"\nEvaluating base model '{base_model_name}' on MAUD dev set using F1 and EM...")

    # Note: Evaluating large datasets can be time-consuming and incur API costs.
    # Consider evaluating on a smaller subset first if needed.
    # For demonstration, let's evaluate on the first 10 examples:
    # data_to_evaluate = dev_maud_qa_data[:10]
    # print(f"Evaluating on a subset of {len(data_to_evaluate)} examples.")
    data_to_evaluate = dev_maud_qa_data


    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            # print(f"Skipping example with no messages: {example}") # Optional: uncomment for detailed skipping
            continue

        # The last message in the data is the assistant's response (the ground truth answer)
        # For inference with the base model, we send messages up to the user's turn
        conversation_for_inference = [msg for msg in messages if msg.get('role') != 'assistant']

        # Extract the ground truth answer from the assistant's message
        ground_truth_answer = ""
        for msg in messages:
            if msg.get('role') == 'assistant':
                ground_truth_answer = msg.get('content', '').strip()
                break # Assuming only one assistant turn at the end


        if conversation_for_inference and ground_truth_answer:
            try:
                # Use the base model for inference
                response = openai.chat.completions.create(
                    model=base_model_name,
                    messages=conversation_for_inference,
                    max_tokens=150 # Adjust max_tokens as needed for typical answer length
                )

                # Extract the model's predicted answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_answer = response.choices[0].message.content.strip()

                    # Calculate metrics
                    f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                    total_f1 += f1
                    total_em += em
                    evaluation_count += 1

                    # Print progress
                    if (i + 1) % progress_interval == 0:
                        print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")

                    # Optional: Print individual results
                    # print(f"\nUser: {conversation_for_inference}")
                    # print(f"Ground Truth: {ground_truth_answer[:100]}...")
                    # print(f"Prediction: {predicted_answer[:100]}...")
                    # print(f"F1: {f1:.4f}, EM: {em:.4f}")

                else:
                    # print(f"Skipping example due to empty model response: {conversation_for_inference}") # Optional: uncomment for details
                    pass # Skip examples with no valid prediction

            except Exception as e:
                print(f"Error during inference or evaluation for example {i + 1}: {e}")
                # print(f"Example data: {conversation_for_inference}") # Optional: uncomment for details
                continue # Continue to the next example even if one fails

    # Calculate average metrics
    if evaluation_count > 0:
        average_f1 = total_f1 / evaluation_count
        average_em = total_em / evaluation_count
        print(f"\n--- Base Model MAUD QA Baseline Results ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Average F1 Score: {average_f1:.4f}")
        print(f"Average Exact Match Score: {average_em:.4f}")
        print(f"-----------------------------------------")
    else:
        print("\nNo examples were successfully evaluated for baseline.")

In [None]:
import json
import os

# Assuming the maud_squad files are in this directory
data_dir = '.' # Ensure this path is correct and accessible

train_file_path = os.path.join(data_dir, 'maud_squad_train.json')
dev_file_path = os.path.join(data_dir, 'maud_squad_dev.json')
test_file_path = os.path.join(data_dir, 'maud_squad_test.json')

all_contexts = []

def extract_contexts_from_squad_like_json(file_path):
    """Loads a SQuAD-like JSON file and extracts all context strings."""
    contexts = []
    print(f"Extracting contexts from: {file_path}")
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)

        nested_data = data.get('data', [])
        print(f"Processing {len(nested_data)} articles...")

        for article in nested_data:
            for paragraph in article.get('paragraphs', []):
                context = paragraph.get('context', '')
                if context: # Add context only if it's not empty
                    contexts.append(context)

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Skipping context extraction.")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {file_path}. Skipping context extraction.")
    except Exception as e:
        print(f"An unexpected error occurred processing {file_path}: {e}. Skipping context extraction.")

    print(f"Extracted {len(contexts)} contexts from {file_path}.")
    return contexts

# Extract contexts from all three files
all_contexts.extend(extract_contexts_from_squad_like_json(train_file_path))
print("-" * 30)
all_contexts.extend(extract_contexts_from_squad_like_json(dev_file_path))
print("-" * 30)
all_contexts.extend(extract_contexts_from_squad_like_json(test_file_path))

print(f"\nTotal contexts extracted from all files: {len(all_contexts)}")

# You now have a list of all context strings in 'all_contexts'
# This list will be used to build the searchable index in the next step.

Extracting contexts from: .\maud_squad_train.json
Processing 120 articles...
Extracted 120 contexts from .\maud_squad_train.json.
------------------------------
Extracting contexts from: .\maud_squad_dev.json
Processing 16 articles...
Extracted 16 contexts from .\maud_squad_dev.json.
------------------------------
Extracting contexts from: .\maud_squad_test.json
Processing 16 articles...
Extracted 16 contexts from .\maud_squad_test.json.

Total contexts extracted from all files: 152


Now let's split the documents into chunks and generate embeddings.

In [None]:
# Install necessary libraries for text splitting and embeddings
# %pip install langchain openai tiktoken

import os
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings # Using langchain's integration with OpenAI
# Import the Document class
from langchain_core.documents import Document

# Assume 'all_contexts' list contains the extracted context strings from the previous step
# Ensure your API key is set globally or in a previous cell before running this

# Retrieve API key value using os.getenv

# Check if the API key value was successfully retrieved
if not api_key_value:
    print("Error: OpenAI API key environment variable not set. Cannot proceed.")
    # Exit or handle error appropriately if key is not found
    # For this example, we'll print an error and might fail later
    indexed_chunks = None # Ensure indexed_chunks is None if key is missing
else:
    # Set the global openai api_key as a fallback/standard practice
    openai.api_key = api_key_value

    # 1. Split the contexts into smaller chunks
    # You can adjust chunk_size and chunk_overlap based on your text characteristics and model context window
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, # Adjust as needed
        chunk_overlap=200, # Adjust as needed
        length_function=len,
        is_separator_regex=False,
    )

    # Create Langchain Document objects from the contexts
    if 'all_contexts' not in locals() or not all_contexts:
        print("Error: 'all_contexts' list not found or is empty. Please run the previous cell (d22dcf36) first.")
        indexed_chunks = None # Ensure indexed_chunks is None if all_contexts is missing
    else:
        print(f"Creating Langchain Document objects from {len(all_contexts)} contexts...")
        docs = [Document(page_content=context, metadata={"source": f"context_{i}"}) for i, context in enumerate(all_contexts)]
        print(f"Created {len(docs)} Document objects.")


        # Split the documents into chunks
        print(f"Splitting {len(docs)} documents into chunks...")
        chunks = text_splitter.split_documents(docs)
        print(f"Created {len(chunks)} chunks.")

        # 2. Generate embeddings for each chunk
        # Choose an OpenAI embedding model
        # text-embedding-ada-002 is a common choice
        # Check OpenAI docs for newer models if available

        try:
            # Initialize the embedding model, EXPLICITLY PASSING the api_key
            embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", api_key=api_key_value)

            print("Generating embeddings for chunks...")

            # Extract chunk texts for batch embedding
            chunk_texts = [chunk.page_content for chunk in chunks]

            # Generate embeddings using the embedding model
            # Langchain's embed_documents is efficient for batching
            chunk_embeddings = embedding_model.embed_documents(chunk_texts)


            # Storing chunks and embeddings together
            # We'll store the original chunk text, its embedding, and its metadata
            indexed_chunks = []
            for i, chunk in enumerate(chunks):
                 indexed_chunks.append({
                     "text": chunk.page_content,
                     "embedding": chunk_embeddings[i],
                     "metadata": chunk.metadata
                 })


            print(f"\nGenerated embeddings for all {len(indexed_chunks)} chunks.")

            # You now have 'indexed_chunks' as a list of dictionaries in memory.
            # This serves as our simple in-memory index.
            print("\nIndexing complete. Ready for retrieval.")

        except Exception as e:
            print(f"Error generating embeddings: {e}")
            print("Please ensure your OpenAI API key is set correctly and you have access to the embedding model.")
            indexed_chunks = None # Indicate that indexing failed

Creating Langchain Document objects from 152 contexts...
Created 152 Document objects.
Splitting 152 documents into chunks...
Created 82696 chunks.
Generating embeddings for chunks...

Generated embeddings for all 82696 chunks.

Indexing complete. Ready for retrieval.


In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os
import openai
from langchain_openai import OpenAIEmbeddings

# Ensure your API key is set (for OpenAI embeddings)
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Assume 'indexed_chunks' list is available from the previous indexing step (cell e05224ca)
if 'indexed_chunks' not in locals() or not indexed_chunks:
    print("Error: 'indexed_chunks' list not found or is empty. Please run the previous indexing cell (e05224ca) first.")
else:
    # Initialize the same embedding model used for indexing
    try:
        # Retrieve API key value using os.getenv
        api_key_value = ''

        # Check if the API key value was successfully retrieved
        if not api_key_value:
            print("Error: OpenAI API key environment variable not set. Cannot initialize embedding model.")
            embedding_model = None # Indicate failure
        else:
             # Initialize the embedding model, EXPLICITLY PASSING the api_key
            embedding_model = OpenAIEmbeddings(model="text-embedding-3-large", api_key=api_key_value)
            print("Embedding model initialized successfully.")

    except Exception as e:
        print(f"Error initializing embedding model: {e}")
        print("Please ensure your OpenAI API key is set correctly and you have access to the embedding model.")
        embedding_model = None # Indicate failure


    if embedding_model:
        def retrieve_relevant_chunks(query, top_k=3):
            """Generates query embedding and finds top_k most similar chunks."""
            print(f"Retrieving relevant chunks for query: '{query[:50]}...'")
            try:
                # Generate embedding for the query
                query_embedding = embedding_model.embed_query(query)

                # Extract chunk embeddings from our indexed_chunks
                chunk_embeddings = [item['embedding'] for item in indexed_chunks]

                # Calculate cosine similarity between query embedding and all chunk embeddings
                # Reshape query_embedding for cosine_similarity function
                query_embedding_reshaped = np.array(query_embedding).reshape(1, -1)
                chunk_embeddings_array = np.array(chunk_embeddings)

                # Calculate similarity scores
                similarity_scores = cosine_similarity(query_embedding_reshaped, chunk_embeddings_array)[0]

                # Get the indices of the top_k most similar chunks
                top_k_indices = np.argsort(similarity_scores)[::-1][:top_k]

                # Retrieve the actual top_k chunks based on indices
                relevant_chunks = [indexed_chunks[i] for i in top_k_indices]

                print(f"Found {len(relevant_chunks)} relevant chunks.")
                return relevant_chunks

            except Exception as e:
                print(f"Error during retrieval: {e}")
                return [] # Return empty list if retrieval fails

        print("Retrieval function 'retrieve_relevant_chunks' is set up.")

        # You can test the retrieval function:
        # test_query = "What are the rules about data privacy?"
        # retrieved_chunks = retrieve_relevant_chunks(test_query, top_k=2)
        # print("\nTest Retrieval Results:")
        # for i, chunk in enumerate(retrieved_chunks):
        #     print(f"\nChunk {i+1} (Source: {chunk['metadata']['source']}):")
        #     print(chunk['text'][:200] + "...") # Print first 200 characters of the chunk text

Embedding model initialized successfully.
Retrieval function 'retrieve_relevant_chunks' is set up.


In [None]:
import openai
import os

# Ensure your API key is set (for OpenAI language model)
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Assume 'retrieve_relevant_chunks' function is available from the previous step (cell 3bd6e3f7)
if 'retrieve_relevant_chunks' not in locals():
    print("Error: 'retrieve_relevant_chunks' function not found. Please run the previous retrieval setup cell (3bd6e3f7) first.")
else:
    # Choose the Language Model for generation
    generation_model_name = "gpt-4.1-nano-2025-04-14" # Or another suitable chat model

    # Retrieve API key value using os.getenv
    api_key_value =

    # Check if the API key value was successfully retrieved
    if not api_key_value:
        print("Error: OpenAI API key environment variable not set. Cannot initialize language model.")
        # Ensure the generation function will report error if key is missing
        def generate_rag_answer(query):
            print("API key not set. Cannot generate RAG answer.")
            return "Error: API key not set."
    else:
        # Set the global openai api_key as a fallback/standard practice
        openai.api_key = api_key_value

        def generate_rag_answer(query):
            """Retrieves relevant chunks and generates an answer using a language model."""
            print(f"\nGenerating RAG answer for query: '{query[:50]}...'")

            # 1. Retrieve relevant chunks based on the query
            # Adjust top_k as needed to control how many chunks are sent to the LLM
            relevant_chunks = retrieve_relevant_chunks(query, top_k=3)

            if not relevant_chunks:
                print("No relevant chunks found. Cannot generate answer.")
                return "Could not find relevant information to answer the question."

            # 2. Construct the prompt for the language model
            # Combine retrieved chunk texts into a single context string
            retrieved_context = "\n\n".join([chunk['text'] for chunk in relevant_chunks])

            # Create the messages list for the chat model
            messages = []

            # Optional: Add a system message to guide the model's behavior
            messages.append({"role": "system", "content": "You are a legal expert answers questions based on the provided context. If the answer is not in the context, state that you cannot answer based on the provided information."})

            # User message: Include the retrieved context and the original question
            user_content = f"Context: {retrieved_context}\n\nQuestion: {query}"
            messages.append({"role": "user", "content": user_content})

            # 3. Use the language model to generate the answer
            try:
                response = openai.chat.completions.create(
                    model=generation_model_name,
                    messages=messages,
                    max_tokens=300, # Adjust max_tokens as needed for answer length
                    temperature=0.7 # Adjust temperature for creativity vs. determinism
                )

                # Extract the generated answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    generated_answer = response.choices[0].message.content.strip()
                    print("Answer generated successfully.")
                    return generated_answer
                else:
                    print("Language model returned an empty response.")
                    return "Could not generate an answer."

            except Exception as e:
                print(f"Error during language model generation: {e}")
                print("Please ensure your OpenAI API key is set correctly and you have access to the language model.")
                return "An error occurred while generating the answer."

    # Only print this if the setup potentially succeeded (API key found)
    if api_key_value:
        print("Generation function 'generate_rag_answer' is set up.")

    # You can test the RAG system end-to-end:
    # test_query = "What are the rules about data privacy?"
    # rag_answer = generate_rag_answer(test_query)
    # print("\n--- RAG System Answer ---")
    # print(rag_answer)
    # print("-------------------------")

Generation function 'generate_rag_answer' is set up.


In [None]:
# Assume 'generate_rag_answer' function is available from the previous step (cell 6fc1f143)
if 'generate_rag_answer' not in locals():
    print("Error: 'generate_rag_answer' function not found. Please run the previous generation setup cell (6fc1f143) first.")
else:
    # Enter your query here to test the RAG system
    user_query = "What is the purpose of this agreement?" # Replace with a question relevant to your MAUD data

    print(f"Testing RAG system with query: '{user_query}'")

    # Call the RAG generation function
    rag_response = generate_rag_answer(user_query)

    print("\n--- RAG System Answer ---")
    print(rag_response)
    print("-------------------------")

    # You can try other queries by changing the 'user_query' variable and re-running this cell.

Testing RAG system with query: 'What is the purpose of this agreement?'

Generating RAG answer for query: 'What is the purpose of this agreement?...'
Retrieving relevant chunks for query: 'What is the purpose of this agreement?...'
Found 3 relevant chunks.
Answer generated successfully.

--- RAG System Answer ---
The purpose of this agreement is to amend the TRA (Trade-Related Agreement) and establish the terms and conditions under which the Parties are legally bound to each other, including definitions, references, and the governing precedence between this agreement and the TRA.
-------------------------


In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set (for OpenAI language model calls within RAG)
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell


# Assume 'generate_rag_answer' function is available from the previous step (cell 6fc1f143)
if 'generate_rag_answer' not in locals():
    print("Error: 'generate_rag_answer' function not found. Please run the generation setup cell (6fc1f143) first.")
else:
    # Path to the MAUD development dataset (ensure this path is correct)
    dev_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl'

    # Function to normalize text for F1 and EM calculation (reusing from previous evaluations)
    def normalize_text(text):
        """Lowercases, removes punctuation and extra whitespace."""
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
        return text

    # Function to calculate F1 and Exact Match (reusing from previous evaluations)
    def calculate_f1_em(prediction, ground_truth):
        prediction_tokens = normalize_text(prediction).split()
        ground_truth_tokens = normalize_text(ground_truth).split()

        common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
        num_common = sum(common.values())

        if num_common == 0:
            return 0, 0

        precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
        recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

        return f1, exact_match

    # Load the MAUD development data
    dev_maud_qa_data = []
    try:
        with open(dev_maud_qa_file_path, 'r') as f:
            for line in f:
                dev_maud_qa_data.append(json.loads(line))
        print(f"Loaded {len(dev_maud_qa_data)} examples from {dev_maud_qa_file_path}")
    except FileNotFoundError:
        print(f"Error: MAUD dev file not found at {dev_maud_qa_file_path}. Cannot perform evaluation.")
        dev_maud_qa_data = None
    except Exception as e:
        print(f"An error occurred loading MAUD dev data: {e}")
        dev_maud_qa_data = None


    if dev_maud_qa_data:
        total_f1 = 0
        total_em = 0
        evaluation_count = 0
        progress_interval = 100 # Print progress every 100 examples


        print(f"\nEvaluating RAG system on MAUD dev set using F1 and EM...")

        # --- Evaluate on a subset ---
        subset_size = 10 # *** SET SUBSET SIZE HERE ***
        data_to_evaluate = dev_maud_qa_data[:subset_size]
        print(f"Evaluating on a subset of {len(data_to_evaluate)} examples.")
        # --- To evaluate on the full dataset, comment out the two lines above and uncomment the line below ---
        # data_to_evaluate = dev_maud_qa_data


        for i, example in enumerate(data_to_evaluate):
            messages = example.get('messages', [])
            if not messages:
                # print(f"Skipping example with no messages: {example}") # Optional: uncomment for detailed skipping
                continue

            # Extract the user message content (contains context and question)
            # and the ground truth answer from the assistant's message
            user_message_content = ""
            ground_truth_answer = ""

            for msg in messages:
                if msg.get('role') == 'user':
                    user_message_content = msg.get('content', '')
                elif msg.get('role') == 'assistant':
                    ground_truth_answer = msg.get('content', '').strip()
                    break # Assuming only one assistant turn at the end

            # Extract just the question from the user message content for the RAG query
            # Assuming the format is "Context: ...\n\nQuestion: ..."
            question_match = re.search(r"Question: (.*)", user_message_content, re.DOTALL)
            if question_match:
                query = question_match.group(1).strip()
            else:
                 # print(f"Skipping example {i+1} due to inability to extract question from user message.") # Optional: uncomment
                 continue # Skip if question cannot be extracted


            if query and ground_truth_answer:
                try:
                    # Use the RAG generation function to get the predicted answer
                    predicted_answer = generate_rag_answer(query)

                    # Calculate metrics
                    f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                    total_f1 += f1
                    total_em += em
                    evaluation_count += 1

                    # Print progress - now uses the subset size
                    if (i + 1) % (len(data_to_evaluate) // 10 or 1) == 0: # Print 10 times evenly or at least once
                         print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")


                    # Optional: Print individual results (commented out for less verbosity)
                    # print(f"\nQuery: {query[:100]}...")
                    # print(f"Ground Truth: {ground_truth_answer[:100]}...")
                    # print(f"Prediction: {predicted_answer[:100]}...")
                    # print(f"F1: {f1:.4f}, EM: {em:.4f}")

                except Exception as e:
                    print(f"Error during RAG evaluation for example {i + 1} (Query: {query[:50]}...): {e}")
                    continue # Continue to the next example even if one fails

        # Calculate average metrics
        if evaluation_count > 0:
            average_f1 = total_f1 / evaluation_count
            average_em = total_em / evaluation_count
            print(f"\n--- RAG System Evaluation Results (Subset) ---")
            print(f"Evaluated on {evaluation_count} examples.")
            print(f"Average F1 Score: {average_f1:.4f}")
            print(f"Average Exact Match Score: {average_em:.4f}")
            print(f"--------------------------------------------")
        else:
            print("\nNo examples were successfully evaluated for RAG.")
    else:
         print("\nRAG evaluation could not be performed due to issues loading development data.")

Loaded 100 examples from /content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl

Evaluating RAG system on MAUD dev set using F1 and EM...
Evaluating on a subset of 10 examples.

Generating RAG answer for query: 'Highlight the parts of the text if any related to ...'
Retrieving relevant chunks for query: 'Highlight the parts of the text if any related to ...'
Found 3 relevant chunks.
Answer generated successfully.
Processed 1/10 examples...

Generating RAG answer for query: 'Highlight the parts of the text if any related to ...'
Retrieving relevant chunks for query: 'Highlight the parts of the text if any related to ...'
Found 3 relevant chunks.
Answer generated successfully.
Processed 2/10 examples...

Generating RAG answer for query: 'Highlight the parts of the text if any related to ...'
Retrieving relevant chunks for query: 'Highlight the parts of the text if any related to ...'
Found 3 relevant chunks.
Answer generated successfully.
Processed 3/10 exampl

In [None]:
import openai
import os

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# The fine-tuned model name you want to check
target_fine_tuned_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK"

print(f"Searching for fine-tuning job that resulted in model: {target_fine_tuned_model_name}")

try:
    # List all fine-tuning jobs
    list_jobs_response = openai.fine_tuning.jobs.list()

    found_job = None
    for job in list_jobs_response.data:
        if job.fine_tuned_model == target_fine_tuned_model_name:
            found_job = job
            break # Found the job, no need to continue searching

    if found_job:
        print("\nFound the corresponding fine-tuning job:")
        print(f"Job ID: {found_job.id}")
        print(f"Status: {found_job.status}")
        print(f"Fine-tuned model: {found_job.fine_tuned_model}")

        if found_job.status == 'succeeded':
            print(f"\nYes, the model '{target_fine_tuned_model_name}' has status 'succeeded' and can be used for inference.")
            print("You can now use this model name in API calls for tasks like chat completions.")
        elif found_job.status == 'failed':
             print(f"\nNo, the job for model '{target_fine_tuned_model_name}' failed. The model cannot be used.")
             if found_job.error:
                 print(f"Failure reason: {found_job.error.message}")
        else:
             print(f"\nThe job status for model '{target_fine_tuned_model_name}' is '{found_job.status}'. It is not yet ready for use.")
             print("Please wait for the job status to change to 'succeeded'.")

    else:
        print("\nCould not find a fine-tuning job that resulted in the model name:")
        print(target_fine_tuned_model_name)
        print("Please ensure the model name is correct and the job was initiated from your account.")

except Exception as e:
    print(f"Error listing fine-tuning jobs: {e}")
    print("Please ensure your OpenAI API key is set correctly and you have access to list fine-tuning jobs.")

Searching for fine-tuning job that resulted in model: ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK

Found the corresponding fine-tuning job:
Job ID: ftjob-F7deYOicNEfvOYIKbJRLyukm
Status: succeeded
Fine-tuned model: ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK

Yes, the model 'ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK' has status 'succeeded' and can be used for inference.
You can now use this model name in API calls for tasks like chat completions.


In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell


# Replace with the name of your fine-tuned MAUD QA model (which succeeded)
fine_tuned_maud_qa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK"

# Path to the MAUD development dataset (ensure this path is correct)
dev_maud_qa_file_path = '/content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl'

# Function to normalize text for F1 and EM calculation (reusing from previous evaluations)
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match (reusing from previous evaluations)
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Load the MAUD development data
dev_maud_qa_data = []
try:
    with open(dev_maud_qa_file_path, 'r') as f:
        for line in f:
            dev_maud_qa_data.append(json.loads(line))
    print(f"Loaded {len(dev_maud_qa_data)} examples from {dev_maud_qa_file_path}")
except FileNotFoundError:
    print(f"Error: MAUD dev file not found at {dev_maud_qa_file_path}. Cannot perform evaluation.")
    dev_maud_qa_data = None
except Exception as e:
    print(f"An error occurred loading MAUD dev data: {e}")
    dev_maud_qa_data = None


if dev_maud_qa_data:
    total_f1 = 0
    total_em = 0
    evaluation_count = 0
    progress_interval = 100 # Print progress every 100 examples


    print(f"\nEvaluating fine-tuned model '{fine_tuned_maud_qa_model_name}' on MAUD dev set using F1 and EM...")

    # Note: Evaluating large datasets can be time-consuming and incur API costs.
    # Consider evaluating on a smaller subset first if needed.
    # For demonstration, let's evaluate on the first 10 examples:
    # data_to_evaluate = dev_maud_qa_data[:10]
    # print(f"Evaluating on a subset of {len(data_to_evaluate)} examples.")
    data_to_evaluate = dev_maud_qa_data


    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            # print(f"Skipping example with no messages: {example}") # Optional: uncomment for detailed skipping
            continue

        # The last message in the data is the assistant's response (the ground truth answer)
        # For inference with the fine-tuned model, we send messages up to the user's turn
        conversation_for_inference = [msg for msg in messages if msg.get('role') != 'assistant']

        # Extract the ground truth answer from the assistant's message
        ground_truth_answer = ""
        for msg in messages:
            if msg.get('role') == 'assistant':
                ground_truth_answer = msg.get('content', '').strip()
                break # Assuming only one assistant turn at the end


        if conversation_for_inference and ground_truth_answer:
            try:
                # Use the fine-tuned model for inference
                response = openai.chat.completions.create(
                    model=fine_tuned_maud_qa_model_name,
                    messages=conversation_for_inference,
                    max_tokens=150 # Adjust max_tokens as needed for typical answer length
                )

                # Extract the model's predicted answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_answer = response.choices[0].message.content.strip()

                    # Calculate metrics
                    f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                    total_f1 += f1
                    total_em += em
                    evaluation_count += 1

                    # Print progress
                    if (i + 1) % progress_interval == 0:
                        print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")

                    # Optional: Print individual results
                    # print(f"\nQuery: {conversation_for_inference[-1]['content'][:100]}...") # Print user query part
                    # print(f"Ground Truth: {ground_truth_answer[:100]}...")
                    # print(f"Prediction: {predicted_answer[:100]}...")
                    # print(f"F1: {f1:.4f}, EM: {em:.4f}")

                else:
                    # print(f"Skipping example {i+1} due to empty model response.") # Optional: uncomment for details
                    pass # Skip examples with no valid prediction

            except Exception as e:
                print(f"Error during inference or evaluation for example {i + 1}: {e}")
                # print(f"Example data: {conversation_for_inference}") # Optional: uncomment for details
                continue # Continue to the next example even if one fails

    # Calculate average metrics
    if evaluation_count > 0:
        average_f1 = total_f1 / evaluation_count
        average_em = total_em / evaluation_count
        print(f"\n--- Fine-tuned MAUD QA Model Evaluation Results ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Average F1 Score: {average_f1:.4f}")
        print(f"Average Exact Match Score: {average_em:.4f}")
        print(f"-------------------------------------------------")
    else:
        print("\nNo examples were successfully evaluated for the fine-tuned model.")
else:
    print("\nFine-tuned model evaluation could not be performed due to issues loading development data.")

Loaded 100 examples from /content/openai_fine-tuning_data_maud_squad_qa/dev_maud_squad_qa_chat_format.jsonl

Evaluating fine-tuned model 'ft:gpt-4.1-nano-2025-04-14:personal::BzWNsgoK' on MAUD dev set using F1 and EM...
Processed 100/100 examples...

--- Fine-tuned MAUD QA Model Evaluation Results ---
Evaluated on 100 examples.
Average F1 Score: 0.2603
Average Exact Match Score: 0.0100
-------------------------------------------------


In [None]:
import openai
import os
import json

# Ensure your API key is set
# Replace with the name of your fine-tuned PolicyQA model
fine_tuned_policyqa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW"

# Path to the policyQA development dataset
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Load the development data
dev_data = []
try:
    with open(dev_file_path, 'r') as f:
        for line in f:
            dev_data.append(json.loads(line))
    print(f"Loaded {len(dev_data)} examples from {dev_file_path}")
except FileNotFoundError:
    print(f"Error: Dev file not found at {dev_file_path}. Cannot proceed.")
    dev_data = None
except Exception as e:
    print(f"An error occurred loading dev data: {e}")
    dev_data = None

if dev_data and fine_tuned_policyqa_model_name != "":
    # Select a few examples to use as shots
    # Choose examples that are representative of the task
    # For demonstration, let's take the first 3 examples from the dev set
    num_shots = 3
    few_shot_examples = dev_data[:num_shots]

    # Prepare the few-shot messages
    # These messages will be included before the actual query in the API call
    few_shot_messages = []
    # Add the system message once at the beginning
    if few_shot_examples and few_shot_examples[0].get('messages'):
         # Assuming the system message is the first message in the loaded examples
         system_message = few_shot_examples[0]['messages'][0]
         if system_message.get('role') == 'system':
              few_shot_messages.append(system_message)

    for example in few_shot_examples:
        messages = example.get('messages', [])
        # Add user and assistant turns for each example
        for msg in messages:
            if msg.get('role') in ['user', 'assistant']:
                few_shot_messages.append({"role": msg['role'], "content": msg['content']})


    print(f"\nPrepared {len(few_shot_messages)} few-shot messages ({num_shots} examples).")

    # Now, let's try an inference with a new query from the dev set
    # We'll use the next example in the dev set as our test query
    if len(dev_data) > num_shots:
        test_example = dev_data[num_shots]
        test_messages = test_example.get('messages', [])

        # Find the user message for the test query
        test_user_message = None
        for msg in test_messages:
            if msg.get('role') == 'user':
                test_user_message = msg
                break

        # Find the ground truth assistant message for comparison
        test_ground_truth_answer = ""
        for msg in test_messages:
            if msg.get('role') == 'assistant':
                test_ground_truth_answer = msg.get('content', '').strip()
                break


        if test_user_message:
            # Construct the messages for the API call: few-shot examples + the test query
            inference_messages = few_shot_messages + [test_user_message]

            print(f"\nSending inference request to model '{fine_tuned_policyqa_model_name}' with {len(inference_messages)} messages.")

            try:
                # Use the fine-tuned model for inference with few-shot examples
                response = openai.chat.completions.create(
                    model=fine_tuned_policyqa_model_name,
                    messages=inference_messages,
                    max_tokens=150, # Adjust as needed
                    temperature=0.0 # Keep low for deterministic answers
                )

                # Extract the model's predicted answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_answer = response.choices[0].message.content.strip()

                    print("\n--- Few-Shot Inference Result ---")
                    print(f"Query: {test_user_message.get('content', '')}")
                    print(f"\nPredicted Answer: {predicted_answer}")
                    if test_ground_truth_answer:
                         print(f"\nGround Truth Answer: {test_ground_truth_answer}")
                    print("---------------------------------")

                else:
                    print("\nFew-Shot Inference failed: Model returned an empty response.")

            except Exception as e:
                print(f"\nError during few-shot inference: {e}")
                print("Please ensure your OpenAI API key is set and the model name is correct.")

        else:
            print("\nCould not find a user message in the test example.")

    else:
        print(f"\nNot enough examples in the dev set ({len(dev_data)}) to demonstrate few-shot learning with {num_shots} shots.")

else:
    print("\nFew-shot learning demonstration could not be performed.")

Loaded 3809 examples from /content/openai_finetuning_data/dev.jsonl

Prepared 7 few-shot messages (3 examples).

Sending inference request to model 'ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW' with 8 messages.

--- Few-Shot Inference Result ---
Query: Context: We collect information from and about you. Contact information. For example, we might collect your name and street address. We might also collect your phone number or email.

Question: Does the collected data reveal my identity?

Predicted Answer: name and street address.

Ground Truth Answer: name and street address
---------------------------------


## Evaluating Few-Shot Learning on PolicyQA Development Set

This section evaluates the performance of the few-shot learning approach using the fine-tuned PolicyQA model (`ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW`) on the entire PolicyQA development dataset. We will use the same F1 and Exact Match metrics as before to measure performance.

In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Replace with the name of your fine-tuned PolicyQA model
fine_tuned_policyqa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW"

# Path to the policyQA development dataset
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Function to normalize text for F1 and EM calculation (reusing from previous evaluations)
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match (reusing from previous evaluations)
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Load the development data
dev_data = []
try:
    with open(dev_file_path, 'r') as f:
        for line in f:
            dev_data.append(json.loads(line))
    print(f"Loaded {len(dev_data)} examples from {dev_file_path}")
except FileNotFoundError:
    print(f"Error: Dev file not found at {dev_file_path}. Cannot perform evaluation.")
    dev_data = None
except Exception as e:
    print(f"An error occurred loading dev data: {e}")
    dev_data = None

if dev_data and fine_tuned_policyqa_model_name != "":
    # Select a few examples to use as shots
    # For evaluation, we need to be careful not to use the test example itself as a shot.
    # We'll use the first `num_shots` examples from the dataset as the fixed few-shot examples for all inferences.
    num_shots = 3 # *** SET NUMBER OF SHOTS HERE ***
    if len(dev_data) < num_shots + 1:
        print(f"Not enough examples in the dev set ({len(dev_data)}) to perform evaluation with {num_shots} shots.")
        dev_data = None # Prevent evaluation if not enough data


if dev_data and fine_tuned_policyqa_model_name != "":
    few_shot_examples = dev_data[:num_shots]

    # Prepare the fixed few-shot messages
    fixed_few_shot_messages = []
    # Add the system message once at the beginning
    if few_shot_examples and few_shot_examples[0].get('messages'):
         system_message = few_shot_examples[0]['messages'][0]
         if system_message.get('role') == 'system':
              fixed_few_shot_messages.append(system_message)

    for example in few_shot_examples:
        messages = example.get('messages', [])
        for msg in messages:
             if msg.get('role') in ['user', 'assistant']:
                fixed_few_shot_messages.append({"role": msg['role'], "content": msg['content']})

    print(f"\nPrepared {len(fixed_few_shot_messages)} fixed few-shot messages ({num_shots} examples) for evaluation.")

    total_f1 = 0
    total_em = 0
    evaluation_count = 0
    progress_interval = 100 # Print progress every 100 examples


    print(f"\nEvaluating few-shot learning on the PolicyQA dev set using F1 and EM...")

    # Evaluate on the entire dataset starting after the few-shot examples
    data_to_evaluate = dev_data[num_shots:]
    print(f"Evaluating on {len(data_to_evaluate)} examples.")

    for i, example in enumerate(data_to_evaluate):
        messages = example.get('messages', [])
        if not messages:
            continue

        user_message_content = ""
        ground_truth_answer = ""

        # Extract user query and ground truth answer
        for msg in messages:
            if msg.get('role') == 'user':
                user_message_content = msg.get('content', '')
            elif msg.get('role') == 'assistant':
                ground_truth_answer = msg.get('content', '').strip()
                break # Assuming only one assistant turn at the end

        if user_message_content and ground_truth_answer:
            try:
                # Construct the messages for the API call: fixed few-shot examples + the current test query
                inference_messages = fixed_few_shot_messages + [{"role": "user", "content": user_message_content}]

                # Use the fine-tuned model for inference with few-shot examples
                response = openai.chat.completions.create(
                    model=fine_tuned_policyqa_model_name,
                    messages=inference_messages,
                    max_tokens=150, # Adjust as needed for typical answer length
                    temperature=0.0 # Keep low for deterministic answers during evaluation
                )

                # Extract the model's predicted answer
                if response.choices and response.choices[0].message and response.choices[0].message.content:
                    predicted_answer = response.choices[0].message.content.strip()

                    # Calculate metrics
                    f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                    total_f1 += f1
                    total_em += em
                    evaluation_count += 1

                    # Print progress
                    if (i + 1) % progress_interval == 0:
                        print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")


                else:
                    # print(f"Skipping example {i+1} due to empty model response.") # Optional: uncomment for details
                    pass # Skip examples with no valid prediction

            except Exception as e:
                print(f"Error during inference or evaluation for example {i + 1}: {e}")
                # print(f"Example data: {inference_messages}") # Optional: uncomment for details
                continue # Continue to the next example even if one fails

    # Calculate average metrics
    if evaluation_count > 0:
        average_f1 = total_f1 / evaluation_count
        average_em = total_em / evaluation_count
        print(f"\n--- Few-Shot Learning Evaluation Results on PolicyQA Dev Set ({num_shots} shots) ---")
        print(f"Evaluated on {evaluation_count} examples.")
        print(f"Average F1 Score: {average_f1:.4f}")
        print(f"Average Exact Match Score: {average_em:.4f}")
        print(f"----------------------------------------------------------------")
    else:
        print("\nNo examples were successfully evaluated for few-shot learning.")
else:
    print("\nFew-shot learning evaluation could not be performed due to issues loading development data or insufficient examples.")

Loaded 3809 examples from /content/openai_finetuning_data/dev.jsonl

Prepared 7 fixed few-shot messages (3 examples) for evaluation.

Evaluating few-shot learning on the PolicyQA dev set using F1 and EM...
Evaluating on 3806 examples.
Processed 100/3806 examples...
Processed 200/3806 examples...
Processed 300/3806 examples...
Processed 400/3806 examples...
Processed 500/3806 examples...
Processed 600/3806 examples...
Processed 700/3806 examples...
Processed 800/3806 examples...
Processed 900/3806 examples...
Processed 1000/3806 examples...
Processed 1100/3806 examples...
Processed 1200/3806 examples...
Processed 1300/3806 examples...
Processed 1400/3806 examples...
Processed 1500/3806 examples...
Processed 1600/3806 examples...
Processed 1700/3806 examples...
Processed 1800/3806 examples...
Processed 1900/3806 examples...
Processed 2000/3806 examples...
Processed 2100/3806 examples...
Processed 2200/3806 examples...
Processed 2300/3806 examples...
Processed 2400/3806 examples...
Proces

# Task
Experiment with different numbers of shots for few-shot learning on the PolicyQA development dataset using the fine-tuned gpt-4.1-nano-2025-04-14 model, evaluate the performance (F1 and Exact Match) for each number of shots, and suggest further research ideas in the context of fine-tuning language models for legal QA and RAG systems.

## Experiment with different numbers of shots

### Subtask:
Experiment with different numbers of shots for few-shot learning on the PolicyQA development dataset using the fine-tuned gpt-4.1-nano-2025-04-14 model, evaluate the performance (F1 and Exact Match) for each number of shots, and suggest further research ideas in the context of fine-tuning language models for legal QA and RAG systems.


In [None]:
import openai
import os
import json
from collections import Counter
import re

# Ensure your API key is set
# openai.api_key = os.getenv('OPENAI_API_KEY') # Assuming API key is set globally or in a previous cell

# Replace with the name of your fine-tuned PolicyQA model
fine_tuned_policyqa_model_name = "ft:gpt-4.1-nano-2025-04-14:personal::BzGfWaZW"

# Path to the policyQA development dataset
dev_file_path = '/content/openai_finetuning_data/dev.jsonl'

# Function to normalize text for F1 and EM calculation (reusing from previous evaluations)
def normalize_text(text):
    """Lowercases, removes punctuation and extra whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Function to calculate F1 and Exact Match (reusing from previous evaluations)
def calculate_f1_em(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_common = sum(common.values())

    if num_common == 0:
        return 0, 0

    precision = num_common / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
    recall = num_common / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0

    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    exact_match = 1 if prediction_tokens == ground_truth_tokens else 0

    return f1, exact_match

# Load the development data once
dev_data = []
try:
    with open(dev_file_path, 'r') as f:
        for line in f:
            dev_data.append(json.loads(line))
    print(f"Loaded {len(dev_data)} examples from {dev_file_path}")
except FileNotFoundError:
    print(f"Error: Dev file not found at {dev_file_path}. Cannot perform evaluation.")
    dev_data = None
except Exception as e:
    print(f"An error occurred loading dev data: {e}")
    dev_data = None

if dev_data and fine_tuned_policyqa_model_name != "":
    # Define different numbers of shots to experiment with
    shot_counts = [1, 3, 5, 10] # *** DEFINE SHOT COUNTS HERE ***
    evaluation_results = {}

    for num_shots in shot_counts:
        print(f"\n--- Evaluating with {num_shots} shots ---")

        if len(dev_data) < num_shots + 1:
            print(f"Not enough examples in the dev set ({len(dev_data)}) to perform evaluation with {num_shots} shots. Skipping this shot count.")
            continue # Skip to the next shot count

        few_shot_examples = dev_data[:num_shots]

        # Prepare the fixed few-shot messages
        fixed_few_shot_messages = []
        # Add the system message once at the beginning
        if few_shot_examples and few_shot_examples[0].get('messages'):
             system_message = few_shot_examples[0]['messages'][0]
             if system_message.get('role') == 'system':
                  fixed_few_shot_messages.append(system_message)

        for example in few_shot_examples:
            messages = example.get('messages', [])
            for msg in messages:
                 if msg.get('role') in ['user', 'assistant']:
                    fixed_few_shot_messages.append({"role": msg['role'], "content": msg['content']})

        print(f"Prepared {len(fixed_few_shot_messages)} fixed few-shot messages ({num_shots} examples) for evaluation.")

        total_f1 = 0
        total_em = 0
        evaluation_count = 0
        progress_interval = 100 # Print progress every 100 examples


        # Evaluate on the entire dataset starting after the few-shot examples
        data_to_evaluate = dev_data[num_shots:]
        print(f"Evaluating on {len(data_to_evaluate)} examples.")

        for i, example in enumerate(data_to_evaluate):
            messages = example.get('messages', [])
            if not messages:
                continue

            user_message_content = ""
            ground_truth_answer = ""

            # Extract user query and ground truth answer
            for msg in messages:
                if msg.get('role') == 'user':
                    user_message_content = msg.get('content', '')
                elif msg.get('role') == 'assistant':
                    ground_truth_answer = msg.get('content', '').strip()
                    break # Assuming only one assistant turn at the end

            if user_message_content and ground_truth_answer:
                try:
                    # Construct the messages for the API call: fixed few-shot examples + the current test query
                    inference_messages = fixed_few_shot_messages + [{"role": "user", "content": user_message_content}]

                    # Use the fine-tuned model for inference with few-shot examples
                    response = openai.chat.completions.create(
                        model=fine_tuned_policyqa_model_name,
                        messages=inference_messages,
                        max_tokens=150, # Adjust as needed for typical answer length
                        temperature=0.0 # Keep low for deterministic answers during evaluation
                    )

                    # Extract the model's predicted answer
                    if response.choices and response.choices[0].message and response.choices[0].message.content:
                        predicted_answer = response.choices[0].message.content.strip()

                        # Calculate metrics
                        f1, em = calculate_f1_em(predicted_answer, ground_truth_answer)

                        total_f1 += f1
                        total_em += em
                        evaluation_count += 1

                        # Print progress
                        if (i + 1) % progress_interval == 0:
                            print(f"Processed {i + 1}/{len(data_to_evaluate)} examples...")


                    else:
                        # print(f"Skipping example {i+1} due to empty model response.") # Optional: uncomment for details
                        pass # Skip examples with no valid prediction

                except Exception as e:
                    print(f"Error during inference or evaluation for example {i + 1}: {e}")
                    # print(f"Example data: {inference_messages}") # Optional: uncomment for details
                    continue # Continue to the next example even if one fails

        # Calculate average metrics for the current number of shots
        if evaluation_count > 0:
            average_f1 = total_f1 / evaluation_count
            average_em = total_em / evaluation_count
            evaluation_results[num_shots] = {"average_f1": average_f1, "average_em": average_em}
            print(f"\n--- Results for {num_shots} shots ---")
            print(f"Evaluated on {evaluation_count} examples.")
            print(f"Average F1 Score: {average_f1:.4f}")
            print(f"Average Exact Match Score: {average_em:.4f}")
            print(f"---------------------------------------")
        else:
            print(f"\nNo examples were successfully evaluated for {num_shots} shots.")
            evaluation_results[num_shots] = {"average_f1": None, "average_em": None}

    # Print summary of results across different shot counts
    print("\n--- Summary of Few-Shot Learning Evaluation Results ---")
    for shots, results in evaluation_results.items():
        if results["average_f1"] is not None:
            print(f"{shots} shots: Average F1 = {results['average_f1']:.4f}, Average EM = {results['average_em']:.4f}")
        else:
            print(f"{shots} shots: Evaluation failed.")
    print("----------------------------------------------------")

else:
    print("\nFew-shot learning evaluation could not be performed due to issues loading development data or insufficient examples.")

Loaded 3809 examples from /content/openai_finetuning_data/dev.jsonl

--- Evaluating with 1 shots ---
Prepared 3 fixed few-shot messages (1 examples) for evaluation.
Evaluating on 3808 examples.
Processed 100/3808 examples...
Processed 200/3808 examples...
Processed 300/3808 examples...
Processed 400/3808 examples...
Processed 500/3808 examples...
Processed 600/3808 examples...
Processed 700/3808 examples...
Processed 800/3808 examples...
Processed 900/3808 examples...
Processed 1000/3808 examples...
Processed 1100/3808 examples...
Processed 1200/3808 examples...
Processed 1300/3808 examples...
Processed 1400/3808 examples...
Processed 1500/3808 examples...
Processed 1600/3808 examples...
Processed 1700/3808 examples...
Processed 1800/3808 examples...
Processed 1900/3808 examples...
Processed 2000/3808 examples...
Processed 2100/3808 examples...
Processed 2200/3808 examples...
Processed 2300/3808 examples...
Processed 2400/3808 examples...
Processed 2500/3808 examples...
Processed 2600/

## Present results

### Subtask:
Present the evaluation results for each number of shots in a clear format.


**Reasoning**:
Present the evaluation results for each number of shots in a clear format using the `evaluation_results` dictionary.



In [None]:
# Assuming evaluation_results dictionary is available from the previous cell execution
# It should have the format: {num_shots: {"average_f1": f1_score, "average_em": em_score}}

if 'evaluation_results' in locals() and evaluation_results:
    print("\n--- Few-Shot Learning Evaluation Results on PolicyQA Dev Set ---")
    print("{:<10} {:<15} {:<15}".format("Shots", "Average F1", "Average EM"))
    print("-" * 40)

    # Sort results by number of shots for clear presentation
    sorted_results = sorted(evaluation_results.items())

    for shots, results in sorted_results:
        if results["average_f1"] is not None:
            print("{:<10} {:<15.4f} {:<15.4f}".format(shots, results['average_f1'], results['average_em']))
        else:
            print("{:<10} {:<15} {:<15}".format(shots, "N/A", "N/A"))

    print("-" * 40)
else:
    print("\nEvaluation results not found. Please ensure the few-shot evaluation cell ran successfully.")



--- Few-Shot Learning Evaluation Results on PolicyQA Dev Set ---
Shots      Average F1      Average EM     
----------------------------------------
1          0.4548          0.1098         
3          0.4381          0.0970         
5          0.4408          0.1023         
10         0.4415          0.1006         
----------------------------------------
