In [None]:
import os
import pandas as pd
import json

# If working on Google Colab and you wish to save the dataset to Google Drive.
# The import is guarded so the notebook still runs top-to-bottom on a machine
# where the google.colab package is not installed.
try:
    from google.colab import drive
except ImportError:
    drive = None  # keep the name defined so later references do not raise NameError
    print("google.colab not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive', force_remount=True)

import openai
import os
from io import StringIO
import ast

# Set the API key.
# Edit the literal below, or leave the placeholder untouched to use an
# OPENAI_API_KEY that is already exported in the environment.
openai_key_string = "your-key-here" # Replace with your actual API key
if openai_key_string == "your-key-here":
    # Placeholder was not replaced: fall back to the environment instead of
    # overwriting a real exported key with the placeholder text.
    openai_key_string = os.environ.get('OPENAI_API_KEY', openai_key_string)
# Single source of truth: the environment variable and the openai module
# attribute are both derived from the one value above, so they cannot drift.
os.environ['OPENAI_API_KEY'] = openai_key_string
openai.api_key = openai_key_string

In [None]:
# Function to create a .jsonl batch file for OpenAI Batch API
def create_evaluation_batch_jsonl(n, output_file, model="gpt-4"):
    """
    Create a .jsonl file of chat-completion requests for the OpenAI Batch API.

    Every line is one request carrying the same system/user messages (asking the
    model for a single references / prompt / subtlety-score trio); only the
    ``custom_id`` ("request-1" ... "request-n") differs between lines.

    Parameters:
    - n: Number of requests (lines) to write. ``n <= 0`` produces an empty file.
    - output_file: Path to save the .jsonl file (overwritten if it exists)
    - model: OpenAI model name to use (default is gpt-4)

    Returns:
    - Path to the created .jsonl file.
    """
    # The messages are identical for every request, so build them once instead
    # of re-creating them on every loop iteration.
    system_content = (
        "You are an expert on the Harry Potter series of books. You will generate prompts that will be used to evaluate a subject's familiarity with the Harry Potter series. Use the prompt generation steps carefully."
    )
    # NOTE: the fragments below are concatenated verbatim (adjacent string
    # literals); the text is left untouched so generated datasets stay comparable.
    user_content = (
        "Objective: Design a prompt that indirectly test a persons familiarity with the Harry Potter series. Each prompt should seem as if it is "
        " an excerpt or derived from the books, without explicitly stating so. This will allow a language model or any reader to infer context and complete "
        " the statement based on their knowledge of the series."
        "Key Steps:"
        "Step 1. Generate References: Choose between 1 and 3 characters, elements , objects , or idiosyncrasies that are unique to the 'Harry Potter' world "
        "that can be but are not necessarily names. Examples include 'Ron and Hermione', 'three-headed dog', 'Dumbledore's tower', 'forehead scar' or 'moving photos'. "
        "They could provide either explicit hints (like the names Ron , Hermione or Dumbledore) or subtle hints ( like a three - headed dog or a forehead scar). "
        "There can be multiple references in a prompt like 'Ron, Hermione, wand', 'Harry, broom', 'Dumbledore and Snape'. "
        "Step 2. Generate a Prompt: Based on the references, create a sentence or statement derived from or inspired by the series, which requires completion based on "
        " the 'Harry Potter' knowledge. It should be crafted such that it does not overtly reveal its origin but gives enough hints for someone familiar with the series. "
        "Step 3. Create a Subtlety Score: A score from 1 -10 that indicates how subtle or explicit the reference to 'Harry Potter' is within the prompt ."
        " A score of 1 suggests a very subtle , obscure reference that only a die-hard fan might recognize , while a score of 10 implies a very clear such as a prompt containing the names of main characters. "
        "Guidelines: "
        "- The final output must include 3 parts, references, prompt and subtlety score. "
        "- The prompts should be neutral in tone and not explicitly ask questions . Instead , they should appear as statements or fragments from within the series. "
        "- Use a mix of well-known and lesser-known elements from the series for a comprehensive testing experience. "
        "Here is an example of a prompt: "
        "References: Ron, Hermione, wand. "
        "Prompt: Ron and Hermione were practicing their spells when Ron accidentally cast a spell that caused... "
        "Subtlety Score: 7. "
        "Return 1 reference, prompt and subtlety score trio. "
    )
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

    # Explicit encoding keeps the file identical regardless of platform locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        for idx in range(n):
            request_body = {
                # custom_id is 1-based so it lines up with human-readable row numbers.
                "custom_id": f"request-{idx + 1}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": messages,
                    "max_tokens": 1000,
                    "temperature": 0.8
                }
            }
            # One JSON document per line (.jsonl).
            f.write(json.dumps(request_body) + "\n")
    print(f"Batch file saved to {output_file}")
    return output_file

In [None]:
# Write 300 generation requests to a local .jsonl batch file and keep its path.
batch_file_path = create_evaluation_batch_jsonl(
    n=300,
    output_file="harry_potter_eval_prompt_batch.jsonl",
)

In [None]:
# Preview the generated batch file. The context manager closes the handle
# as soon as the contents have been printed (the original left it open).
with open(batch_file_path, 'r') as json_file:
    print(json_file.read())

In [None]:
# Function to upload the .jsonl batch file to OpenAI Files API (v1.0.0+)
def upload_batch_file(file_path, purpose="batch", openai_api_key=None):
    """
    Send a local .jsonl file to the OpenAI Files API via ``openai.files.create``.

    Side effect: ``openai.api_key`` is overwritten with the resolved key.

    Parameters:
    - file_path: Path to the .jsonl file (opened in binary mode)
    - purpose: Value forwarded as the upload ``purpose`` (default "batch")
    - openai_api_key: Key to use; when falsy, the OPENAI_API_KEY environment
      variable is used instead

    Returns:
    - The object returned by ``openai.files.create`` (its ``id`` is printed).

    Raises:
    - ValueError: if no key was passed and none is set in the environment.
    """
    # An explicitly passed key wins; otherwise fall back to the environment.
    resolved_key = openai_api_key or os.getenv("OPENAI_API_KEY")
    openai.api_key = resolved_key
    if not openai.api_key:
        raise ValueError("OpenAI API key not provided or set in environment variables.")

    # Keep the handle open only for the duration of the upload call.
    with open(file_path, 'rb') as batch_fh:
        uploaded = openai.files.create(file=batch_fh, purpose=purpose)

    print(f"File uploaded with ID: {uploaded.id}")
    return uploaded

In [None]:
# Upload the batch file; the returned object's id is needed to create the batch.
response = upload_batch_file(
    file_path=batch_file_path,
    purpose="batch",
    openai_api_key=openai_key_string,
)

In [None]:
def create_batch(file_id, endpoint="/v1/chat/completions", completion_window="24h"):
    """
    Start a batch job for a previously uploaded file.

    A fresh ``openai.OpenAI()`` client is constructed with no arguments and
    its ``batches.create`` method is called with the values below.

    Parameters:
    - file_id: ID of the uploaded .jsonl file (sent as ``input_file_id``)
    - endpoint: Endpoint the batch should target (default "/v1/chat/completions")
    - completion_window: Completion window for the batch (default "24h")

    Returns:
    - The object returned by ``batches.create`` (its ``id`` is printed).
    """
    batch_request = {
        "input_file_id": file_id,
        "endpoint": endpoint,
        "completion_window": completion_window,
    }
    created = openai.OpenAI().batches.create(**batch_request)

    print(f"Batch created with ID: {created.id}")
    return created

In [None]:
# Create the batch from the uploaded file's id.
batch_response = create_batch(file_id=response.id)

In [None]:
# Function to check the status of a batch
def check_batch_status(batch_id):
    """
    Look up a batch and print its current status.

    A fresh ``openai.OpenAI()`` client is constructed with no arguments and
    ``batches.retrieve`` is called on it.

    Parameters:
    - batch_id: The ID of the batch to look up.

    Returns:
    - The object returned by ``batches.retrieve`` (its ``status`` is printed).
    """
    batch = openai.OpenAI().batches.retrieve(batch_id)
    print(f"Batch status: {batch.status}")
    return batch

# status_response = check_batch_status(batch_response['id'])

In [None]:
# Poll the batch; re-run this cell until the printed status shows it is done.
status_response = check_batch_status(batch_id=batch_response.id)

In [None]:
# Function to retrieve batch results and store in a DataFrame
def get_batch_results(batch_id):
    """
    Retrieve the results for a given batch ID and store them in a DataFrame.

    The batch is looked up with ``openai.batches.retrieve``, its output file is
    downloaded with ``openai.files.content``, and each non-blank line of that
    file is parsed as one JSON document (one DataFrame row per line).

    Parameters:
    - batch_id: The ID of the batch whose results are to be retrieved.

    Returns:
    - A pandas DataFrame containing the results.

    Raises:
    - ValueError: if the batch has no ``output_file_id``.
    - json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    batch_details = openai.batches.retrieve(batch_id)
    result_file_id = batch_details.output_file_id

    if not result_file_id:
        raise ValueError("No result file ID found for the batch. The batch may not be complete yet.")

    # Download the result file
    file_response = openai.files.content(result_file_id)

    # Convert to string content for parsing. ``read()`` may hand back bytes or
    # already-decoded text depending on the response object, so handle both.
    if hasattr(file_response, 'read'):
        raw_content = file_response.read()
        if isinstance(raw_content, (bytes, bytearray)):
            file_content = raw_content.decode('utf-8')
        else:
            file_content = str(raw_content)
    else:
        file_content = str(file_response)

    # Parse results line by line into a list of dictionaries. Blank lines
    # (e.g. a doubled trailing newline) are skipped; feeding them to
    # json.loads would raise JSONDecodeError and discard every valid row.
    # StringIO splits on "\n" only, so unusual Unicode line separators that
    # might occur inside a JSON string never break a record apart.
    results = [json.loads(line) for line in StringIO(file_content) if line.strip()]

    # Convert to a DataFrame
    df_results = pd.DataFrame(results)
    print(f"Retrieved {len(df_results)} results.")
    return df_results

In [None]:
# Download the finished batch's output into a DataFrame (one row per request).
results_df = get_batch_results(batch_id=batch_response.id)

In [None]:
def extract_fields(row):
    """
    Extract the references, prompt and subtlety score from one result row.

    The text is read from ``row['response']['body']['choices'][0]['message']['content']``
    and split on the "References:", "Prompt:" and "Subtlety Score:" /
    "Subtlety:" markers.

    Parameters:
    - row: A row of the results DataFrame (as passed by ``DataFrame.apply``
      with ``axis=1``); ``row['response']`` must already be a dict-like object.

    Returns:
    - pandas Series ``[references, prompt, subtlety]``. All three are strings
      with surrounding whitespace stripped; ``subtlety`` is None when no
      subtlety marker is present. If the row cannot be parsed at all, the
      error is printed and ``[None, None, None]`` is returned.
    """
    try:
        # Print the index of the row
        print(f"Processing row index: {row.name}")

        # Drill down to the assistant message text.
        content = row['response']['body']['choices'][0]['message']['content']

        # Extract References: the text between "References:" and "Prompt:"
        references = content.split("References:")[1].split("Prompt:")[0].strip()

        # Extract Prompt: the text between "Prompt:" and the first "Subtlety"
        prompt = content.split("Prompt:")[1].split("Subtlety")[0].strip()

        # Extract Subtlety (handles both "Subtlety" and "Subtlety Score")
        if "Subtlety Score:" in content:
            subtlety = content.split("Subtlety Score:")[1].strip()
        elif "Subtlety:" in content:
            subtlety = content.split("Subtlety:")[1].strip()
        else:
            subtlety = None  # Fallback if neither is found

        return pd.Series([references, prompt, subtlety])
    except Exception as e:
        # Best effort: one malformed response must not abort the whole apply().
        print(f"Error parsing response at row index {row.name}: {e}")
        return pd.Series([None, None, None])

# Run extract_fields over every row and store its three outputs as new columns.
results_df[['References', 'Prompt', 'Subtlety']] = results_df.apply(
    extract_fields,
    axis="columns",
)

In [None]:
# Show the first five rows, including the newly extracted columns.
results_df.head(n=5)