In [None]:
import pandas as pd
from tqdm import tqdm
import json
from datetime import datetime
import time
import matplotlib.pyplot as plt
import random
import requests
import ast
import logging
from dotenv import load_dotenv
import re
import os
from pathlib import Path
from glob import glob
import numpy as np
from openai import OpenAI
from openai import RateLimitError, APIError, APIConnectionError, APITimeoutError, InternalServerError

# API keys
Manage your API keys in a `.env` file, which the helper below can create or update programmatically. Keeping keys out of the notebook code reduces the risk of leaking them when the notebook is shared. After updating the file, restart the kernel and reload the keys so the new values are set as environment variables.

In [None]:
def add_or_update_token(env_file, token_name, token_value):
    """Insert or overwrite a ``NAME=value`` entry in a .env file.

    The file is created (empty) when it does not exist yet. If a line starting
    with ``token_name=`` is found, the first such line is replaced; otherwise a
    new line is appended at the end of the file.

    Args:
        env_file: Path of the .env file to read and rewrite.
        token_name: Variable name to set (the part before ``=``).
        token_value: Value to store for that variable.

    Returns:
        None. Progress is reported with ``print``.
    """
    env_file_path = Path(env_file)
    if not env_file_path.exists():
        print(f"{env_file} does not exist.")
        # Create an empty .env file so it can be read below.
        env_file_path.touch()
        print(f".env file created at {env_file_path}")
    else:
        print(f"{env_file} exists.")

    with open(env_file_path, 'r') as file:
        lines = file.readlines()

    new_entry = f"{token_name}={token_value}\n"

    # Replace the first existing entry for this token, if any.
    token_exists = False
    for i, line in enumerate(lines):
        if line.startswith(f"{token_name}="):
            print(f"{token_name} already exists.")
            lines[i] = new_entry
            token_exists = True
            break

    if not token_exists:
        print(f"Add {token_name}.")
        # A last line without a trailing newline would otherwise be merged
        # with the new entry (e.g. "A=1B=2"), corrupting both variables.
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        lines.append(new_entry)

    with open(env_file_path, 'w') as file:
        file.writelines(lines)

    print(f"Token {token_name} has been {'updated' if token_exists else 'added'} successfully.")

In [None]:
# Create and store keys
# Replace both placeholders before running: the first argument is the path of
# the .env file to write, the last one is the value stored under OPENAI_API_KEY.
# NOTE(review): the key is typed into this cell as a literal, so clear the cell
# (and its output) before sharing or committing the notebook.
add_or_update_token('your .env file path', 'OPENAI_API_KEY', 'your API key')

In [None]:
# Load keys
folder_path = '../APIS'
env_file_path = os.path.join(folder_path, '.env')

# Fail early, with the offending path in the message, when the file is missing.
if not os.path.isfile(env_file_path):
    raise FileNotFoundError(f"No .env file found at {env_file_path}")
print('.env file exists')

load_dotenv(dotenv_path=env_file_path)
openai_api_key = os.getenv('OPENAI_API_KEY')
# openai_org_id = os.getenv('OPENAI_ORG_ID') # add if needed

# os.environ only accepts strings: assigning None would raise an opaque
# TypeError, so report the real problem instead.
if not openai_api_key:
    raise ValueError(f"OPENAI_API_KEY is missing or empty in {env_file_path}")

# Set as environment variables
os.environ['OPENAI_API_KEY'] = openai_api_key
# os.environ['OPENAI_ORG_ID'] = openai_org_id # add if needed

# Load and preprocess your data

In [None]:
# Load the raw posts; replace the placeholder with the path of your CSV file.
# createTask() below reads the `id` and `text` attributes of every row, so the
# frame needs both columns by the time tasks are created.
data = pd.read_csv('your raw data')

# Setup model

In [None]:
# Initializing OpenAI client
# No arguments are passed here.
# NOTE(review): presumably the client picks up OPENAI_API_KEY from the
# environment variable set in the key-loading cell — confirm.
client = OpenAI()

# Choose your model
# createTask() puts this name into the "model" field of every request body.
model_name = "gpt-4o"

In [None]:
# For example

# System prompt sent with every request (see createTask). It asks the model for
# one integer probability per topic and shows the expected JSON layout. The
# requests use response_format "json_object", so the prompt must keep asking
# for JSON explicitly.
label_system_prompt = '''
As a political researcher analyzing U.S. elections, your goal is to analyze the topics in the provided post.
Please provide the probability that the given text refers to a specific topic, with 1% indicating very unlikely and 99% indicating extremely likely.
You will need to output a JSON object.

{
    "election": int,    // The probability that the text references the 2024 Presidential Election.
    "protest": int,     // The probability that the text references some form of protest.
    "nation": int,      // The probability that the text references country/countries.
    "technology": int,  // The probability that the text references technology.
    "religion": int,    // The probability that the text references religious themes or religious symbols.
    "war": int,         // The probability that the text references ongoing wars such as Israel/Gaza, Russia/Ukraine, or China/Taiwan.
    "politicians": int, // The probability that the text references political rallies or specific politicians.
    "energy": int,      // The probability that the text references climate change or energy issues.
    "economy": int,     // The probability that the text references economic issues.
    "immigration": int, // The probability that the text references immigration.
    "healthcare": int,  // The probability that the text references healthcare.
    "abortion": int,    // The probability that the text references abortion.
    "race": int,        // The probability that the text references issues of race.
    "others": int       // The probability that the text references topics not listed above.
}
'''

# Create tasks
- Ensure `custom_id` is unique, and no null values exist in `custom_id` or `messages`. 
- Each task must be written as one JSON object per line in a `.jsonl` file.

In [None]:
def createTask(row):
    """Build one Batch API request (a chat-completions call) for a data row.

    Args:
        row: Object exposing ``id`` and ``text`` attributes, e.g. a row
            yielded by ``DataFrame.iterrows()``.

    Returns:
        dict: Request description with the keys ``custom_id``, ``method``,
        ``url`` and ``body``. The model name and the system prompt come from
        the notebook-level ``model_name`` and ``label_system_prompt``.
    """
    system_message = {"role": "system", "content": label_system_prompt}
    # The text to analyze is sent as the user turn.
    user_message = {"role": "user", "content": row.text}

    request_body = {
        "model": model_name,
        "temperature": 0.1,
        "max_tokens": 150,
        "response_format": {"type": "json_object"},
        "messages": [system_message, user_message],
    }

    return {
        # The id must be a string and unique across the tasks.
        "custom_id": str(row.id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

In [None]:
# Build one request per row of `data`, showing a progress bar over the rows.
tasks = [
    createTask(row)
    for _, row in tqdm(data.iterrows(), total=len(data), desc='Creating tasks')
]

print(f'Number of tasks: {len(tasks)}')

In [None]:
# Save tasks
file_name = 'your_batch_tasks.jsonl'

# One JSON document per line, each line terminated by a newline.
with open(file_name, 'w') as f:
    f.writelines(json.dumps(task) + '\n' for task in tasks)

# Batch Functions for OpenAI API

1. **`writeBatchFile()`**  
   Writes one chunk of tasks to a `.jsonl` file (one request per line) inside the `batch_tasks` directory. The driver loop further below splits the full task list into chunks of `batch_size`, so each file holds a manageable number of requests.


2. **`uploadBatchFile()`**  
   Uploads the batch files to the OpenAI batch API.


3. **`createBatchJob()`**  
   Initiates a batch job on the OpenAI batch API using the uploaded batch files.


4. **`checkLastBatchTask()`**  
   Finds the largest numeric suffix among the batch files already saved in `batch_tasks`, so that newly written files continue the numbering from there instead of overwriting earlier ones.

In [None]:
def writeBatchFile(batch_data, file_name, directory='batch_tasks'):
    """Write a batch of request objects to ``directory/file_name`` as JSON Lines.

    The directory (including missing parents) is created when needed. An
    ``IOError`` raised while writing is reported with ``print`` and swallowed.

    Args:
        batch_data: Sequence of JSON-serializable objects, one per output line.
        file_name: Name of the file to create inside ``directory``.
        directory: Target folder; defaults to ``'batch_tasks'``.

    Returns:
        None.
    """
    target_dir = Path(directory)
    target_dir.mkdir(parents=True, exist_ok=True)
    file_path = target_dir / file_name

    try:
        with file_path.open('w') as handle:
            # Every object becomes exactly one newline-terminated line.
            handle.writelines(f"{json.dumps(request)}\n" for request in batch_data)
        print(f'File {file_path} created with {len(batch_data)} requests.', end='\n\n')
    except IOError as e:
        print(f"Error writing to file {file_path}: {e}")

In [None]:
# Upload batch file
def uploadBatchFile(file_name, directory='batch_tasks'):
    """Upload ``directory/file_name`` through ``client.files.create``.

    The file is sent with ``purpose="batch"``.

    Args:
        file_name: Name of the batch file inside ``directory``.
        directory: Folder holding the file; defaults to ``'batch_tasks'``.

    Returns:
        The object returned by ``client.files.create``, or ``None`` when the
        file does not exist or the upload raised an exception (the error is
        printed, not re-raised).
    """
    source = Path(directory) / file_name

    if not source.exists():
        print(f"Error: File {source} does not exist.")
        return None

    print('Uploading batch file...')

    try:
        with source.open('rb') as stream:
            uploaded = client.files.create(file=stream, purpose="batch")
        print(f'Batch file name: {uploaded.filename}')
        print(f'Batch file ID: {uploaded.id}')
        print(f'Batch file status: {uploaded.status}', end='\n\n')
    except Exception as e:
        print(f"Error uploading file: {e}")
        return None
    else:
        return uploaded

In [None]:
# Create batch job
# Do not change endpoint or completion_window here until OpenAI provides more choices
def createBatchJob(batch_file, endpoint="/v1/chat/completions", completion_window="24h"):
    """Create a batch job for an uploaded file via ``client.batches.create``.

    Args:
        batch_file: Uploaded file object; its ``id`` is used as the input file.
        endpoint: Endpoint the batch requests target.
        completion_window: Completion window passed to the API.

    Returns:
        The object returned by ``client.batches.create``, or ``None`` when any
        exception was raised (the error is printed, not re-raised).
    """
    print('Creating batch job...')
    try:
        job_params = {
            "input_file_id": batch_file.id,
            "endpoint": endpoint,
            "completion_window": completion_window,
        }
        batch_job = client.batches.create(**job_params)
        print(f'Batch job ID: {batch_job.id}')
    except Exception as e:
        print(f"Error creating batch job: {e}")
        return None
    else:
        return batch_job

In [None]:
def checkBatchJobStatus(batch_job_id, check_interval=3): #unit of check_interval: minute
    """Poll a batch job until it reaches a final status.

    The job is retrieved with ``client.batches.retrieve`` on every iteration.
    All retrievals happen inside the retry loop, so a failing request is
    retried instead of aborting the polling.

    Args:
        batch_job_id: ID of the batch job to poll.
        check_interval: Minutes to wait between polls, and before retrying
            after an error. While the job is ``finalizing`` the wait is fixed
            at 2 minutes.

    Returns:
        str: The lower-cased final status, one of ``completed``, ``failed``,
        ``expired`` or ``cancelled``.

    Note:
        Errors raised while polling are printed and retried without a limit,
        so a permanent error (for instance an unknown job ID) keeps this
        function looping until it is interrupted.
    """
    final_statuses = {'completed', 'failed', 'expired', 'cancelled'}

    while True:
        try:
            batch_job = client.batches.retrieve(batch_job_id)
            current_status = batch_job.status.lower()

            print(f"Current status of job {batch_job_id}: {current_status}")

            if current_status in final_statuses:
                print(f"Job {batch_job_id} has reached a final status: {current_status}")
                return current_status

            if current_status == 'finalizing':
                print(f"Job {batch_job_id} is finalizing. Checking again in 2 minutes...")
                time.sleep(2 * 60)

            else:
                print(f"Job {batch_job_id} is still {current_status}. Checking again in {check_interval} minutes...")
                time.sleep(check_interval * 60)  # Convert minutes to seconds

        except Exception as e:
            print(f"An error occurred while checking job {batch_job_id}: {str(e)}")
            print(f"Retrying in {check_interval} minutes...")
            time.sleep(check_interval * 60)

        print('===========================', end='\n\n')

In [None]:
def checkLastBatchTask(directory='batch_tasks', prefix='your task name'):
    """Return the largest numeric suffix among saved batch task files.

    Files are expected to be named ``<prefix><number>.jsonl`` inside
    ``directory``; files that do not match are ignored.

    Args:
        directory: Folder holding the batch task files.
        prefix: Text that precedes the number in the file names.

    Returns:
        int: The largest number found, or ``-1`` when the directory does not
        exist or contains no matching file.
    """
    # Check the number of previous batch jobs (glob yields [] for a missing folder).
    pre_batch_jobs = glob(os.path.join(directory, '*.jsonl'))
    print(f'Number of previous batch jobs: {len(pre_batch_jobs)}')

    # On a first run the folder has not been created yet; os.listdir would raise.
    if not os.path.isdir(directory):
        print(f"Directory {directory} does not exist.")
        return -1

    # The prefix is escaped so it is always matched literally.
    pattern = re.compile(re.escape(prefix) + r'(\d+)\.jsonl$')

    largest_id = -1
    for filename in os.listdir(directory):
        match = pattern.search(filename)
        if match:
            largest_id = max(largest_id, int(match.group(1)))

    print(f"The largest ID found is: {largest_id}")
    return largest_id

In [None]:
# Largest numeric suffix among the batch files already saved (-1 when none);
# the driver loop below numbers new files starting right after it.
largest_id = checkLastBatchTask()

In [None]:
# Set the maximum number of requests
batch_size = 20000 # adjust the size here according to your permit level
num_files = (len(tasks) + batch_size - 1) // batch_size  # ceiling division
print(f'Total requests: {len(tasks)}. Batch size: {batch_size}. Separated in {num_files} files.', end='\n\n')

# batch_job_records = []

# Save data in chunks; each chunk is written, uploaded, submitted and then
# polled until it reaches a final status before the next chunk is started.
for i in range(num_files):
    start_index = i * batch_size
    end_index = start_index + batch_size

    # Slice the list to get the current batch
    batch_data = tasks[start_index:end_index]

    # Create a filename that continues the numbering of existing batch files
    file_name = f'your task name{i+1+largest_id}.jsonl'

    # record the batch job if required
#     batch_job_records.append({
#         'file_name': file_name,
#         'start_index': start_index,
#         'end_index': end_index
#     })

    # Write batch tasks
    writeBatchFile(batch_data, file_name)

    # Upload batch file; an upload failure stops the whole run.
    batch_file = uploadBatchFile(file_name)
    if not batch_file:
        print(f"Failed to upload file {file_name}. Stopping batch processing.")
        break
        # continue

    # Create batch job
    batch_job = createBatchJob(batch_file)
    if not batch_job:
        # createBatchJob() returns None on failure and has already printed the
        # error, so there is no job object to inspect here.
        print(f"Failed to create batch job for file {file_name}.")
        continue

    # Check batch job status, until it reaches a final status
    final_status = checkBatchJobStatus(batch_job.id)
    print(f"Batch job {batch_job.id} is {final_status}.")

print("Batch processing completed.")

# save batch job records as a jsonl file
# with open('batch_job_records.jsonl', 'w') as f:
#     for obj in batch_job_records:
#         f.write(json.dumps(obj) + '\n')
# print('Batch job records saved.')

# Retrieve the results

In [None]:
# Folder where the downloaded batch results are stored; it is scanned by
# checkLastBatchOutput() and written to by the download loop below.
output_directory = 'batch_output'

In [None]:
def checkLastBatchOutput(output_directory):
    """Return the largest numeric suffix among saved batch result files.

    Result files are expected to be named ``your task result<number>.json``.

    Args:
        output_directory: Folder that holds the downloaded result files.

    Returns:
        int: The largest number found, or ``-1`` when the folder does not
        exist or holds no matching file.
    """
    # Report how many .json files are already there (none if the folder is missing).
    existing_outputs = glob(f'{output_directory}/*.json')
    print(f'Number of previous batch output: {len(existing_outputs)}')

    if not os.path.exists(output_directory):
        print(f"Directory {output_directory} does not exist.")
        return -1

    name_pattern = re.compile(r'your task result(\d+)\.json$')
    hits = (name_pattern.search(name) for name in os.listdir(output_directory))
    # -1 doubles as the "nothing matched" marker.
    largest_id = max((int(hit.group(1)) for hit in hits if hit), default=-1)

    print(
        "No valid IDs found in the directory."
        if largest_id == -1
        else f"The largest ID found is: {largest_id}"
    )

    return largest_id

In [None]:
# Largest numeric suffix among the result files already saved (-1 when none).
# This rebinds `largest_id`, which earlier held the value for the task files;
# the download loop below numbers new result files starting right after it.
largest_id = checkLastBatchOutput(output_directory)

In [None]:
# Records of the form [batch id, created_at, output file id] for every
# completed batch that actually produced an output file.
output_files = []

# Fetch batches from the client.
# Replace 'specific_batch_id' with the ID just above your desired batch ID
# (the OpenAI Batch API dashboard lists batches from latest to previous).
# `after` is the pagination cursor: the listing starts with the batch that
# follows the given ID. `after` and `limit` can be passed together.
# NOTE(review): `limit` is understood to be the page size of each request,
# while iterating the returned object keeps requesting further pages, so it
# does not cap the total number of batches visited — confirm against the SDK
# docs and use the commented `break` below to stop at a given batch.
for batch in client.batches.list(after='specific_batch_id', limit=50):

#     if batch.id == 'batch_id_to_exclude':  # Stop at the batch with the specified ID if needed
#         break

    print(f"Batch ID: {batch.id}, Status: {batch.status}")

    # Collect only completed batches, skipping failed or canceled ones
    if batch.status == 'completed':
        if batch.output_file_id:
            output_files.append([batch.id, batch.created_at, batch.output_file_id])
        else:
            # Without this guard the download step would later request a file
            # with ID None and fail.
            print(f"Batch {batch.id} is completed but has no output file. Skipping it.")

# Print the total number of completed batches retrieved
print(f"Total completed batches: {len(output_files)}")

In [None]:
# Optional: sort by created time
# Each record is [batch id, created_at, output file id]; order by created_at.
output_files.sort(key=lambda record: record[1])
len(output_files)

In [None]:
# Download every collected output file. Saved files are numbered right after
# the largest ID already present in the output directory.
for file_number, (_, _, output_file_id) in enumerate(output_files, start=largest_id + 1):
    path = Path(output_directory)
    # Created inside the loop, so no folder appears when there is nothing to download.
    path.mkdir(parents=True, exist_ok=True)

    file_path = path / f'your task result{file_number}.json'
    print(file_path)

    # Fetch the raw bytes of the output file and store them unchanged.
    result = client.files.content(output_file_id).content
    file_path.write_bytes(result)

# Organize results

In [None]:
results = []
# Requests whose answer could not be extracted (id, source file, reason).
failed_requests = []
directory = 'batch_output'
output_files = glob(f'{directory}/*.json')

# Iterate over each file in the directory
for file_path in tqdm(output_files, total=len(output_files), desc='Processing files'):
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line:
                continue

            # Parse the JSON string into a dictionary
            json_object = json.loads(line)

            # Extract the custom_id
            custom_id = json_object.get('custom_id', None)

            # Extract the response body and parse the content to get values.
            # A request may have no usable response, or content that is not
            # valid JSON (for instance when cut off by the max_tokens limit);
            # record those instead of aborting the whole run.
            try:
                content = json_object['response']['body']['choices'][0]['message']['content']
                response_content = json.loads(content)
                # Add the custom_id to the dictionary
                response_content['id'] = custom_id
            except (KeyError, IndexError, TypeError, json.JSONDecodeError) as e:
                failed_requests.append({'id': custom_id, 'file': file_path, 'error': repr(e)})
                continue

            # Append to results
            results.append(response_content)

print(f'Parsed {len(results)} results. {len(failed_requests)} requests could not be parsed.')

In [None]:
# process your results here