In [None]:
!pip install openai

The API key is provided on the LMS.

In [None]:
import json
import os

from openai import OpenAI

# Prefer the OPENAI_API_KEY environment variable so the key never has to be
# saved inside the notebook. The placeholder stays as a fallback for pasting
# the key by hand; do not commit or share the notebook with a real key in it.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or "ENTER YOUR API KEY HERE")

# Batched API Requests

Individual API requests are prone to errors because each one requires a stable connection between client and server. A more efficient approach is to send all requests to the server at once and let the server decide how and when to handle them. This approach of bundling requests is called batching. OpenAI gives a 50% discount on batched requests. However, because the process is asynchronous, you do not know in advance when your batch will be ready.

## Read in Data

In [1]:
import pandas as pd
# Location of the CSV file that is loaded into the `corpus` DataFrame.
DATA_URL = "https://github.com/casbdai/notebooks/raw/main/Module2/Textmining/fake_news.csv"

corpus = pd.read_csv(DATA_URL)
# Show the first rows as a quick sanity check of the loaded data.
corpus.head()

Unnamed: 0,title,text,label
0,"With French Socialists in Crisis, Manuel Valls...","BOISSEUIL, France — A furious Ducourtioux...",Fake
1,All Donated Blood in U.S. Should Be Tested for...,The Food and Drug Administration on Friday too...,Fake
2,Eight Immigration Victories Won for Americans ...,Last year was a success for Americans who are ...,Fake
3,How To WATCH The Highly Anticipated ‘CLINTON C...,SHARE this link with everyone you know. EVERY ...,Real
4,CNBC: China’s Secret Plan to Crush the U.S. Sp...,"In a feature on Tuesday, CNBC explained how th...",Fake


## Create Batch Request

Define the prompts and a function that creates the batch request.

In [None]:
# Task description; it forms the first part of the system message.
prompt = "Score the likelihood that the text contains fakenews."

# Output instructions appended to the task description: they fix the score
# range and ask for a JSON-only answer with the key 'score'.
system_addition = (
    " Score the text with a likelihood between 0 (very low) and 1 (very high).\n"
    "Return the score as JSON with the key 'score'. "
    "Make sure that only the JSON is returned."
)

def _text_or_empty(value) -> str:
    """Return ``value`` as a string, mapping missing values (None/NaN) to ""."""
    if isinstance(value, str):
        return value
    try:
        is_missing = bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna did not yield a single truth value (e.g. for a list).
        is_missing = False
    return "" if is_missing else str(value)


def create_batch_job(prompt: str, system_addition: str, df: pd.DataFrame, row_name="text", temperature=0.1,
                     model="gpt-3.5-turbo", max_chars=400, file_name="batch_tasks.jsonl") -> str:
    """Create one chat-completion task per row of ``df`` and submit them as a batch.

    The tasks are written to ``file_name`` as JSON Lines, the file is uploaded
    with purpose "batch", and a batch job with a 24h completion window is
    created for the ``/v1/chat/completions`` endpoint.

    Args:
        prompt: First part of the system message.
        system_addition: Text appended to ``prompt`` to form the system message.
        df: Data to score. Each row's index label becomes part of the task's
            ``custom_id`` (``task-<label>``).
        row_name: Column of ``df`` holding the text sent as the user message.
        temperature: Sampling temperature passed with every request.
        model: Model name passed with every request.
        max_chars: The text is cut to this many leading characters; ``None``
            disables the truncation.
        file_name: Path of the local JSON Lines file that is written and uploaded.

    Returns:
        The id of the created batch job.

    Raises:
        ValueError: If ``df`` has no rows, so there is nothing to submit.
    """
    if len(df) == 0:
        # Fail before writing a file or talking to the API.
        raise ValueError("df has no rows; there is nothing to submit as a batch")

    # The system message is identical for every task, so build it once.
    system_message = prompt + system_addition

    tasks = [
        {
            "custom_id": f"task-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # This is what you would have in your Chat Completions API call
                "model": model,
                "temperature": temperature,
                "response_format": {
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "system",
                        "content": system_message
                    },
                    {
                        "role": "user",
                        # Missing text (NaN/None) becomes "" instead of
                        # raising a TypeError when it is sliced.
                        "content": _text_or_empty(row[row_name])[:max_chars]
                    }
                ],
            }
        }
        for index, row in df.iterrows()
    ]

    with open(file_name, "w", encoding="utf-8") as file:
        for task in tasks:
            file.write(json.dumps(task) + "\n")

    # Open the upload handle in a context manager so it is always closed.
    with open(file_name, "rb") as file:
        batch_file = client.files.create(
            file=file,
            purpose="batch"
        )

    batch_job = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )

    return batch_job.id

Execute the function to create the batch request

In [None]:
# Submit only the first rows of the corpus and remember the id of the
# created batch job for the status check.
batch_job_id = create_batch_job(
    prompt=prompt,
    system_addition=system_addition,
    df=corpus.head(),
)

You can check the status of your job by executing the following cell. If it is not ready yet, wait a moment and run the cell again.

In [None]:
# Fetch the current state of the batch job and print the full object.
batch_job = client.batches.retrieve(batch_job_id)
print(batch_job)

# The job counts as ready once its status is "completed".
job_finished = (batch_job.status == "completed")
print(f"Ready? {job_finished}")

## Retrieve Results

Define a function to retrieve the results.

In [2]:
def _index_label(custom_id: str):
    """Recover the index label encoded in a ``task-<label>`` custom id."""
    # Split on the first dash only, so labels that contain a dash themselves
    # (e.g. "doc-7" or "-1") are not truncated.
    label = custom_id.split("-", 1)[-1]
    try:
        # Integer labels are returned as int so they match an integer index.
        return int(label)
    except ValueError:
        return label


def _extract_score(res: dict):
    """Return the 'score' value of one result line, or None if it has none."""
    try:
        content = res["response"]["body"]["choices"][0]["message"]["content"]
        return json.loads(content)["score"]
    except (KeyError, IndexError, TypeError, ValueError):
        # No response body, no choices, or an answer that is not the expected
        # JSON object: keep the row and mark its score as missing.
        return None


def retrieve_results(result_file_id: str, result_file_name: str = "batch_job_results.jsonl") -> pd.DataFrame:
    """Download the result file of a batch job and collect the scores.

    The raw file content is saved to ``result_file_name`` and then parsed line
    by line as JSON. For every line the index label is recovered from its
    ``custom_id`` and the score is read from the JSON answer of the first choice.

    Args:
        result_file_id: Id of the result file to download.
        result_file_name: Local path where the downloaded file is stored.

    Returns:
        A DataFrame with a single ``score`` column, indexed by the labels
        recovered from the ``custom_id`` values, in file order. Lines without
        a usable score get a missing value instead of aborting the whole parse.

    Raises:
        ValueError: If ``result_file_id`` is empty or None.
    """
    if not result_file_id:
        raise ValueError(
            "result_file_id is empty; the batch job probably has no output file yet"
        )

    result = client.files.content(result_file_id).content

    with open(result_file_name, "wb") as file:
        file.write(result)

    # Keyed by index label: a repeated custom id overwrites the earlier score.
    scores = {}
    with open(result_file_name, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            res = json.loads(line)
            scores[_index_label(res["custom_id"])] = _extract_score(res)

    # Build the frame in one go instead of growing it row by row.
    return pd.DataFrame({"score": list(scores.values())}, index=list(scores.keys()))

Execute the function to get the results as a pandas DataFrame

In [None]:
# Download and parse the output file of the batch job fetched in the status cell.
retrieve_results(result_file_id=batch_job.output_file_id)