In [12]:
# !pip3 install -r ../requirements.txt

In [None]:
# !pip uninstall numpy -y 
# !conda install -c conda-forge numpy -y

In [14]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from argparse import Namespace
from tqdm import tqdm
import logging
import os
import json

In [15]:
# Logging setup: INFO level via basicConfig, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# System prompt sent with every translation request.
# .strip() removes the leading/trailing newlines introduced by the triple-quoted layout.
SYSTEM_PROMPT = """
You are an expert in translating Hanja documents into English.
The documents are written in Joseon, which is one of the Korean dynasties.
Translate the input document into English with the following compact JSON format:
{"translated": {The translated document}}
""".strip()

# Load the previously generated dataframe (adjust the path if needed) and draw a
# 10-row sample; random_state pins which rows are selected.
df_sentences_filtered = pd.read_csv("../data/sentencePair_세종실록_filtered.csv")
df_sentences_filtered_sample = df_sentences_filtered.sample(n=10, random_state=42)


# target_data: one dict per sampled row (column name -> value).
target_data = df_sentences_filtered_sample.to_dict(orient='records')

# args: request settings read by make_api_request_data.
args = Namespace(
    **{
        "model": 'gpt-4o-mini',  # model name to use
        "max_completion_tokens": 1000,
        "temperature": 0.7,
        "seed": 42,
    }
)

# Build the per-request payloads for an OpenAI Batch API input file.
def make_api_request_data(target_data: List[Dict[str, Any]], args: Namespace) -> List[Dict[str, Any]]:
    """Turn sentence records into Batch API chat-completion request dicts.

    See https://platform.openai.com/docs/guides/batch/getting-started for the
    request-line format.

    Args:
        target_data: Records to translate. Each record must provide the keys
            ``year``, ``month``, ``day``, ``sentence_index`` (combined into the
            ``custom_id``) and ``hanja`` (sent as the user message).
        args: Namespace providing ``model``, ``max_completion_tokens``,
            ``temperature`` and ``seed``.

    Returns:
        One request dict per input record, in input order. An empty input
        yields an empty list.

    Raises:
        KeyError: If a record lacks one of the required keys.
        ValueError: If two records produce the same ``custom_id``; the Batch
            API requires ``custom_id`` to be unique within a batch, so the
            collision is reported here instead of at batch validation time.
    """
    request_data = []
    seen_custom_ids = set()

    for datum in tqdm(target_data, desc="Making API request data", mininterval=1):
        # custom_id is the combination year_month_day_sentence_index.
        custom_id = f"{datum['year']}_{datum['month']}_{datum['day']}_{datum['sentence_index']}"
        if custom_id in seen_custom_ids:
            raise ValueError(f"Duplicate custom_id in batch request data: {custom_id}")
        seen_custom_ids.add(custom_id)

        request_data.append(
            {
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": args.model,
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": datum["hanja"]},
                    ],
                    # The limit is sent under the "max_tokens" request key,
                    # fed from args.max_completion_tokens.
                    "max_tokens": args.max_completion_tokens,
                    "temperature": args.temperature,
                    "seed": args.seed
                },
            }
        )

    logger.info(f"Total {len(request_data)} API requests are made!")
    return request_data

# Build the API request payloads from the sampled records.
api_requests = make_api_request_data(target_data, args)

# Sanity check: show the first request payload.
print(api_requests[0])

# Write the requests as JSON Lines (one request per line) to the batch input file;
# ensure_ascii=False keeps non-ASCII text unescaped in the file.
with open("../data/ch03_batchinput_sentencePair_세종실록_filtered_sampled_10.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(request, ensure_ascii=False) + "\n"
        for request in api_requests
    )

Making API request data: 100%|██████████| 10/10 [00:00<00:00, 295373.52it/s]
INFO:__main__:Total 10 API requests are made!


{'custom_id': '1427_10_17_2', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4o-mini', 'messages': [{'role': 'system', 'content': 'You are an expert in translating Hanja documents into English.\nThe documents are written in Joseon, which is one of the Korean dynasties.\nTranslate the input document into English with the following compact JSON format:\n{"translated": {The translated document}}'}, {'role': 'user', 'content': '○右司諫金孝貞等上疏曰: 竊謂凡所施爲, 視歲豐歉。 比年以來, 水旱相仍, 禾稼不稔, 而今年正値農月, 旱乾爲甚, 殿下(霄) 旰軫念, 救荒恤民之政, 無不畢擧, 德至渥也。 今以忠淸、慶尙、全羅、江原、咸吉等道農事爲優, 許遣軍容敬差官, 將點兵船軍器, 以備不虞, 慮至深也。 然上項各道, 雖不若京畿、黃海、平安道之爲歉, 亦恐未至於豐稔也。 且他道之飢餓者, 將或轉而求食, 及其終也, 土着之民, 不免艱食之憂, 勢所必至, 其於點閱之際, 搔擾人民, 又非一端, 弊固不小。 伏望殿下姑停遣官, 待後豐年, 點考施行, 以慰民生。 上議于政府, 命停之。'}], 'max_tokens': 1000, 'temperature': 0.7, 'seed': 42}}


In [16]:
# --------------------------------------------------------------------------------
# Upload the batch input file, create a batch, poll it until it reaches a terminal
# state, then download and parse the output file.
import json
import time
from openai import OpenAI

client = OpenAI()

# File upload. The context manager closes the local file handle once the upload
# call returns (the bare open() used previously was never closed).
with open("../data/ch03_batchinput_sentencePair_세종실록_filtered_sampled_10.jsonl", "rb") as batch_input_fp:
    batch_input_file = client.files.create(
        file=batch_input_fp,
        purpose="batch"
    )

# Create a new batch and keep the returned object, so the polling below follows
# exactly this batch instead of whichever batch happens to be first in the listing.
submitted_batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "테스트를 위한 배치",
    }
)
start_time = time.time()

# Fetch the list of Batch objects.
batches = client.batches.list().data

# Print the details of the first five batches in the listing.
for batch in batches[:5]:
    print(f"Batch ID: {batch.id}")
    print(f"Status: {batch.status}")
    print(f"Created At: {batch.created_at}")
    print(f"input_file_id: {batch.input_file_id}")
    print(f"output_file_id: {batch.output_file_id}")
    print("------")


# Check the status once every 60 seconds and log it. The loop ends on any terminal
# status, not only "completed"; otherwise a failed, expired or cancelled batch
# would keep the loop running forever.
TERMINAL_BATCH_STATUSES = {"completed", "failed", "expired", "cancelled"}
while True:
    submitted_batch = client.batches.retrieve(submitted_batch.id)
    status = submitted_batch.status
    print("time: ", time.strftime('%X', time.localtime()), "   |    status: ", status)
    if status in TERMINAL_BATCH_STATUSES:
        break
    time.sleep(60)  # check once every 60 seconds

# Total elapsed time since the batch was created.
print("Total time: ", time.time() - start_time)

# Without a completed batch and an output file there is nothing to download.
if status != "completed" or submitted_batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {submitted_batch.id} ended with status '{status}' "
        f"(output_file_id={submitted_batch.output_file_id}, "
        f"error_file_id={getattr(submitted_batch, 'error_file_id', None)})"
    )

# Download the output file and parse it as JSON Lines. json.loads resolves any
# \uXXXX escapes correctly; the earlier .encode('utf-8').decode('unicode-escape')
# round trip read the UTF-8 bytes as Latin-1 (garbling non-ASCII text) and also
# unescaped \" and \n, which breaks the JSON structure.
raw_output = client.files.content(submitted_batch.output_file_id).text
batch_results = [json.loads(line) for line in raw_output.splitlines() if line.strip()]

# Human-readable JSONL text (non-ASCII characters shown unescaped).
file_response = "\n".join(json.dumps(result, ensure_ascii=False) for result in batch_results)
print(file_response)

# Show the first parsed result (empty list if the output file had no lines).
batch_results[:1]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch ID: batch_67443bc16cf08190a7246b9cb492ce01
Status: validating
Created At: 1732524993
input_file_id: file-JVgBMZfhVEAzUnFXwHsAnw
output_file_id: None
------
Batch ID: batch_674338f7552881908768f70dae8e5ea9
Status: completed
Created At: 1732458743
input_file_id: file-Q8mYsoemnm5DxZzjiPHUZY
output_file_id: file-PVsQEdu7xe2Qe1uHSzEUXQ
------
Batch ID: batch_674336d52b688190b0d397478747745f
Status: completed
Created At: 1732458197
input_file_id: file-Cj8Lt9CWeyuJZ925C2ktKF
output_file_id: file-7ztiejaK3fWFWvfVBxZqUg
------
Batch ID: batch_674335e4d4ac81909430e1e4f05dc8a3
Status: completed
Created At: 1732457956
input_file_id: file-L88QzYT6zQbzhDtsH8GCJa
output_file_id: file-T2RKu1GkBaRZvsbmKPZMDM
------
Batch ID: batch_6742fb58f76c8190b8241508f9b54704
Status: completed
Created At: 1732442968
input_file_id: file-FHYHLzZog8azWz7UH8e917
output_file_id: file-EA5oLpKAtv5vA2SEnSdnhX
------


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  17:56:35    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  17:57:35    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  17:58:36    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  17:59:37    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:00:37    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:01:38    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:02:39    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:03:39    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:04:40    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:05:41    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:06:42    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:07:43    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:08:44    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:09:44    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:10:45    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:11:46    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:12:47    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:13:48    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:14:49    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:15:49    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:16:50    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:17:51    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:18:52    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:19:53    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:20:53    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:21:54    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:22:55    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:23:56    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:24:57    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:25:57    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:26:58    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:27:59    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:29:00    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:30:00    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:31:01    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:32:02    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:33:03    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:34:03    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:35:04    |    status:  in_progress


INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_67443bc16cf08190a7246b9cb492ce01 "HTTP/1.1 200 OK"


time:  18:36:05    |    status:  in_progress


KeyboardInterrupt: 

In [8]:
# Re-attach to the most recent batch (e.g. after a kernel restart), poll it until it
# reaches a terminal state, then download and parse the output file.
import json
import time
from openai import OpenAI

client = OpenAI()

start_time = time.time()

# Fetch the list of Batch objects.
batches = client.batches.list().data

# Print the details of the first five batches in the listing.
for batch in batches[:5]:
    print(f"Batch ID: {batch.id}")
    print(f"Status: {batch.status}")
    print(f"Created At: {batch.created_at}")
    print(f"input_file_id: {batch.input_file_id}")
    print(f"output_file_id: {batch.output_file_id}")
    print("------")


if not batches:
    raise RuntimeError("client.batches.list() returned no batches to poll.")

# Pin the batch to follow once, up front. The first entry of the listing is used
# (the listing appears to be newest-first — confirm against the API reference).
# Re-listing on every iteration costs an extra request and could silently switch
# to a different batch if a new one is created while polling.
polled_batch = batches[0]

# Check the status once every 60 seconds and log it. The loop ends on any terminal
# status, not only "completed"; otherwise a failed, expired or cancelled batch
# would keep the loop running forever.
TERMINAL_BATCH_STATUSES = {"completed", "failed", "expired", "cancelled"}
while True:
    polled_batch = client.batches.retrieve(polled_batch.id)
    status = polled_batch.status
    print("time: ", time.strftime('%X', time.localtime()), "   |    status: ", status)
    if status in TERMINAL_BATCH_STATUSES:
        break
    time.sleep(60)  # check once every 60 seconds

# Total elapsed time since this cell started polling.
print("Total time: ", time.time() - start_time)

# Without a completed batch and an output file there is nothing to download.
if status != "completed" or polled_batch.output_file_id is None:
    raise RuntimeError(
        f"Batch {polled_batch.id} ended with status '{status}' "
        f"(output_file_id={polled_batch.output_file_id}, "
        f"error_file_id={getattr(polled_batch, 'error_file_id', None)})"
    )

# Download the output file and parse it as JSON Lines. json.loads resolves any
# \uXXXX escapes correctly; the earlier .encode('utf-8').decode('unicode-escape')
# round trip read the UTF-8 bytes as Latin-1 (garbling non-ASCII text) and also
# unescaped \" and \n, which breaks the JSON structure.
raw_output = client.files.content(polled_batch.output_file_id).text
batch_results = [json.loads(line) for line in raw_output.splitlines() if line.strip()]

# Human-readable JSONL text (non-ASCII characters shown unescaped).
file_response = "\n".join(json.dumps(result, ensure_ascii=False) for result in batch_results)
print(file_response)

# Show the first parsed result (empty list if the output file had no lines).
batch_results[:1]

Batch ID: batch_67443bc16cf08190a7246b9cb492ce01
Status: in_progress
Created At: 1732524993
input_file_id: file-JVgBMZfhVEAzUnFXwHsAnw
output_file_id: None
------
Batch ID: batch_674338f7552881908768f70dae8e5ea9
Status: completed
Created At: 1732458743
input_file_id: file-Q8mYsoemnm5DxZzjiPHUZY
output_file_id: file-PVsQEdu7xe2Qe1uHSzEUXQ
------
Batch ID: batch_674336d52b688190b0d397478747745f
Status: completed
Created At: 1732458197
input_file_id: file-Cj8Lt9CWeyuJZ925C2ktKF
output_file_id: file-7ztiejaK3fWFWvfVBxZqUg
------
Batch ID: batch_674335e4d4ac81909430e1e4f05dc8a3
Status: completed
Created At: 1732457956
input_file_id: file-L88QzYT6zQbzhDtsH8GCJa
output_file_id: file-T2RKu1GkBaRZvsbmKPZMDM
------
Batch ID: batch_6742fb58f76c8190b8241508f9b54704
Status: completed
Created At: 1732442968
input_file_id: file-FHYHLzZog8azWz7UH8e917
output_file_id: file-EA5oLpKAtv5vA2SEnSdnhX
------
time:  19:01:21    |    status:  in_progress
time:  19:02:22    |    status:  in_progress
time:  19:0

KeyboardInterrupt: 