In [12]:
# !pip3 install -r ../requirements.txt

In [None]:
# !pip uninstall numpy -y 
# !conda install -c conda-forge numpy -y

In [3]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from argparse import Namespace
from tqdm import tqdm
import logging
import os
import json

In [4]:
# 로거 설정
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# 시스템 프롬프트 정의
SYSTEM_PROMPT = """
You are an expert in translating Hanja documents into English.
The documents are written in Joseon, which is one of the Korean dynasties.
Translate the input document into English with the following compact JSON format:
{"translated": {The translated document}}
""".strip()

# 이전에 생성한 데이터프레임 로드 (필요에 따라 경로 수정)
df_sentences_filtered = pd.read_csv("../data/sentencePair_세종실록_filtered.csv")
df_sentences_filtered_sample = df_sentences_filtered.sample(1000, random_state=42)

# target_data 생성
target_data = df_sentences_filtered_sample.to_dict('records')

# args 생성
args = Namespace(
    model='gpt-4o',  # 사용할 모델명
    max_completion_tokens=1000,
    temperature=0.0,
    seed=42
)

# make_api_request_data 함수 정의
def make_api_request_data(target_data: List[Dict[str, Any]], args: Namespace) -> List[Dict[str, Any]]:
    # https://platform.openai.com/docs/guides/batch/getting-started 참조
    request_data = []

    for data_idx, datum in enumerate(tqdm(target_data, desc="Making API request data", mininterval=1)):
        # custom_id 생성 (year, month, day, sentence_index 조합)
        custom_id = f"{datum['year']}_{datum['month']}_{datum['day']}_{datum['sentence_index']}"

        request_data.append(
            {
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": args.model,
                    "messages": [
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": datum["hanja"]},
                    ],
                    "max_tokens": args.max_completion_tokens,
                    "temperature": args.temperature,
                    "seed": args.seed
                },
            }
        )

    logger.info(f"Total {len(request_data)} API requests are made!")
    return request_data

# 함수 호출하여 API 요청 데이터 생성
api_requests = make_api_request_data(target_data, args)

# 결과 확인 (예시로 첫 번째 요청 데이터 출력)
print(api_requests[0])

# "../data/ch02_batchinput.jsonl", "rb" 해당경로에 저장
with open("../data/ch03_batchinput_sentencePair_세종실록_filtered_sampled_1000_gpt4o.jsonl", "w", encoding="utf-8") as f:
    for request in api_requests:
        f.write(json.dumps(request, ensure_ascii=False) + "\n")

Making API request data: 100%|██████████| 1000/1000 [00:00<00:00, 562993.83it/s]
INFO:__main__:Total 1000 API requests are made!


{'custom_id': '1427_10_17_2', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'gpt-4o', 'messages': [{'role': 'system', 'content': 'You are an expert in translating Hanja documents into English.\nThe documents are written in Joseon, which is one of the Korean dynasties.\nTranslate the input document into English with the following compact JSON format:\n{"translated": {The translated document}}'}, {'role': 'user', 'content': '○右司諫金孝貞等上疏曰: 竊謂凡所施爲, 視歲豐歉。 比年以來, 水旱相仍, 禾稼不稔, 而今年正値農月, 旱乾爲甚, 殿下(霄) 旰軫念, 救荒恤民之政, 無不畢擧, 德至渥也。 今以忠淸、慶尙、全羅、江原、咸吉等道農事爲優, 許遣軍容敬差官, 將點兵船軍器, 以備不虞, 慮至深也。 然上項各道, 雖不若京畿、黃海、平安道之爲歉, 亦恐未至於豐稔也。 且他道之飢餓者, 將或轉而求食, 及其終也, 土着之民, 不免艱食之憂, 勢所必至, 其於點閱之際, 搔擾人民, 又非一端, 弊固不小。 伏望殿下姑停遣官, 待後豐年, 點考施行, 以慰民生。 上議于政府, 命停之。'}], 'max_tokens': 1000, 'temperature': 0.0, 'seed': 42}}


In [36]:
# --------------------------------------------------------------------------------
import time
from openai import OpenAI

client = OpenAI()

# file upload
batch_input_file = client.files.create(
  file=open("../data/ch03_batchinput_sentencePair_세종실록_filtered_sampled_1000_gpt4o-mini.jsonl", "rb"),
  purpose="batch"
)

# Create a new batch
client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "조선왕조실록 중 세종실록 1000문장 번역",
    }
)
start_time = time.time()

# Batch 객체들의 리스트를 가져옵니다.
batches = client.batches.list().data

# 각 배치의 상세 정보를 출력합니다.
for batch in batches[:5]:
    print(f"Batch ID: {batch.id}")
    print(f"Status: {batch.status}")
    print(f"Created At: {batch.created_at}")
    print(f"input_file_id: {batch.input_file_id}")
    print(f"output_file_id: {batch.output_file_id}")
    print("------")
    

# status를 60초에 한번씩 확인하고 'completed'일때까지 로그를 출력
# status가 'completed'가 되면 루프를 탈출합니다.
# 총 걸린 시간을 추가로 출력합니다.
while True:
  status = client.batches.retrieve(client.batches.list().data[0].id).status
  print("time: ", time.strftime('%X', time.localtime()), "   |    status: ", status)
  if status == "completed":
    break
  time.sleep(60) # 60초에 한번씩 확인

print("Total time: ", time.time() - start_time)


# output file을 다운로드 받아서 출력
file_response = client.files.content(client.batches.list().data[0].output_file_id).text.encode('utf-8').decode('unicode-escape')
print(file_response)

file_response[10]

FileNotFoundError: [Errno 2] No such file or directory: '../data/ch03_batchinput_sentencePair_세종실록_filtered_sampled_1000_gpt4o-mini.jsonl'

In [30]:
import time
from openai import OpenAI

client = OpenAI()

start_time = time.time()

# Batch 객체들의 리스트를 가져옵니다.
batches = client.batches.list().data

# 각 배치의 상세 정보를 출력합니다.
for batch in batches[:5]:
    print(f"Batch ID: {batch.id}")
    print(f"Status: {batch.status}")
    print(f"Created At: {batch.created_at}")
    print(f"input_file_id: {batch.input_file_id}")
    print(f"output_file_id: {batch.output_file_id}")
    print("------")

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch ID: batch_674cec1cbee48190929cac2a3f8d8035
Status: completed
Created At: 1733094428
input_file_id: file-GtaQcyhursUmMzCrwZqcg4
output_file_id: file-KyBP7HotL1FCfyyBzmSNNi
------
Batch ID: batch_674cea18f7308190b8434a31c44c742a
Status: completed
Created At: 1733093913
input_file_id: file-KWWGExKBdBqT71yvAjLTeX
output_file_id: file-Tbb5nPoCtGkCs5ZFdmCJiE
------
Batch ID: batch_674ce81a4e2881908cf62316d25fc421
Status: completed
Created At: 1733093402
input_file_id: file-CY5QRmsdwf4qWKdqC11o29
output_file_id: file-UKx4DD3UPzxcZtpUA8pq5C
------
Batch ID: batch_674ce67a7534819083224a4590f28ce1
Status: completed
Created At: 1733092986
input_file_id: file-DQqHndwKV5tV7UTNXfhvwL
output_file_id: file-LLtT841NgCFziQHXtdY75h
------
Batch ID: batch_674ce48716e481909f12cf9373e04eab
Status: completed
Created At: 1733092487
input_file_id: file-7jN2BYJF1Mtf2vJDErNnky
output_file_id: file-UUTZd5Bzn7B1PKVWYW8XW6
------


In [11]:
client.batches.retrieve(client.batches.list().data[0].id).status

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches/batch_674cec1cbee48190929cac2a3f8d8035 "HTTP/1.1 200 OK"


'completed'

In [35]:
input_file_id = "file-DQqHndwKV5tV7UTNXfhvwL"
# 출력 파일 내용 가져오기
response = client.files.content(input_file_id)

# 응답 내용을 UTF-8로 디코딩하여 문자열로 변환
file_content = response.content.decode('utf-8')

# 파일 내용 출력
print(file_content)

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/files/file-DQqHndwKV5tV7UTNXfhvwL/content "HTTP/1.1 200 OK"


{"custom_id":"0","method":"POST","url":"\/v1\/chat\/completions","body":{"model":"gpt-4o-2024-11-20","messages":[{"role":"system","content":"You are an expert in restoring damaged Hanja characters.\n\nInput Sentence: \"\u25a1\u25a1\u25a1\u6975, \u6cbb\u4e16\u5b89\u6c11.\"\n\nTasks:\n1. Count the exact number of `\u25a1` tokens in the sentence. Each `\u25a1` must be counted individually.\n2. Replace each `\u25a1` with exactly one Hanja character. Ensure that no `\u25a1` is left unprocessed and that each `\u25a1` corresponds to exactly one Hanja character.\n\nStep-by-step output in JSON format:\n{\n  \"num_of_damage_token\": \"{Number of `\u25a1` tokens}\",\n  \"restorations\": [\n    {\"damage_token\": \"\u25a1 1\", \"restored_hanja\": \"{Restored Character 1}\"},\n    {\"damage_token\": \"\u25a1 2\", \"restored_hanja\": \"{Restored Character 2}\"},\n    {\"damage_token\": \"\u25a1 3\", \"restored_hanja\": \"{Restored Character 3}\"}\n  ],\n  \"restored_hanja\": \"{Restored Sentence}\",

In [27]:
output_file_id = "file-UKx4DD3UPzxcZtpUA8pq5C"
# 출력 파일 내용 가져오기
response = client.files.content(output_file_id)

# 응답 내용을 UTF-8로 디코딩하여 문자열로 변환
file_content = response.content.decode('utf-8')

# 파일 내용 출력
print(file_content)

INFO:httpx:HTTP Request: GET https://api.openai.com/v1/files/file-UKx4DD3UPzxcZtpUA8pq5C/content "HTTP/1.1 200 OK"


{"id": "batch_req_674ce83628788190b5aaebfcdb9b98aa", "custom_id": "0", "response": {"status_code": 200, "request_id": "6da7a730f9478a5890031aa4f374c64d", "body": {"id": "chatcmpl-AZn3geR8d6VrHuCLgoQJzMAFCXdmU", "object": "chat.completion", "created": 1733093424, "model": "gpt-4o-2024-11-20", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\n  \"num_of_damage_token\": \"4\",\n  \"restored_hanja\": \"\u5167\u9662\u5553\u66f0, \u660e\u65e8\u5ba3\u00b7\u7d93\u7b75, \u53d6\u7a1f. \u50b3\u66f0, \u505c.\"\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 164, "completion_tokens": 46, "total_tokens": 210, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0}}, "system_fingerprint": "fp_24bb6f9e50"}}, "error": null}
{"id": "batch_req_674ce8363a8c819083e0bb595bdaa005", "custom_id":

In [37]:
# --------------------------------------------------------------------------------
import time
from openai import OpenAI

client = OpenAI()

# file upload
batch_input_file = client.files.create(
  file=open("../data/batch_inputs/ch03_batchinput_sentencePair_세종실록_filtered_sampled_1000_gpt4o-mini.jsonl", "rb"),
  purpose="batch"
)

# Create a new batch
client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "조선왕조실록 중 세종실록 1000문장 번역",
    }
)
start_time = time.time()

# Batch 객체들의 리스트를 가져옵니다.
batches = client.batches.list().data

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


In [38]:
# 각 배치의 상세 정보를 출력합니다.
for batch in batches[:5]:
    print(f"Batch ID: {batch.id}")
    print(f"Status: {batch.status}")
    print(f"Created At: {batch.created_at}")
    print(f"input_file_id: {batch.input_file_id}")
    print(f"output_file_id: {batch.output_file_id}")
    print("------")

Batch ID: batch_674d0a21e67c8190b7a3f5bd86003cf1
Status: validating
Created At: 1733102114
input_file_id: file-FcoLsag2jHfAroeo6eGXhv
output_file_id: None
------
Batch ID: batch_674cec1cbee48190929cac2a3f8d8035
Status: completed
Created At: 1733094428
input_file_id: file-GtaQcyhursUmMzCrwZqcg4
output_file_id: file-KyBP7HotL1FCfyyBzmSNNi
------
Batch ID: batch_674cea18f7308190b8434a31c44c742a
Status: completed
Created At: 1733093913
input_file_id: file-KWWGExKBdBqT71yvAjLTeX
output_file_id: file-Tbb5nPoCtGkCs5ZFdmCJiE
------
Batch ID: batch_674ce81a4e2881908cf62316d25fc421
Status: completed
Created At: 1733093402
input_file_id: file-CY5QRmsdwf4qWKdqC11o29
output_file_id: file-UKx4DD3UPzxcZtpUA8pq5C
------
Batch ID: batch_674ce67a7534819083224a4590f28ce1
Status: completed
Created At: 1733092986
input_file_id: file-DQqHndwKV5tV7UTNXfhvwL
output_file_id: file-LLtT841NgCFziQHXtdY75h
------
