In [1]:
# imports
import os
import json

import httpx
import pandas as pd
from openai import OpenAI


In [2]:
# path
# Root directory of the project. The default is the original author's location;
# set the WORDPLAY_PROJECT_PATH environment variable to run the notebook elsewhere
# without editing this cell.
PROJECT_PATH = os.environ.get(
    "WORDPLAY_PROJECT_PATH",
    "/home/alex/paper-2025-anonymous-submission/",
)

In [3]:
# config

# Read the API / proxy credentials stored in the project directory.
credentials_path = os.path.join(PROJECT_PATH, "api_credentials.json")
with open(credentials_path) as f:
    configs = json.load(f)

# The same authenticated SOCKS5 endpoint is used for both schemes.
proxy_url = "socks5://{login}:{password}@{ip}".format(
    login=configs["proxy_login"],
    password=configs["proxy_password"],
    ip=configs["proxy_ip"],
)
proxies = {"http": proxy_url, "https": proxy_url}

client_example = httpx.Client(proxy=proxies["https"])
# Sanity check: print the response of an external IP-echo service fetched via the proxy.
ip_check_response = client_example.get("http://jsonip.com")
print(ip_check_response.text)

{"ip":"80.242.58.96"}


In [4]:
# model config and task / result file locations

# Model snapshot and sampling temperature sent with every batch request.
OPENAI_MODEL = "gpt-4o-2024-08-06"
OPENAI_MODEL_TEMPERATURE = 0.1
# Backward-compatible aliases: other cells still reference the original
# (misspelled) names, so both spellings must resolve to the same values.
OPENAI_MOEDL = OPENAI_MODEL
OPENAI_MOEDL_TEMPERATURE = OPENAI_MODEL_TEMPERATURE

# JSONL file with the batch requests that is uploaded to the API.
TASK_FILE_NAME = "wordplay_detection_gpt_4o_tasks_extended.json"
TASK_FILE_NAME_PATH = os.path.join(PROJECT_PATH, f"Data/openai_batch_task/{TASK_FILE_NAME}")

# File the parsed model predictions are written to.
TASK_RESULT_FILE_NAME = "wordplay_detection_gpt_4o_predictions_extended.json"
TASK_RESULT_FILE_NAME_PATH = os.path.join(PROJECT_PATH, f"Data/predictions/{TASK_RESULT_FILE_NAME}")


In [5]:
# OpenAI client whose HTTP traffic goes through the proxied httpx client.
client = OpenAI(
    api_key=configs["openai_token"],
    http_client=client_example,
)

In [6]:
# load data

# Derive the dataset location from PROJECT_PATH instead of repeating the absolute
# project root, so the notebook has a single place where the root is configured.
# NOTE: "propmts" is the spelling used in the existing file name; it must match
# the file on disk, so it is intentionally left unchanged.
DATASET_PATH = os.path.join(
    PROJECT_PATH,
    "Data/processed_data/dataset_wordplay_detection_propmts_extended.json",
)

# One row per prompt pair (columns: user_prompt, system_prompt), keyed by the JSON object keys.
df = pd.read_json(DATASET_PATH, orient="index")

In [7]:
# Preview the first two prompt rows.
df.head(n=2)

Unnamed: 0,user_prompt,system_prompt
0,Заголовок новости: Комфортная среда. Cодержани...,Присутствует ли в заголовке новости игра слов?...
1,Заголовок новости: Свинина стушевалась. Cодерж...,Присутствует ли в заголовке новости игра слов?...


In [9]:
# generate task file

# One chat-completion request per dataset row. The row index is embedded in
# `custom_id` so that each response can later be traced back to its prompt.
tasks = [
    {
        "custom_id": f"task-{row_id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": OPENAI_MOEDL,
            "temperature": OPENAI_MOEDL_TEMPERATURE,
            "messages": [
                {"role": "system", "content": prompts["system_prompt"]},
                {"role": "user", "content": prompts["user_prompt"]},
            ],
        },
    }
    for row_id, prompts in df.iterrows()
]

# Serialise as JSON Lines: one request object per line.
with open(TASK_FILE_NAME_PATH, "w") as task_file:
    task_file.writelines(json.dumps(request) + "\n" for request in tasks)

In [10]:
# run tasks

# Upload the JSONL task file. The context manager guarantees the file handle is
# closed once the upload call returns (or raises), instead of leaking it.
with open(TASK_FILE_NAME_PATH, "rb") as task_file:
    batch_file = client.files.create(
        file=task_file,
        purpose="batch",
    )

# Start a batch job that runs every request in the uploaded file against the
# chat-completions endpoint within the 24h completion window.
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

In [15]:
# Refresh the job object from the API, then report its status and request tallies.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job.status, batch_job.request_counts, sep="\n")

completed
BatchRequestCounts(completed=2500, failed=0, total=2500)


In [16]:
# save results

result_file_id = batch_job.output_file_id
if result_file_id is None:
    # Without an output file id there is nothing to download; fail with a clear
    # message instead of an opaque error from the files endpoint.
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status})."
    )

raw_output = client.files.content(result_file_id).content.decode("utf-8")

# The output is JSON Lines. Skip blank lines rather than unconditionally dropping
# the last element, so a file without a trailing newline keeps its final record.
result = [json.loads(line) for line in raw_output.split("\n") if line.strip()]
assert len(df) == len(result), f"expected {len(df)} responses, got {len(result)}"

df_results = pd.DataFrame({"json_response": result})

# Recover the dataset row id from "task-<id>"; maxsplit=1 keeps ids that
# themselves contain a hyphen intact.
df_results["task_id"] = df_results["json_response"].apply(
    lambda response: response["custom_id"].split("-", 1)[1]
)
# Text of the first (only) completion choice, i.e. the model's answer.
df_results["gpt4o_yes_no"] = df_results["json_response"].apply(
    lambda response: response["response"]["body"]["choices"][0]["message"]["content"]
)

df_results.to_json(TASK_RESULT_FILE_NAME_PATH, orient="index")

In [17]:
# Display the parsed results table (raw JSON response, task id, model answer).
df_results

Unnamed: 0,json_response,task_id,gpt4o_yes_no
0,{'id': 'batch_req_679779eee74c8190b840d4b92f50...,0,нет
1,{'id': 'batch_req_679779eefd548190924faf63afa7...,1,да
2,{'id': 'batch_req_679779ef155881909d67c0621d40...,2,да
3,{'id': 'batch_req_679779ef2c148190a374f73b7b29...,3,да
4,{'id': 'batch_req_679779ef41888190a3b9b01c7b9e...,4,да
...,...,...,...
2495,{'id': 'batch_req_67977ad3ce4481909a1be6941fde...,2495,да
2496,{'id': 'batch_req_67977ad3e39881908f79a458d08a...,2496,да
2497,{'id': 'batch_req_67977ad3fa54819087e0ebccc11c...,2497,да
2498,{'id': 'batch_req_67977ad40fbc8190819845318658...,2498,да


In [18]:
# Tally the raw answer strings. Variants that differ only in case or
# punctuation (e.g. "да" vs "Да.") are counted as separate labels.
df_results.loc[:, "gpt4o_yes_no"].value_counts()

да         1665
нет         813
не знаю      13
Да.           5
Нет.          4
Name: gpt4o_yes_no, dtype: int64