In [1]:
# imports
import os
import json

import httpx
import pandas as pd
from openai import OpenAI


In [2]:
# path
# Root directory of the project. The PROJECT_PATH environment variable overrides it,
# so the notebook can run on another machine without editing this cell; the original
# location remains the default.
PROJECT_PATH = os.environ.get("PROJECT_PATH", "/home/alex/paper-2025-anonymous-submission/")

In [3]:
# config
# Read the credentials file, then build an httpx client that goes through the SOCKS5 proxy.

credentials_path = os.path.join(PROJECT_PATH, "api_credentials.json")
with open(credentials_path) as credentials_file:
    configs = json.load(credentials_file)

# The same proxy URL is used for both schemes, so it is assembled once.
login = configs["proxy_login"]
password = configs["proxy_password"]
host = configs["proxy_ip"]
proxy_url = f"socks5://{login}:{password}@{host}"
proxies = {"http": proxy_url, "https": proxy_url}

client_example = httpx.Client(proxy=proxies["https"])

# Print the body returned by jsonip.com for a request made through the proxied client.
print(client_example.get("http://jsonip.com").text)

{"ip":"80.242.58.96"}


In [4]:
# model config and client
# NOTE: "MOEDL" is a misspelling of "MODEL"; the names are kept unchanged because
# the task-generation cell refers to them.

OPENAI_MOEDL = "gpt-4o-2024-08-06"
OPENAI_MOEDL_TEMPERATURE = 0.1

# File names of the batch task file and of the predictions file.
TASK_FILE_NAME = "wordplay_interpretation_gpt_4o_tasks_extended_ria.json"
TASK_RESULT_FILE_NAME = "wordplay_interpretation_gpt_4o_predictions_extended_ria.json"

# Full paths of both files inside the project directory.
TASK_FILE_NAME_PATH = os.path.join(
    PROJECT_PATH, "Data/openai_batch_task/" + TASK_FILE_NAME
)
TASK_RESULT_FILE_NAME_PATH = os.path.join(
    PROJECT_PATH, "Data/predictions/" + TASK_RESULT_FILE_NAME
)


In [5]:
# OpenAI client authenticated with the token from the credentials file and
# using the proxied httpx client as its HTTP transport.
client = OpenAI(
    api_key=configs["openai_token"],
    http_client=client_example,
)

In [6]:
# load data
# The path is built from PROJECT_PATH rather than repeating the absolute project
# directory, so the project location is defined in a single place.
# orient="index" tells pandas the JSON is keyed by row label.

df = pd.read_json(
    os.path.join(PROJECT_PATH, "Data/processed_data/ria_interpretation_extended.json"),
    orient="index",
)

In [7]:
# Preview the first two rows of the loaded prompts.
df.head(n=2)

Unnamed: 0,user_prompt,system_prompt
0,Заголовок новости: украинская люстрация: жертв...,\nПроанализируй заголовок новости в контексте ...
1,Заголовок новости: цена на нефть марки brent п...,\nПроанализируй заголовок новости в контексте ...


In [8]:
# generate task file
# Build one chat-completion request per dataframe row; the row label becomes
# part of "custom_id" so each response can be matched back to its row.

tasks = [
    {
        "custom_id": f"task-{row_label}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": OPENAI_MOEDL,
            "temperature": OPENAI_MOEDL_TEMPERATURE,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        },
    }
    for row_label, system_prompt, user_prompt in zip(
        df.index, df["system_prompt"], df["user_prompt"]
    )
]

# Write the requests as JSON Lines: one serialized request per line.
with open(TASK_FILE_NAME_PATH, 'w') as file:
    file.writelines(json.dumps(request) + '\n' for request in tasks)

In [9]:
# run tasks
# Upload the task file, then create a batch job that runs it against the
# chat-completions endpoint.

# Open the task file in a context manager so the handle is closed as soon as
# the upload call returns instead of staying open for the life of the kernel.
with open(TASK_FILE_NAME_PATH, "rb") as task_file:
    batch_file = client.files.create(
        file=task_file,
        purpose="batch",
    )

batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

In [40]:
# Re-fetch the batch job and report its status and request counts, one per line.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job.status, batch_job.request_counts, sep="\n")

completed
BatchRequestCounts(completed=1000, failed=0, total=1000)


In [41]:
# save results
# Download the batch output file, parse it line by line as JSON, extract the
# task id and the model answer from every response, and save the table.

result_file_id = batch_job.output_file_id
result_text = client.files.content(result_file_id).content.decode("utf-8")

# Keep every non-blank line rather than unconditionally dropping the last
# element: a file without a trailing newline then keeps its final record.
# Splitting on "\n" only (not str.splitlines) avoids breaking a record on
# other Unicode line separators that may occur inside a JSON string.
result_lines = [line for line in result_text.split("\n") if line.strip()]
assert len(df) == len(result_lines), (
    f"expected {len(df)} responses, got {len(result_lines)}"
)
result = [json.loads(line) for line in result_lines]

df_results = pd.DataFrame(
    {
        "json_response": result,
        # custom_id has the form "task-<row label>"; keep everything after the
        # first dash so a label that itself contains a dash is not truncated.
        "task_id": [
            response["custom_id"].split("-", 1)[1] for response in result
        ],
        # Text of the first choice of each chat completion.
        "gpt4o_explain": [
            response["response"]["body"]["choices"][0]["message"]["content"]
            for response in result
        ],
    }
)

df_results.to_json(TASK_RESULT_FILE_NAME_PATH, orient="index")

In [42]:
# Display the results table (columns: json_response, task_id, gpt4o_explain).
df_results

Unnamed: 0,json_response,task_id,gpt4o_explain
0,{'id': 'batch_req_67a5edd5b6148190bf09e2f48d7a...,0,в заголовке нет игры слов.
1,{'id': 'batch_req_67a5edd5c8d48190a1426e71028d...,1,в заголовке нет игры слов
2,{'id': 'batch_req_67a5edd5e9c48190ac024113e362...,2,в заголовке нет игры слов.
3,{'id': 'batch_req_67a5edd5fb20819083d54e5b99ac...,3,в заголовке нет игры слов.
4,{'id': 'batch_req_67a5edd60e788190bfc92e746ab4...,4,в заголовке нет игры слов.
...,...,...,...
995,{'id': 'batch_req_67a5eea4264481909b41ae6ce706...,995,в заголовке нет игры слов.
996,{'id': 'batch_req_67a5eea437a08190962f6218ae07...,996,в заголовке нет игры слов.
997,{'id': 'batch_req_67a5eea44bc48190ab040cb0ce8e...,997,в заголовке нет игры слов.
998,{'id': 'batch_req_67a5eea475d48190b151b866ebb4...,998,в заголовке нет игры слов.


In [43]:
# Count how many times each distinct model answer occurs.
df_results.loc[:, "gpt4o_explain"].value_counts()

в заголовке нет игры слов.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    904
в заголовке нет игры слов                                                                                                                                                                             