# Batch Processing

In [41]:
from python.open_ai.functions import (
    load_json, prepare_batch_files,
    upload_batch_file, create_batch_job, get_batch_results
)
from math import ceil
import os
from collections import defaultdict
from pathlib import Path
import json
import uuid
import glob

In [21]:
# Configuration: where raw inputs are read from, where finished results are
# expected, and how many records go into each window sent to the model.
directory_path = "raw/"
result_dir_path = Path("outputs/")
WINDOW_SIZE = 8
result_dir_path.mkdir(parents=True, exist_ok=True)

# Load all not-yet-processed files and split each one into fixed-size windows,
# keyed by the source file name.
data_list = defaultdict(list)
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    # os.listdir also returns sub-directories; only regular files are inputs
    # (presumably load_json cannot read a directory -- confirm in
    # python.open_ai.functions).
    if not os.path.isfile(file_path):
        continue
    # A file with the same name in the output directory means this input has
    # already been processed, so it is skipped.
    if (result_dir_path / filename).exists():
        continue

    raw_data = load_json(file_path)
    # Step through the records WINDOW_SIZE at a time; the final window may be
    # shorter. An empty file adds no entry to data_list.
    for start in range(0, len(raw_data), WINDOW_SIZE):
        data_list[filename].append(raw_data[start:start + WINDOW_SIZE])

# Provide instructions for ChatGPT (the prompt is in Hungarian and is sent
# as-is; do not edit it for style).
instructions = (
    "Feldolgozd az alábbi adatokat egy strukturált formátumhoz ezekkel az oszlopokkal: "
    "brand, product_name, price, currency, quantity, measurement, product_category, sub_categories. "
    "Adj hozzá releváns alkategóriákat a névben található kulcsszavak alapján. "
    "Csak JSON szöveges választ adj!. A sorokat magyar nyelven add meg."
)

# Prepare the batch input files and display their paths as the cell output.
files = prepare_batch_files(data_list, instructions)
files

['inputs/batch_input_H8-2-1.json.jsonl']

In [23]:
# Collect the ID of every batch job submitted by this cell.
batch_jobs = []
for file in files:
    # Step 1: upload the prepared input file.
    batch_file = upload_batch_file(file)
    print(f"Batch file uploaded with ID: {batch_file.id}")

    # Step 2: start a batch job on the uploaded file and remember its ID.
    batch_job = create_batch_job(batch_file.id)
    print(f"Batch job created with ID: {batch_job.id}")
    batch_jobs += [batch_job.id]

    # Step 3: print whatever get_batch_results reports right now. This does
    # not wait for completion -- a freshly created job reports
    # "in_progress" -- so add polling if the results are needed here.
    current_result = get_batch_results(batch_job.id)
    print(current_result)

Batch file uploaded with ID: file-S3fukTyMLUTAAEGFHupdFD
Batch job created with ID: batch_674c647bb0cc819087a4c96937551491
Batch job status: in_progress


In [33]:
# Pin the job list to the batch ID printed when the job was created, so the
# following cells can run without repeating the upload.
batch_jobs = [
    "batch_674c647bb0cc819087a4c96937551491",
]

In [34]:
import json
# Specify the file name
import uuid
# Every run gets its own UUID-suffixed file, so earlier job lists are never
# overwritten.
file_name = f"batch_jobs_{uuid.uuid4()}.json"

# Persist the job IDs as pretty-printed JSON; any failure is reported rather
# than raised, so the notebook keeps running.
try:
    with open(file_name, mode="w") as json_file:
        json.dump(batch_jobs, json_file, indent=4)
    print(f"JSON file '{file_name}' created successfully!")
except Exception as e:
    print(f"An error occurred: {e}")

JSON file 'batch_jobs_9a717cb7-6973-4a8a-89d3-b5960e3f3bfb.json' created successfully!


In [42]:
# Re-check the batch job by its hard-coded ID. While the job is still running
# the helper returns a status string (see the cell output); presumably it
# returns the actual results once the job completes -- confirm in
# python.open_ai.functions.
get_batch_results("batch_674c647bb0cc819087a4c96937551491")

'Batch job status: in_progress'

In [39]:
def find_batch_jobs_file():
    """Return the newest ``batch_jobs_<UUID>.json`` file in the working directory.

    Each save writes a new UUID-named file, so several candidates can exist
    side by side. ``glob`` returns matches in arbitrary order, so the first
    match may be a stale list; the most recently modified valid file is
    returned instead.

    Returns:
        str | None: The matching file name, or ``None`` when no file with a
        valid UUID in its name exists.
    """
    candidates = []
    for file in glob.glob("batch_jobs_*.json"):
        try:
            # The third "_"-separated piece, up to the first ".", must parse
            # as a UUID; anything else is an unrelated file.
            uuid_part = file.split("_")[2].split(".")[0]
            uuid.UUID(uuid_part)
        except (IndexError, ValueError):
            continue
        candidates.append(file)

    if not candidates:
        return None
    # Newest by modification time wins.
    return max(candidates, key=os.path.getmtime)

# Restore the saved job IDs: locate the batch_jobs file, then parse it.
# Any failure is reported rather than raised, so the notebook keeps running.
try:
    file_name = find_batch_jobs_file()
    if not file_name:
        print("No valid 'batch_jobs' JSON file found.")
    else:
        with open(file_name, mode="r") as json_file:
            batch_jobs = json.loads(json_file.read())
        print(f"Loaded data from '{file_name}':")
        print(batch_jobs)
except Exception as e:
    print(f"An error occurred: {e}")

Loaded data from 'batch_jobs_9a717cb7-6973-4a8a-89d3-b5960e3f3bfb.json':
['batch_674c647bb0cc819087a4c96937551491']
