In [3]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

PROJECT_ROOT = Path(os.getenv("PROJECT_ROOT")).resolve() # type: ignore
MODEL_ROOT = Path(os.getenv("MODEL_ROOT")).resolve() # type: ignore
DATA_ROOT = Path(os.getenv("DATA_ROOT")).resolve() # type: ignore
CONFIG_ROOT = Path(os.getenv("CONFIG_ROOT")).resolve() # type: ignore
SRC_ROOT = Path(os.getenv("SRC_ROOT")).resolve() # type: ignore

os.environ["CUDA_VISIBLE_DEVICES"] = "5"
sys.path.append(str(SRC_ROOT))

from openai import OpenAI

from datasets import Dataset
import json
from utils.utility import *

from omegaconf import OmegaConf

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [9]:
batch = client.batches.retrieve("batch_68b6866b5cd081909b0f206ad0039d0c")
print(batch)

Batch(id='batch_68b6866b5cd081909b0f206ad0039d0c', completion_window='24h', created_at=1756792427, endpoint='/v1/responses', input_file_id='file-NP7JQzHQ5mV3LGwGGJgGzT', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1756792693, error_file_id=None, errors=None, expired_at=None, expires_at=1756878827, failed_at=None, finalizing_at=1756792618, in_progress_at=1756792488, metadata={}, output_file_id='file-SnPwG2xtu1KJABB4KpZykf', request_counts=BatchRequestCounts(completed=1024, failed=0, total=1024))


In [10]:
config = OmegaConf.load(CONFIG_ROOT / input("input configuration path: "))

In [11]:
gen_filename = input("input generation filename: ")
gen_output_path = DATA_ROOT / config.output_dir / gen_filename
print(f"Loading generated ouputs from {str(gen_output_path)}...")
with open(gen_output_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset = Dataset.from_dict(dataset)
print("Loaded successfully")

Loading generated ouputs from /src/gs25009/LLM_DAG_ALLIGN/dataset/preprocessed/generated_pairwise_openai_2025-08-27_06-03-12.json...
Loaded successfully


In [12]:
import json
import regex as re

pattern = r"Preferred:\s*[\"']?([12])[\"']?"

def generate_comparisons(dataset: Dataset) -> list[dict]:
    pairs = []
    for k, example in enumerate(dataset):
        prompt = example['prompt'] # type: ignore
        ref = ""
        summaries = example['summaries'] # type: ignore
        
        for i, y1 in enumerate(summaries):
            for j, y2 in enumerate(summaries):
                if i < j:
                    pairs.append({
                        'prompt': prompt,
                        'y1': y1,
                        'y2': y2,
                        'ref': ref,
                        'meta': f"{k}, {i}, {j}"
                    })
    
    return pairs

pairs = generate_comparisons(dataset)

def _parse_output_line(line: str) -> tuple[int, int | None]:
    obj = json.loads(line)
    idx = int(obj.get("custom_id"))  # came from compare_batch_0
    body = obj.get("response", {}).get("body", {})
    output = body.get("output", [])[1].get("content", "")[0]
    if not output:
        return idx, None
    output_text = output.get("text", "")
    match = re.search(pattern, output_text)
    if not match:
        return idx, None
    # Map '1' -> 0 (y1), '2' -> 1 (y2)
    return idx, (0 if match.group(1) == "1" else 1)

def compare_batch_2() -> list[int | None]:
    result: list[int | None] = [None for _ in range(len(pairs))]

    b = batch

    if getattr(b, "status", None) == "completed" and getattr(b, "output_file_id", None):
        content_resp = client.files.content(b.output_file_id)
        # Support both text attribute and binary stream
        data = getattr(content_resp, "text", None)
        if data is None:
            # Assume file-like stream with .read()
            raw = content_resp.read()
            if isinstance(raw, bytes):
                data = raw.decode("utf-8", errors="ignore")
            else:
                data = str(raw)
        for raw_line in data.splitlines():
            if not raw_line.strip():
                continue
            idx, pref = _parse_output_line(raw_line)
            if 0 <= idx < len(result):
                result[idx] = pref

    return result

In [None]:
res = compare_batch_2()

[0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1]


In [None]:
cnt = 0
for a in res:
    if a is None:
        break
    if a is not None:
        cnt += 1
print(cnt)

1024


In [18]:
filename = "gamja.json"
output_path = DATA_ROOT / config.output_dir / filename
        
print(f"Saving result to {str(output_path)}...")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(str(output_path), "w", encoding="utf-8") as f:
    for pair, compare in zip(pairs, res):
        line = json.dumps({"id": pair["meta"], "result": compare}, ensure_ascii=False)
        f.write(line + "\n")
print("Saved successfully.")

Saving result to /src/gs25009/LLM_DAG_ALLIGN/dataset/preprocessed/gamja.json...
Saved successfully.


In [None]:
# 0~23까지 처리됨
groups = [[] for _ in range(24)]
for pref, pair in zip(res, pairs):
    if pref is None:
        continue
    k = int(pair['meta'].split(",")[0])
    if k < 24:
        groups[k].append((pair, pref))

for group in groups:
    print(len(group))

45
45
45
45
45
45
45
45
45
45
45
45
45
0
45
45
45
45
45
45
45
45
0
45


In [35]:
def has_cycle(adj):
    """
    adj: 리스트 기반 인접리스트. adj[u] = u에서 나가는 간선의 목적지 리스트
    사이클 있으면 True, 없으면 False
    """
    n = len(adj)
    WHITE, GRAY, BLACK = 0, 1, 2
    color = [WHITE] * n

    def dfs(u):
        color[u] = GRAY
        for v in adj[u]:
            if color[v] == GRAY:   # 백엣지 → 사이클
                return True
            if color[v] == WHITE and dfs(v):
                return True
        color[u] = BLACK
        return False

    for u in range(n):
        if color[u] == WHITE and dfs(u):
            return True
    return False

def find_cycle(adj):
    """
    adj: 인접 리스트 (리스트 기반), adj[u] = u -> v 간선 리스트
    사이클 있으면 그 경로(list)를 반환, 없으면 None 반환
    """
    n = len(adj)
    WHITE, GRAY, BLACK = 0, 1, 2
    color = [WHITE] * n
    parent = [-1] * n
    cycle = []

    def dfs(u):
        nonlocal cycle
        color[u] = GRAY
        for v in adj[u]:
            if color[v] == WHITE:
                parent[v] = u
                if dfs(v):
                    return True
            elif color[v] == GRAY:  # 사이클 발견
                # v → ... → u → v 사이클 복원
                cycle = [v]
                x = u
                while x != v:
                    cycle.append(x)
                    x = parent[x]
                cycle.append(v)
                cycle.reverse()
                return True
        color[u] = BLACK
        return False

    for u in range(n):
        if color[u] == WHITE and dfs(u):
            return cycle
    return None

In [39]:
result = []
adj = 0
for group in groups:
    if len(group) == 0:
        continue
    prompt = group[0][0]['prompt']
    max_idx = 0

    for pair, pref in group:
        _, i, j = map(int, pair['meta'].split(", "))
        max_idx = max(max_idx, i, j)

    summaries = [""] * (max_idx + 1)
    graph = [[] for _ in range(max_idx + 1)]

    for pair, pref in group:
        _, i, j = map(int, pair['meta'].split(", "))
        y1, y2 = pair['y1'], pair['y2']
        max_idx = max(max_idx, i, j)
        summaries[i] = y1
        summaries[j] = y2
        if pref == 0:
            graph[i].append(j)
        else:
            graph[j].append(i)

    print(graph)
    print(has_cycle(graph))
    if has_cycle(graph):
        print(find_cycle(graph))
        adj = graph

[[1, 2, 3, 4, 7, 8, 9], [3, 7, 8, 9], [1, 3, 4, 7, 8, 9], [9], [1, 3, 7, 9], [0, 1, 2, 3, 4, 7, 8, 9], [0, 1, 2, 3, 4, 5, 7, 8, 9], [3, 8, 9], [3, 4, 9], []]
True
[1, 7, 8, 4, 1]
[[1, 5], [4, 5, 7], [0, 1, 4, 5, 7, 9], [0, 1, 2, 4, 5, 7, 8, 9], [0, 5, 7, 8], [], [0, 1, 2, 3, 4, 5, 7, 8, 9], [0, 5], [0, 1, 2, 5, 7], [0, 1, 4, 5, 7, 8]]
True
[0, 1, 4, 0]
[[1, 2, 3, 4, 7, 8], [2, 3, 4, 8], [3, 4, 8], [], [3, 8], [0, 1, 2, 3, 4, 6, 7, 8], [0, 1, 2, 3, 4, 7, 8], [1, 2, 3, 4], [3, 7], [0, 1, 2, 3, 4, 5, 6, 7, 8]]
True
[1, 2, 4, 8, 7, 1]
[[6], [0, 4, 5, 6, 8, 9], [0, 1, 4, 5, 6, 8, 9], [0, 1, 2, 4, 5, 6, 8, 9], [0, 5, 6, 8, 9], [0, 6, 8], [], [0, 1, 2, 3, 4, 5, 6, 8, 9], [0, 6], [0, 5, 6, 8]]
False
[[1, 3, 4, 5, 6, 7, 8], [3, 5, 6, 7, 8], [0, 1, 3, 4, 5, 6, 7, 8, 9], [4, 5, 6, 7, 8], [1, 5, 6, 7, 8], [6, 7, 8], [7, 8], [], [7], [0, 1, 3, 4, 5, 6, 7, 8]]
True
[1, 3, 4, 1]
[[4, 5], [0, 3, 4, 5, 8, 9], [0, 1, 3, 4, 5, 6, 8, 9], [0], [3, 5, 8], [3, 6, 9], [0, 1, 3, 4, 8, 9], [0, 1, 2, 3, 4, 5, 6,

In [None]:
from graphviz import Digraph

dot = Digraph()

for u, nbrs in enumerate(adj):
    for v in nbrs:
        dot.edge(str(u), str(v))

dot.render("graph", format="png", view=True)  # graph.png 생성

'graph.png'

Error: no "view" rule for type "image/png" passed its test case
       (for more information, add "--debug=1" on the command line)


In [6]:
with open(str(DATA_ROOT / "kk.jsonl")) as f:
    text = f.read()
a = text.split('"output_tokens": ')
sum = 0
for i, line in enumerate(a):
    if i == 0:
        continue
    if i < 10:
        print(line.split(",")[0])
    sum += int(line.split(",")[0])

print(sum / 1024)

66
75
65
49
58
76
59
44
51
65.5732421875
