In [11]:
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
# Load variables from a local .env file into os.environ before reading them.
load_dotenv()

# Directory roots come from the environment. Indexing os.environ (instead of
# os.getenv) makes a missing variable fail with KeyError naming that variable,
# rather than an opaque TypeError raised from Path(None).
PROJECT_ROOT = Path(os.environ["PROJECT_ROOT"]).resolve()
MODEL_ROOT = Path(os.environ["MODEL_ROOT"]).resolve()
DATA_ROOT = Path(os.environ["DATA_ROOT"]).resolve()
CONFIG_ROOT = Path(os.environ["CONFIG_ROOT"]).resolve()
SRC_ROOT = Path(os.environ["SRC_ROOT"]).resolve()

# Expose only GPU index 5 to CUDA for this kernel.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

# Make the modules under SRC_ROOT/prepare_data importable. The membership check
# keeps sys.path from growing each time this cell is re-run.
_prepare_data_dir = str(SRC_ROOT / 'prepare_data')
if _prepare_data_dir not in sys.path:
    sys.path.append(_prepare_data_dir)

from openai import OpenAI

from datasets import Dataset
import json
from utils.utility import *

from omegaconf import OmegaConf

# OpenAI API client; the key is read from the environment, never hard-coded.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
# Ask for the configuration file, given relative to CONFIG_ROOT, then load it.
config_relative_path = input("input configuration path: ")
config = OmegaConf.load(CONFIG_ROOT / config_relative_path)

In [4]:
# Locate the generation file under DATA_ROOT/<config.dataset_output_dir>/.
gen_filename = input("input generation filename: ")
gen_output_path = DATA_ROOT / config.dataset_output_dir / gen_filename

print(f"Loading generated ouputs from {str(gen_output_path)}...")
# The JSON document is handed to Dataset.from_dict as-is.
generated_columns = json.loads(gen_output_path.read_text(encoding="utf-8"))
dataset = Dataset.from_dict(generated_columns)
print("Loaded successfully")

Loading generated ouputs from /src/gs25009/LLM_DAG_ALLIGN/dataset/preprocessed/final_generated.json...
Loaded successfully


In [None]:
import json
import regex as re

# Matches text of the form `Preferred: 1` / `Preferred: "2"` / `Preferred: '1'`;
# capture group 1 is the chosen digit ("1" or "2").
pattern = r"Preferred:\s*[\"']?([12])[\"']?"

def generate_comparisons(dataset: "Dataset") -> list[dict]:
    """Expand every example into one record per unordered pair of its summaries.

    Args:
        dataset: Iterable of examples. Each example is indexed with the keys
            ``'prompt'`` and ``'summaries'``; ``'summaries'`` is iterated in
            order. The annotation is quoted so that defining this function
            does not require ``Dataset`` to be imported first.

    Returns:
        A list of dicts, one per pair ``(i, j)`` with ``i < j``, ordered by
        example, then ``i``, then ``j``. Each dict has the keys ``'prompt'``,
        ``'y1'`` (summary ``i``), ``'y2'`` (summary ``j``), ``'ref'`` (always
        the empty string) and ``'meta'`` (the string ``"k, i, j"`` where ``k``
        is the example's position in ``dataset``).
    """
    # Local import: `itertools` is not imported at notebook level.
    from itertools import combinations

    records = []
    for k, example in enumerate(dataset):
        prompt = example['prompt']  # type: ignore
        summaries = example['summaries']  # type: ignore

        # combinations() yields index pairs in the same (i, j), i < j order
        # as a nested double loop filtered on i < j.
        for (i, y1), (j, y2) in combinations(enumerate(summaries), 2):
            records.append({
                'prompt': prompt,
                'y1': y1,
                'y2': y2,
                'ref': "",
                'meta': f"{k}, {i}, {j}"
            })

    return records

# Flat list of pairwise comparison records for the whole dataset (one per i < j summary pair).
pairs = generate_comparisons(dataset)

def _parse_output_line(line: str) -> tuple[int, int | None]:
    obj = json.loads(line)
    idx = int(obj.get("custom_id"))  # came from compare_batch_0
    body = obj.get("response", {}).get("body", {})
    output = body.get("output", [])[1].get("content", "")[0]
    if not output:
        return idx, None
    output_text = output.get("text", "")
    match = re.search(pattern, output_text)
    if not match:
        return idx, None
    # Map '1' -> 0 (y1), '2' -> 1 (y2)
    return idx, (0 if match.group(1) == "1" else 1)

def compare_batch_2() -> list[int | None]:
    """Collect the parsed preference for every pair from the batch output file.

    Reads the module-level ``pairs``, ``batch`` and ``client``.

    Returns:
        A list with one slot per entry of ``pairs``. Slot ``idx`` holds the
        preference that ``_parse_output_line`` returned for the output line
        with that index. Slots stay ``None`` when no line refers to them, or
        when ``batch`` is not ``"completed"`` or has no ``output_file_id``.
    """
    preferences: list[int | None] = [None] * len(pairs)

    job = batch
    finished = getattr(job, "status", None) == "completed"
    if not (finished and getattr(job, "output_file_id", None)):
        # No output file to read: every slot stays None.
        return preferences

    response = client.files.content(str(job.output_file_id))

    # Prefer a `.text` attribute; otherwise fall back to reading the stream.
    payload = getattr(response, "text", None)
    if payload is None:
        blob = response.read()
        payload = blob.decode("utf-8", errors="ignore") if isinstance(blob, bytes) else str(blob)

    for entry in payload.splitlines():
        if entry.strip():
            slot, choice = _parse_output_line(entry)
            # Ignore indices that do not correspond to a known pair.
            if 0 <= slot < len(preferences):
                preferences[slot] = choice

    return preferences

In [12]:
from preference_builders import CachedPreferenceScorer
# Project-local scorer constructed from the path of the comparisons JSONL file.
# NOTE(review): presumably `compare(...)` is answered from verdicts stored in that
# file — confirm in preference_builders.
a = CachedPreferenceScorer(str(DATA_ROOT / "preprocessed" / "final_comparisons.jsonl"))

In [20]:
# Build one directed graph per prompt index i (0..9) over 10 nodes.
# b[i][u] is the list of nodes that node u has an edge to.
b = []
for i in range(10):
    adjacency = [[] for _ in range(10)]
    for j in range(10):
        # Only unordered pairs j < k are looked up.
        for k in range(j + 1, 10):
            # Only the last argument varies; it has the same "k, i, j" layout
            # as the 'meta' field written by generate_comparisons.
            verdict = a.compare("", "", "", "", f"{i}, {j}, {k}")
            # Truthy verdict -> edge k -> j; falsy (including None) -> edge j -> k.
            # NOTE(review): a missing/None verdict is indistinguishable from a
            # falsy one here — confirm that is intended.
            src, dst = (k, j) if verdict else (j, k)
            adjacency[src].append(dst)
    b.append(adjacency)

In [None]:
def has_cycle(adj):
    """Report whether a directed graph contains a cycle.

    Args:
        adj: List-based adjacency list; ``adj[u]`` is the list of destination
            nodes of the edges leaving ``u``. Nodes are ``0 .. len(adj) - 1``.

    Returns:
        ``True`` if the graph has at least one cycle (self-loops count),
        ``False`` otherwise.
    """
    n = len(adj)
    # WHITE: unvisited, GRAY: on the current DFS path, BLACK: fully explored.
    WHITE, GRAY, BLACK = 0, 1, 2
    color = [WHITE] * n

    for root in range(n):
        if color[root] != WHITE:
            continue

        # Iterative DFS with an explicit stack of (node, neighbour iterator),
        # so deep graphs cannot exhaust Python's recursion limit.
        color[root] = GRAY
        stack = [(root, iter(adj[root]))]
        while stack:
            u, neighbours = stack[-1]
            for v in neighbours:
                if color[v] == GRAY:   # back edge -> cycle
                    return True
                if color[v] == WHITE:
                    color[v] = GRAY
                    stack.append((v, iter(adj[v])))
                    break
            else:
                # All neighbours of u are explored: u leaves the current path.
                color[u] = BLACK
                stack.pop()

    return False

def find_cycle(adj):
    """Find one cycle in a directed graph, if any exists.

    Args:
        adj: List-based adjacency list; ``adj[u]`` is the list of nodes ``v``
            with an edge ``u -> v``. Nodes are ``0 .. len(adj) - 1``.

    Returns:
        The cycle as a list of nodes that starts and ends on the same node
        (e.g. ``[0, 1, 2, 0]``; a self-loop on ``v`` gives ``[v, v]``), or
        ``None`` when the graph is acyclic. The cycle returned is the first
        one met by a depth-first search that visits roots and neighbours in
        list order.
    """
    n = len(adj)
    # WHITE: unvisited, GRAY: on the current DFS path, BLACK: fully explored.
    WHITE, GRAY, BLACK = 0, 1, 2
    color = [WHITE] * n
    parent = [-1] * n

    for root in range(n):
        if color[root] != WHITE:
            continue

        # Iterative DFS with an explicit stack of (node, neighbour iterator),
        # so deep graphs cannot exhaust Python's recursion limit.
        color[root] = GRAY
        stack = [(root, iter(adj[root]))]
        while stack:
            u, neighbours = stack[-1]
            for v in neighbours:
                if color[v] == WHITE:
                    parent[v] = u
                    color[v] = GRAY
                    stack.append((v, iter(adj[v])))
                    break
                if color[v] == GRAY:  # cycle found
                    # Rebuild v -> ... -> u -> v by walking parents back from u.
                    cycle = [v]
                    x = u
                    while x != v:
                        cycle.append(x)
                        x = parent[x]
                    cycle.append(v)
                    cycle.reverse()
                    return cycle
            else:
                # All neighbours of u are explored: u leaves the current path.
                color[u] = BLACK
                stack.pop()

    return None

In [None]:
from graphviz import Digraph

# Draw every adjacency list in `b` as its own directed graph.
for i, adjacency in enumerate(b):
    dot = Digraph()

    # Flatten the adjacency list into (tail, head) label pairs, one per edge.
    edge_labels = (
        (str(src), str(dst))
        for src, targets in enumerate(adjacency)
        for dst in targets
    )
    for tail, head in edge_labels:
        dot.edge(tail, head)

    # Render graph{i} as PNG without opening a viewer.
    dot.render(f"graph{i}", format="png", view=False)

In [None]:
# Total the "output_tokens" values found in kk.jsonl.
with open(str(DATA_ROOT / "kk.jsonl"), encoding="utf-8") as f:
    text = f.read()

# Every chunk after the first begins with the number that followed the key.
# Distinct names are used on purpose: the previous `a` / `sum` rebound the
# scorer instance `a` and shadowed the builtin `sum` for the rest of the kernel.
token_chunks = text.split('"output_tokens": ')[1:]

total_output_tokens = 0
for position, chunk in enumerate(token_chunks, start=1):
    count_text = chunk.split(",")[0]
    # Echo the first nine counts as a sanity check on the parsing.
    if position < 10:
        print(count_text)
    total_output_tokens += int(count_text)

# NOTE(review): 1024 is presumably the number of requests (giving a mean) or a
# Ki-token unit — confirm which is meant.
print(total_output_tokens / 1024)

In [3]:
import tiktoken
# Count how many gpt-4o tokens one pairwise request file contains.
encoding = tiktoken.encoding_for_model("gpt-4o")
request_file = (
    DATA_ROOT
    / "preprocessed"
    / "request_pairwise_openai_2025-09-02_21-16-06"
    / "223_2025-09-02_21-16-17.jsonl"
)
with open(str(request_file)) as f:
    tokens = encoding.encode(f.read())
print(len(tokens))

1018404
