In [26]:
import os
import json
import random
from dotenv import load_dotenv
from pydantic import BaseModel, ValidationError
from typing import List
from openai import OpenAI
import re

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [27]:
##################################################
# 1. Pydantic model for the paper review
##################################################

class PaperReview(BaseModel):
    Soundness: int
    Presentation: int
    Contribution: int
    Rating: int
    Confidence: int
    Strengths: str
    Weaknesses: str
    Questions: str

In [28]:
##################################################
# 2. Helper functions
##################################################

def load_jsonl(file_path: str) -> List[dict]:
    """
    Loads a JSONL file and returns a list of dictionaries.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

def load_json(file_path: str) -> List[dict]:
    """
    Loads a standard JSON (not JSONL) file; returns a list or dict
    depending on structure. Adjust as needed if your file structure differs.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def fetch_example_by_paper_id(examples: List[dict], paper_id: str) -> dict:
    """
    Given a list of examples that presumably have {"paper_id": <str>, "messages": [...]},
    returns the one matching the given paper_id. If not found, returns None.
    """
    for ex in examples:
        # Each ex presumably has some structure that includes
        # the "paper_id" in ex itself or inside ex["messages"] ...
        #
        # If your data includes paper_id outside of ex["messages"],
        # you'd do something like ex["paper_id"] == paper_id.
        #
        # Otherwise, you need to see how you stored the paper_id
        # in the JSONL. For this example, let's assume
        # ex["paper_id"] is at the top level.
        if ex.get("paper_id") == paper_id:
            return ex
    return None

def parse_paper_review(response_content: str) -> PaperReview:
    """
    Takes the raw string content from a model's response,
    parses it as JSON, and then validates against PaperReview.
    """
    try:
        parsed_dict = json.loads(response_content)
        return PaperReview(**parsed_dict)
    except json.JSONDecodeError as e:
        print("Invalid JSON returned by the model:", e)
        raise
    except ValidationError as ve:
        print("Pydantic validation error:", ve)
        raise

def extract_json_string(raw_content: str) -> str:
    match = re.search(r"```(?:json)?(.*?)```", raw_content, flags=re.DOTALL)
    if match:
        # Return only the inner portion of the code fence
        return match.group(1).strip()
    else:
        return raw_content.strip()

def build_few_shot_context(
    examples: List[dict],
    shot_ids: List[str],
    new_example: dict
) -> List[dict]:
    """
    Creates a 'few-shot' prompt by concatenating the
    demonstration examples (system + user + assistant from each shot)
    followed by the new system + user from the new_example.
    
    Returns a messages list that you can pass to the chat completion.
    """
    # 1) Start with an empty list
    messages = []

    # 2) For each demonstration example (one-shot or multi-shot),
    #    add system, user, and assistant roles as a demonstration
    for shot_id in shot_ids:
        demo_ex = fetch_example_by_paper_id(examples, shot_id)
        if not demo_ex:
            print(f"WARNING: No example found for shot ID {shot_id}")
            continue
        
        # Add demonstration messages (full conversation: system, user, assistant)
        # Typically you want to instruct the model that the assistant message
        # is an "example" of how to respond.
        for msg in demo_ex["messages"]:
            # We often transform the roles slightly for few-shot:
            # e.g. "system" -> "system", "user" -> "user", "assistant" -> "assistant"
            # or sometimes we embed them in a system message. 
            # For simplicity, let's keep them as separate turns.
            messages.append(msg)

    # 3) Finally, add the "system" + "user" from the new_example,
    #    but do NOT include the new_example's "assistant" because
    #    we want the model to generate a new answer.
    #    new_example["messages"][0] is system
    #    new_example["messages"][1] is user
    #    new_example["messages"][2] is assistant (the 'gold' we won't include)
    new_system_msg = new_example["messages"][0]
    new_user_msg   = new_example["messages"][1]

    messages.append(new_system_msg)
    messages.append(new_user_msg)

    return messages

In [29]:
# load data
test_2024 = load_jsonl("data/test_data_2024_summary_prompts.jsonl")
test_2025 = load_jsonl("data/test_data_2025_summary_prompts.jsonl")

# Suppose we also have "unused_data_2024.json" with additional examples
with open("data/unused_data_2024.json", "r", encoding="utf-8") as f:
    unused_data_2024 = json.load(f)

In [30]:
# 3.2) Zero Shot Example
print("=== ZERO SHOT EXAMPLE ===")
# We'll pick one from the test_2024 data, e.g. the first
zero_shot_example = test_2024[0]
zero_shot_messages = zero_shot_example["messages"][:2]  # system, user
try:
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=zero_shot_messages
    )
    raw_content = completion.choices[0].message.content
    json_str = extract_json_string(raw_content)
    review = parse_paper_review(json_str)
    print("Zero shot review:\n", review)
except Exception as e:
    print("Zero-shot error:", e)

=== ZERO SHOT EXAMPLE ===
Zero shot review:
 Soundness=4 Presentation=4 Contribution=4 Rating=8 Confidence=4 Strengths="The paper effectively integrates two NAS methodologies to leverage their respective strengths: weight-entanglement's reduced memory usage and gradient-based optimization's performance. The comprehensive experiments and benchmarks provide strong empirical validation for the proposed approach." Weaknesses='The integration of TangleNAS into other existing frameworks and its application to real-world problems might require additional investigation. The practical impact in terms of training time, rather than memory or accuracy alone, could be further explored.' Questions='How does TangleNAS handle potential compatibility issues between different operation types when superimposing weights with zero-padding? What are the specific limitations of TangleNAS in terms of scalability to even larger architecture spaces?'


In [31]:
# 3.3) One Shot Example
print("\n=== ONE SHOT EXAMPLE ===")
# We use the paper_id "o1TKGCrSL7" from the unused_data_2024 as the demonstration
# Then we also pick a new test example to evaluate zero shot on
# In a typical approach, "o1TKGCrSL7" is the demonstration, and the "new" test example
# is the query. For clarity, let's just pick the second item in test_2024 as the query

new_example_for_query = test_2024[1]
one_shot_messages = build_few_shot_context(
    examples=unused_data_2024,
    shot_ids=["o1TKGCrSL7"],   # single demonstration ID
    new_example=new_example_for_query
)


=== ONE SHOT EXAMPLE ===


KeyError: 'messages'

In [None]:
try:
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=one_shot_messages
    )
    raw_content = completion.choices[0].message.content
    json_str = extract_json_string(raw_content)
    review = parse_paper_review(json_str)
    print("One shot review:\n", review)
except Exception as e:
    print("One-shot error:", e)

In [None]:
# 3.4) Few Shot Example
print("\n=== FEW SHOT EXAMPLE ===")
# We'll use the paper_id "o1TKGCrSL7", "JzvIWvC9MG", and "5rrYpa2vts" from unused_data_2024
# as the demonstrations. Then pick another new test example as the query.
if len(test_2024) > 2:
    new_example_for_query = test_2024[2]
    few_shot_messages = build_few_shot_context(
        examples=unused_data_2024,
        shot_ids=["o1TKGCrSL7", "JzvIWvC9MG", "5rrYpa2vts"],
        new_example=new_example_for_query
    )
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=few_shot_messages
        )
        raw_content = completion.choices[0].message.content
        json_str = extract_json_string(raw_content)
        review = parse_paper_review(json_str)
        print("Few shot review:\n", review)
    except Exception as e:
        print("Few-shot error:", e)
else:
    print("Not enough test_2024 data to do few-shot example.")