In [28]:
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI
import json
import requests
import time

In [20]:
# Pull the variables defined in the local .env file into the environment.
load_dotenv()

# Build the API client from the OPENAI_API_KEY environment variable.
# `organization="org-xxx"` / `project="proj-xxx"` can be passed as well when
# the key belongs to several orgs or is project-scoped.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Connectivity check: request the model list and show it as the cell output.
models = client.models.list()
models

SyncPage[Model](data=[Model(id='gpt-4o-mini-audio-preview-2024-12-17', created=1734115920, object='model', owned_by='system'), Model(id='dall-e-3', created=1698785189, object='model', owned_by='system'), Model(id='dall-e-2', created=1698798177, object='model', owned_by='system'), Model(id='gpt-4o-audio-preview-2024-10-01', created=1727389042, object='model', owned_by='system'), Model(id='gpt-4o-audio-preview', created=1727460443, object='model', owned_by='system'), Model(id='gpt-4o', created=1715367049, object='model', owned_by='system'), Model(id='o1-mini-2024-09-12', created=1725648979, object='model', owned_by='system'), Model(id='gpt-4o-mini-realtime-preview-2024-12-17', created=1734112601, object='model', owned_by='system'), Model(id='o1-preview-2024-09-12', created=1725648865, object='model', owned_by='system'), Model(id='o1-mini', created=1725649008, object='model', owned_by='system'), Model(id='o1-preview', created=1725648897, object='model', owned_by='system'), Model(id='gpt-4

In [24]:

# 3) Define a structured schema for the response
# This class is handed to the parse(...) call in summarize_paper_text as
# `response_format`, and the parsed reply is read back from `message.parsed`.
# Field meanings below mirror the instructions given in the system prompt.
class PaperSummary(BaseModel):
    background: str  # background or prior work that sets the stage for the research
    gap: str  # specific gap or problem the paper addresses
    hypothesis: str  # hypothesis or main claim/idea proposed to solve that gap
    method: str  # method or approach (experimental setup, data, algorithmic details)
    conclusion: str  # main conclusion or findings of the paper

def summarize_paper_text(title_text: str, abstract_text: str, full_text: str, model: str = "gpt-4o"):
    """
    Summarize a paper into five fields:
      1) background
      2) gap
      3) hypothesis
      4) method
      5) conclusion

    Args:
        title_text: Paper title, inserted verbatim into the user message.
        abstract_text: Paper abstract, inserted verbatim into the user message.
        full_text: Paper body, inserted verbatim into the user message.
        model: Model name passed to the completion request. It must support
            structured output. Defaults to "gpt-4o".

    Returns:
        The parsed PaperSummary object taken from the first completion choice.

    Raises:
        ValueError: If the first choice carries no parsed object (for example
            because the model refused), instead of silently returning None.
        Any exception raised by the client call itself is propagated unchanged.
    """
    # 3.1) Create your prompt (system & user instructions)
    # The system text has no placeholders, so it is a plain (non-f) string.
    system_msg = {
        "role": "system",
        "content": """ You are a helpful assistant that extracts a structured summary from the paper. 
                   Please return only valid JSON that matches the specified schema.

                   1. Carefully read the full text, paying particular attention to:
                    - The background or prior work that sets the stage for the research
                    - The specific gap or problem the paper addresses
                    - The hypothesis or main claim/idea the authors propose to solve that gap
                    - The method or approach (including any experimental setup, data, or algorithmic details) 
                    - The main conclusion or findings of the paper
   
                    2. The summary must be thorough enough that a reviewer could evaluate:
                    - Soundness (i.e., clarity and rigor of the methods)
                    - Presentation (writing clarity and structure)
                    - Contribution (novelty and significance) and have enough detail to assess strengths, weaknesses, and potential questions.

                    3. Do NOT omit important details from the method—like dataset, experimental steps, or critical algorithmic components—if they are present in the paper.

                    4. You are NOT an interpreter or critic of the text. You must produce an **objective** and **factual** summary. Avoid personal opinions or expansions beyond the paper’s content.

                    5. If something is unclear or not stated, do NOT fabricate. 
                    - Instead note briefly “not explicitly stated” or “insufficient details” within that field.
                   """
    }
    user_msg = {
        "role": "user",
        "content": f"""
        Title: {title_text}
        Abstract: {abstract_text}
        Full Text: {full_text}
        
        Provide:
        - background
        - gap
        - hypothesis
        - method
        - conclusion
        """
    }

    # 3.2) Make the completion request with the parse(...) method
    # IMPORTANT: Use a model that supports structured output,
    # e.g. "o1-2024-12-17", "gpt-4o-2024-08-06", or "o3-mini-2025-01-31".
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[system_msg, user_msg],
        response_format=PaperSummary  # Tells the model we want a JSON object matching PaperSummary
    )

    # 3.3) The parsed result is accessible via completion.choices[0].message.parsed
    message = completion.choices[0].message
    paper_summary = message.parsed
    if paper_summary is None:
        # Fail here with a clear message rather than letting the caller hit an
        # AttributeError on `None.background` later on.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no parsed PaperSummary (refusal: {refusal!r})"
        )
    return paper_summary


if __name__ == "__main__":
    # Smoke test on placeholder paper content
    title, abstract, full_text = (
        "A Novel Method in Quantum Computing",
        "In this paper, we explore a new approach to quantum entanglement.",
        "Full text containing detailed methodology, results, etc...",
    )

    summary = summarize_paper_text(
        title_text=title,
        abstract_text=abstract,
        full_text=full_text,
    )


In [25]:
# Show the whole PaperSummary object first...
print(summary)
# ...then each field on its own labelled line.
for label, field_name in (
    ("Background: ", "background"),
    ("Gap:        ", "gap"),
    ("Hypothesis: ", "hypothesis"),
    ("Method:     ", "method"),
    ("Conclusion: ", "conclusion"),
):
    print(label, getattr(summary, field_name))

background='Entanglement is a fundamental concept in quantum mechanics which plays a crucial role in quantum computing and information processing. Previous research has focused on various methods to generate and manipulate entangled states, as these are key for achieving quantum supremacy and practical quantum technologies.' gap='Despite progress in the creation and manipulation of entangled states, existing methods are often limited in scalability and efficiency. Many approaches struggle with maintaining coherence and stability over time, which are vital for practical applications.' hypothesis='The authors propose that a new method for generating quantum entanglement, potentially utilizing novel materials or mechanisms, could overcome current limitations and enhance scalability and efficiency in quantum computing.' method='The paper presents a theoretical framework for a new quantum entanglement generation method. The proposed approach is based on [details not explicitly stated], whic

In [33]:
##############################################################
# 2. Building the instruction prompt text
##############################################################
def build_instruction_prompt(background, gap, hypothesis, method, conclusion):
    """
    Assemble the instruction prompt shown to the reviewer model.

    The prompt starts with the five summary fields (one "Label: text" line
    each), followed by a fixed rubric describing the rating categories and
    the list of items the answer must contain.
    """
    summary_fields = (
        ("Background", background),
        ("Gap", gap),
        ("Hypothesis", hypothesis),
        ("Method", method),
        ("Conclusion", conclusion),
    )
    summary_block = "\n".join(f"{label}: {text}" for label, text in summary_fields)

    # Fixed part of the prompt; identical for every paper.
    rubric_block = (
        "Categories (1–5):\n"
        "- Soundness: The rigor of the methods for the stated problem\n"
        "- Presentation: The clarity of writing and organization\n"
        "- Contribution: The paper’s novelty or added value to the domain\n"
        "\n"
        "Additionally:\n"
        "- Rating (1–10): Overall recommendation for acceptance\n"
        "- Confidence (1–5): How confident the reviewer is in their assessment\n"
        "\n"
        "Please provide:\n"
        "1) Soundness\n"
        "2) Presentation\n"
        "3) Contribution\n"
        "4) Rating\n"
        "5) Confidence\n"
        "6) Explanation (strengths, weaknesses)\n"
        "7) Questions\n"
    )
    return f"{summary_block}\n\n{rubric_block}"

##############################################################
# 3. Constructing gold-standard response from OpenReview fields
##############################################################
def _review_field_value(review_data, field_name, default):
    """Return the content of one review field, or `default` when it is absent or null."""
    field = review_data.get(field_name)
    # Accept both the wrapped form {"value": ...} and a bare value.
    if isinstance(field, dict):
        field = field.get("value")
    return default if field is None else field


def build_gold_standard_response(review_data):
    """
    Given the 'reviews' sub-fields, build the textual answer.

    Fields read from `review_data` (a mapping, or None):
      - soundness, presentation, contribution, rating, confidence
        (default "N/A")
      - strengths, weaknesses (default "None given")
      - questions (default "None")

    Each field may be stored as {"value": ...} or as a bare value. A field
    that is missing, null, or wrapped without a non-null "value" falls back
    to its default instead of raising or printing the text "None".

    The returned text has this structure:

    Soundness: 2 (fair)
    Presentation: 2 (fair)
    ...
    Explanation:
    Strengths:
    ...
    Weaknesses:
    ...
    Questions:
    ...
    """
    # Treat a missing review record like an empty one so every default applies.
    review_data = review_data or {}

    soundness_val = _review_field_value(review_data, "soundness", "N/A")
    presentation_val = _review_field_value(review_data, "presentation", "N/A")
    contribution_val = _review_field_value(review_data, "contribution", "N/A")
    rating_val = _review_field_value(review_data, "rating", "N/A")
    confidence_val = _review_field_value(review_data, "confidence", "N/A")

    strengths_val = _review_field_value(review_data, "strengths", "None given")
    weaknesses_val = _review_field_value(review_data, "weaknesses", "None given")
    questions_val = _review_field_value(review_data, "questions", "None")

    # Format them in the example style
    response_text = f"""Soundness: {soundness_val}
Presentation: {presentation_val}
Contribution: {contribution_val}
Rating: {rating_val}
Confidence: {confidence_val}

Explanation:
Strengths:
{strengths_val}

Weaknesses:
{weaknesses_val}

Questions:
{questions_val}
"""
    return response_text

##############################################################
# 4. Main script to process the JSON data
##############################################################
def create_instruction_finetuning_data(
    input_json_path,
    output_jsonl_path,
    error_wait_seconds=15,
    progress_every=5,
):
    """
    Build an instruction-finetuning JSONL file from OpenReview-like JSON data.

    1. Load the JSON file; it is iterated as a mapping of paper_id -> paper_info.
    2. For each paper, call summarize_paper_text(...) to get the five summary
       fields (background, gap, hypothesis, method, conclusion).
    3. Build the prompt using build_instruction_prompt(...).
    4. Build the gold response from the paper's 'reviews' entry.
    5. Write one JSON object per line with the fields
       {"paper_id": "...", "prompt": "...", "response": "..."}.

    Args:
        input_json_path: Path of the input JSON file.
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        error_wait_seconds: Pause after a failed paper before moving on.
            Defaults to 15; values <= 0 disable the pause.
        progress_every: Print a progress line every this many papers.
            Defaults to 5; a falsy value disables progress output.

    Returns:
        The list of paper ids that were skipped because processing raised.
    """

    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    output_lines = []
    skipped_papers = []  # Ids of papers whose processing raised
    for i, (paper_id, paper_info) in enumerate(data.items(), start=1):

        try:
            # Missing title/abstract/full_text fall back to an empty string
            title_val = paper_info.get("title", {}).get("value", "")
            abstract_val = paper_info.get("abstract", {}).get("value", "")
            full_text_val = paper_info.get("full_text", {}).get("value", "")

            summary = summarize_paper_text(
                title_val, abstract_val, full_text_val
            )

            # Build the instruction prompt
            prompt_text = build_instruction_prompt(
                summary.background,
                summary.gap,
                summary.hypothesis,
                summary.method,
                summary.conclusion,
            )

            # Build the gold standard response from the reviews entry
            reviews_info = paper_info.get("reviews", {})
            response_text = build_gold_standard_response(reviews_info)

        except Exception as e:
            # Best effort: one failing paper must not abort the whole run.
            print(f"Error summarizing paper {paper_id}: {e}")
            print(f"Skipping this paper and waiting {error_wait_seconds} seconds before continuing...")
            skipped_papers.append(paper_id)
            if error_wait_seconds > 0:
                time.sleep(error_wait_seconds)
        else:
            output_lines.append({
                "paper_id": paper_id,
                "prompt": prompt_text,
                "response": response_text
            })

        # Progress report; counts every paper visited, including skipped ones.
        if progress_every and i % progress_every == 0:
            print(f"Processed {i} papers so far...")

    # Save to JSONL
    with open(output_jsonl_path, "w", encoding="utf-8") as out_f:
        for line_item in output_lines:
            out_f.write(json.dumps(line_item, ensure_ascii=False) + "\n")

    print(f"Created instruction-finetuning data at: {output_jsonl_path}")
    if skipped_papers:
        skipped_ids = ", ".join(str(pid) for pid in skipped_papers)
        print(f"Skipped {len(skipped_papers)} paper(s): {skipped_ids}")

    return skipped_papers


##############################################################
# 5. Usage example
##############################################################
if __name__ == "__main__":
    # Input JSON to read, and the JSONL file to produce from it
    input_json_path, output_jsonl_path = (
        "matched_papers_reviews_test.json",
        "instruction_finetuning_data_test.jsonl",
    )

    create_instruction_finetuning_data(
        input_json_path=input_json_path,
        output_jsonl_path=output_jsonl_path,
    )

    print("Done!")


Processed 5 papers so far...
Processed 10 papers so far...
Processed 15 papers so far...
Processed 20 papers so far...
Error summarizing paper 5KojubHBr8: Error code: 429 - {'error': {'message': 'Request too large for gpt-4o in organization org-Xftkx2RCfzMHxshpuGkJcgeX on tokens per min (TPM): Limit 30000, Requested 36042. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}
Skipping this paper and waiting 15 seconds before continuing...
Processed 25 papers so far...
Processed 30 papers so far...
Error summarizing paper OGtnhKQJms: Error code: 429 - {'error': {'message': 'Request too large for gpt-4o in organization org-Xftkx2RCfzMHxshpuGkJcgeX on tokens per min (TPM): Limit 30000, Requested 31537. The input or output tokens must be reduced in order to run successfully. Visit https://platform.openai.com/account/rate-limits to learn 