In this notebook, we **summarize each paper's abstract and generate supplementary descriptions** from the paper's 'title' and 'abstract' fields, using the GPT-4o-mini model through the OpenAI API. The outputs are a JSONL cache file and a final JSON file; the JSON file is the input for the hover text in the visualization.

In [None]:
#@title 1) API Installation & Settings

!pip install -q openai pandas

import pandas as pd
import json
import os
import time
from openai import OpenAI
from google.colab import files

# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be saved inside the notebook. When it is unset, fall back to the
# placeholder below, which can still be replaced by hand.
API_KEY = os.environ.get("OPENAI_API_KEY", "[**input api key]")

# Input CSV of cleaned paper records and the directory that receives outputs.
INPUT_FILE_NAME = "/content/Cleaned data/2025_cleaned_data.csv"
SAVE_DIR = "/content/Processed Data"

# The stream file is a JSONL cache that results are appended to batch by batch;
# the final file is the single JSON array assembled from it at the end.
STREAM_FILE_NAME = "2025_summaries_cache_stream.jsonl"
FINAL_FILE_NAME = "2025_paper_summaries_final.json"

STREAM_FILE = os.path.join(SAVE_DIR, STREAM_FILE_NAME)
FINAL_FILE = os.path.join(SAVE_DIR, FINAL_FILE_NAME)

BATCH_SIZE = 20  # number of papers sent to the model in one request
MODEL = "gpt-4o-mini"

client = OpenAI(api_key=API_KEY)

# Warn early if the input is missing; reading it later would fail otherwise.
if not os.path.exists(INPUT_FILE_NAME):
    print("CSV not found")


In [None]:
#@title 2) LLM Prompt

# System message sent with every request: fixes the model's role, the output
# language (English) and the output format (JSON only).
SYSTEM_PROMPT = "You are an academic research analyst. Extract structured metadata and provide balanced, objective summaries in English. Give equal weight to the research process and the findings. Output JSON only."

# Per-paper user prompt, rendered with str.format() in call_openai_batch.
# {paper_id}, {title} and {abstract} are the substitution fields; the doubled
# braces {{ and }} are escapes that render as the literal braces of the JSON
# skeleton the model is asked to fill in.
USER_PROMPT_TEMPLATE = """
Extract information in EXACT JSON format:
{{
  "paper_id": "{paper_id}",
  "title": "{title}",
  "domain": "Primary research field",
  "problem": "Context & Question: Describe the background or the gap in knowledge being addressed. (1-2 sentences)",
  "methodology": "Research Approach: Detail the study design, subjects, and specific analytical steps. (1-2 sentences)",
  "data_type": "Type of data analyzed",
  "techniques_tools": ["Key analytical tools, software, or laboratory techniques"],
  "key_concepts": ["3-5 core scientific concepts"],
  "main_findings": "Observed Results: State the primary findings objectively based on evidence. (1-2 sentences)",
  "summary_short": "Scientific Summary (English): Provide a cohesive, formal academic narrative. Ensure the 'How' (methodology) and the 'What' (outcomes) are balanced. End with its potential application or contribution to the field without overstating it.",
  "summary_simple": "Simplified Explanation (English - Age 18): Explain the research logic for a high school senior. Describe the initial uncertainty, the steps taken by researchers, and the observed reality. Use 'Term (short explanation)' for jargon. Use formal yet accessible language.",
  "practical_relevance": "Contextual Significance: State how this advances the field or contributes to fundamental knowledge.",
  "data_quality_flag": "VALID"
}}

Strict Rules:
1. EXPLICIT INFO ONLY: Use ONLY information explicitly stated in the provided abstract. Do NOT guess or use outside knowledge.
2. COMPARISON RULE: If the study involves comparison (e.g., A vs B, control vs experimental), explicitly state what was compared and which performed better or showed significant differences.
3. INSUFFICIENT INFO: If information is insufficient for any specific field, state "Not mentioned".
4. DATA QUALITY: If the abstract is missing, empty, or too short to provide a meaningful summary, set "data_quality_flag": "INVALID_ABSTRACT".
5. LANGUAGE: All summary fields must be in ENGLISH.
6. BALANCED WEIGHT: Ensure the methodology and results receive equal detail in the summaries. Do not overlook the research process.
7. TONE: Use a professional, neutral, and descriptive tone. Avoid conversational fillers.
8. JARGON: In 'summary_simple', always explain technical terms in parentheses immediately after.

Paper Details:
ID: {paper_id} | Title: {title}
Abstract: {abstract}
"""

In [None]:
#@title 3) Core Functions

def _extract_records(payload):
    """Normalize a decoded model response into a list of per-paper dicts.

    The request uses JSON-object mode while the prompt asks for a list, so the
    model may wrap the batch under an arbitrary key or, for a single paper,
    return the record itself as the top-level object.
    """
    if isinstance(payload, list):
        return [entry for entry in payload if isinstance(entry, dict)]
    if not isinstance(payload, dict):
        return []
    # A flat record for one paper. Check this before scanning for lists:
    # fields such as "techniques_tools" and "key_concepts" are lists of
    # strings and must not be mistaken for the batch itself.
    if 'paper_id' in payload:
        return [payload]
    # Otherwise the batch is the first value that is a list of objects.
    for value in payload.values():
        if isinstance(value, list) and all(isinstance(entry, dict) for entry in value):
            return value
    return [payload]


def call_openai_batch(batch_papers):
    """Summarize a batch of papers with a single chat-completion request.

    Args:
        batch_papers: Iterable of dicts with the keys 'EID', 'Title' and
            'Abstract'; each is rendered through USER_PROMPT_TEMPLATE.

    Returns:
        A list of dicts parsed from the model's JSON reply (possibly empty),
        or None if the request or the JSON decoding failed. The failure is
        printed rather than raised so the caller can move on to the next
        batch.
    """
    paper_prompts = [
        USER_PROMPT_TEMPLATE.format(
            paper_id=p['EID'], title=p['Title'], abstract=p['Abstract']
        ) + "\n---\n"
        for p in batch_papers
    ]
    combined_prompt = (
        "Process these papers and return a JSON list of objects:\n\n"
        + "".join(paper_prompts)
    )

    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": combined_prompt}
            ],
            response_format={"type": "json_object"}
        )
        res_data = json.loads(response.choices[0].message.content)
        return _extract_records(res_data)
    except Exception as e:
        # Broad on purpose: one failed request must not abort the whole run.
        print(f"API Error: {e}")
        return None

def append_to_jsonl(results, filename):
    """Append every entry of ``results`` to ``filename``, one JSON document per line.

    The file is opened in append mode as UTF-8 and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(filename, mode='a', encoding='utf-8') as stream:
        stream.writelines(
            f"{json.dumps(record, ensure_ascii=False)}\n" for record in results
        )

In [None]:
#@title 4) Run Process & Download JSON

def _read_jsonl_records(path):
    """Return the dict records stored in a JSONL file, skipping unusable lines.

    Blank lines, lines that are not valid JSON (for example a line truncated by
    an interrupted run) and JSON values that are not objects are ignored.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                print(f"Skipping malformed line {line_no} in {path}")
                continue
            if isinstance(record, dict):
                records.append(record)
    return records


def main():
    """Summarize all unprocessed papers and export the combined result.

    Reads the input CSV, skips every paper whose ID is already present in the
    JSONL stream file (so an interrupted run can be resumed), sends the rest
    to the model in batches of BATCH_SIZE, appends each batch's records to the
    stream file, and finally rewrites the stream file as one JSON array and
    offers it for download.
    """
    # The untouched placeholder is a non-empty string, so test for it too.
    if not API_KEY or API_KEY.startswith("[**"):
        print("enter API Key to Step 1")
        return

    os.makedirs(SAVE_DIR, exist_ok=True)

    df = pd.read_csv(INPUT_FILE_NAME).fillna("")

    done_ids = set()
    if os.path.exists(STREAM_FILE):
        done_ids = {
            str(record['paper_id'])
            for record in _read_jsonl_records(STREAM_FILE)
            if record.get('paper_id') is not None
        }

    to_process = [
        {'EID': str(row['EID']), 'Title': row['Title'], 'Abstract': row['Abstract']}
        for _, row in df.iterrows() if str(row['EID']) not in done_ids
    ]

    # Thai status line: total | done | queued.
    print(f"📊 ทั้งหมด: {len(df)} | ทำแล้ว: {len(done_ids)} | รอคิว: {len(to_process)}")

    batch_starts = range(0, len(to_process), BATCH_SIZE)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch = to_process[start:start + BATCH_SIZE]
        print(f"Proceeding Batch {batch_no}")

        # Maps each paper ID in this batch to its source abstract so it can
        # be attached to the matching model record.
        batch_lookup = {p['EID']: p['Abstract'] for p in batch}

        results = call_openai_batch(batch)
        if results:
            # Guard against stray non-object entries in the model output.
            records = [item for item in results if isinstance(item, dict)]
            for item in records:
                eid_key = str(item.get('paper_id'))
                item['original_abstract'] = batch_lookup.get(eid_key, "Abstract not found")

            if records:
                append_to_jsonl(records, STREAM_FILE)
                print(f"Save batch {batch_no}")

            returned_ids = {str(item.get('paper_id')) for item in records}
            missing = [eid for eid in batch_lookup if eid not in returned_ids]
            if missing:
                # Their IDs never reach the stream file, so the resume logic
                # picks them up again on the next run.
                print(f"Batch {batch_no}: {len(missing)} paper(s) missing from the response")

        time.sleep(1)  # brief pause between requests

    if os.path.exists(STREAM_FILE):
        final_list = _read_jsonl_records(STREAM_FILE)
        with open(FINAL_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_list, f, ensure_ascii=False, indent=4)
        print(f"final file is at: {FINAL_FILE}")
        files.download(FINAL_FILE)


if __name__ == "__main__":
    main()