In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from dotenv import load_dotenv
import pandas as pd
import os

# Load variables from a .env file into the process environment first, so the
# lookup below can find a key that is only defined there.
load_dotenv()
# Resolves to None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
# Shared chat-model client used by generate_corpus; temperature 0.0 keeps
# sampling randomness to a minimum so repeated runs stay as stable as possible.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

In [3]:
# System prompt passed as the SystemMessage of every request built by
# generate_corpus. It restricts the model to ophthalmology-related material
# from the supplied text, asks for plain prose without headings, bullet points,
# non-textual data or author/reference metadata, and defines the "No Content."
# placeholder for input with nothing usable.
# NOTE: the text is sent to the model verbatim, so any edit changes the output.
GPT_system_prompt = """You are an experienced ophthalmologist and medical editor. Please create corpus data suitable for pre-training of LLM using the given content. You cannot create content that is not included in the given content, and you must use only content that is related to ophthalmology. All non-textual data such as images, tables, and graphs, as well as personal identifiers or document-specific markers, must be removed. It must not contain author names, affiliations, unique references, or metadata not related to the ophthalmic content. Do not use headings or bullet points, present the content in natural language, and ensure that each sentence ends with a full stop. If there is no content that meets these criteria, display “No Content.”"""

In [4]:
def generate_prompt(ophthal_content: str):
    """Build the user-message text for one passage.

    Args:
        ophthal_content: Source text to be refined.

    Returns:
        The text placed after a ``given content:`` label, followed by a blank
        line and a trailing ``corpus data:`` cue for the model to complete.
    """
    template = "given content: {}\n\ncorpus data: "
    return template.format(ophthal_content)

def generate_corpus(ophthal_content: str):
    """Send one passage to the chat model and return its reply text.

    The request consists of the module-level ``GPT_system_prompt`` as the
    system message and the passage, wrapped by ``generate_prompt``, as the
    human message.

    Args:
        ophthal_content: Source text to be refined.

    Returns:
        The ``content`` attribute of the model's response.

    Exceptions raised by the model call are not handled here and propagate
    to the caller.
    """
    system_turn = SystemMessage(content=GPT_system_prompt)
    user_turn = HumanMessage(content=generate_prompt(ophthal_content))
    reply = llm([system_turn, user_turn])
    return reply.content

In [None]:
# Batch driver: for every .xlsx workbook in input_dir, run each value of its
# "contents" column through generate_corpus and write the replies to a
# "<name>_refined.xlsx" workbook (single "answer" column) in output_dir.
input_dir = "./Parsed Ophthalmology PubMed"
output_dir = "./Refined Ophthalmology PubMed"
os.makedirs(output_dir, exist_ok=True)

# Answer recorded for rows whose input cell is empty. It mirrors the
# placeholder the system prompt asks the model to produce, so downstream
# filtering can treat both cases the same way.
NO_CONTENT_ANSWER = "No Content."

# sorted() makes the processing order (and therefore the log) deterministic;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(input_dir)):
    # Only real workbooks: "~$name.xlsx" entries are the lock files Excel
    # creates while a workbook is open, and they cannot be parsed.
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue

    input_path = os.path.join(input_dir, filename)
    # splitext touches only the final extension, unlike str.replace, which
    # would also rewrite any ".xlsx" occurring earlier in the name.
    stem, ext = os.path.splitext(filename)
    output_path = os.path.join(output_dir, f"{stem}_refined{ext}")

    # Skip workbooks that already have an output file (cheap resume).
    if os.path.exists(output_path):
        print(f"Skipping {filename} (already exists).")
        continue

    print(f"Processing: {filename}")
    df = pd.read_excel(input_path)

    if "contents" not in df.columns:
        print(f"Skipping {filename} (no 'contents' column).")
        continue

    column = df["contents"]
    # Record which cells are empty *before* the string conversion:
    # astype(str) would turn them into the literal text "nan".
    missing = column.isna().tolist()
    contents = column.astype(str).tolist()

    results = []
    error_count = 0

    for i, (content, is_missing) in enumerate(zip(contents, missing)):
        # Nothing to refine: keep the row (so output rows stay aligned with
        # input rows) but do not spend an API call on it.
        if is_missing or not content.strip():
            results.append(NO_CONTENT_ANSWER)
            continue

        try:
            refined = generate_corpus(content)
        except Exception as e:
            # Best effort: one failing row must not abort the whole workbook.
            print(f"Error at index {i}: {e}")
            refined = "Error"
            error_count += 1
        results.append(refined)

    pd.DataFrame({"answer": results}).to_excel(output_path, index=False)
    print(f"Saved to {output_path}")

    # Failed rows are stored as "Error" and, because of the skip-if-exists
    # check above, will not be retried automatically; make that visible.
    if error_count:
        print(
            f"Warning: {error_count} row(s) in {filename} failed and were saved as 'Error'. "
            f"Delete {output_path} and re-run to retry them."
        )