In [1]:
import openai
from openai.error import RateLimitError
import time
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment, then
# hand the OpenAI key to the client library (None when the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
def Generation_prompt(ophthalcontents):
    """Build the user message for one textbook passage.

    The message starts with a ``content:`` header followed by the passage,
    then a worked ophthalmology [Q]/[A] example and the placeholder pair
    to emit when the passage is not related to ophthalmology.

    Args:
        ophthalcontents: The passage to embed; it is interpolated with
            default string formatting.

    Returns:
        The assembled prompt as a single string.
    """
    # Static few-shot section; nothing in it depends on the passage.
    few_shot_block = """[Q]: What are the current treatment options available for diabetic retinopathy, and how effective are they in managing the condition?
[A]: The main treatments for diabetic retinopathy include laser photocoagulation, which helps seal or shrink leaking blood vessels to reduce the risk of severe vision loss; intravitreal injections of anti-VEGF medications (such as ranibizumab or aflibercept) to control abnormal blood vessel growth and macular edema; intravitreal corticosteroid injections to decrease inflammation and swelling; and vitrectomy for advanced cases involving persistent hemorrhage or tractional retinal detachment. Maintaining strict control of blood sugar, blood pressure, and cholesterol levels is also essential for preventing or slowing the progression of diabetic retinopathy, and regular eye examinations enable early detection and timely treatment to preserve vision.

For example, if the text is not related to ophthalmology:
[Q]: It is not related to ophthalmology.
[A]: It is not related to ophthalmology."""

    # The "Example: " label keeps its trailing space before the line break.
    return f"content:\n{ophthalcontents}\n\nExample: \n" + few_shot_block

In [3]:
# System message sent with every chat request: it states the Q&A authoring
# rules and the placeholder pair to use for non-ophthalmology text.
# Written as one literal per sentence; the pieces join into a single line.
GPT_system_prompt = (
    "Please carefully review the provided text taken from an ophthalmology textbook and create English Q&A pairs that adhere to the following guidelines. "
    "All questions should strictly rely on the textbook content, covering both fundamental and advanced ophthalmology topics without including any identifying information or unique references from the original document. "
    "Ensure the answers are detailed but can be answered solely with the information in the given text, and if the text does not allow for any ophthalmology-related Q&A, provide a placeholder pair by writing “It is not related to ophthalmology” as both the question and answer. "
    "Maintain factual accuracy, consistency, and a professional academic tone in all Q&A pairs to ensure they are suitable for training a specialized large language model in ophthalmology."
)

In [4]:
def call_openai_api(prompt, max_retries=None, wait_seconds=10):
    """Send one prompt to the chat model and return the generated text.

    The request pairs ``GPT_system_prompt`` (system role) with ``prompt``
    (user role) and asks ``gpt-4o`` for a single completion at temperature 0.
    When the API raises ``RateLimitError`` the call waits and tries again.

    Args:
        prompt: User message content for the request.
        max_retries: Maximum number of retries after a rate-limit error.
            ``None`` (the default) retries indefinitely, so a rate-limit
            error that never clears keeps this call looping; pass an
            integer to have the last ``RateLimitError`` re-raised instead.
        wait_seconds: Seconds to sleep before each retry.

    Returns:
        The message content of the first choice in the response.

    Raises:
        RateLimitError: Only when ``max_retries`` is set and exhausted.
        Any other exception raised by the client propagates unchanged.
    """
    retries_done = 0
    while True:  # retry loop
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": GPT_system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                # max_tokens=4096,
                n=1,
                stop=None,
            )
            return response["choices"][0]["message"]["content"]
        except RateLimitError as e:
            # Give up only when the caller asked for a bounded retry count.
            if max_retries is not None and retries_done >= max_retries:
                raise
            retries_done += 1
            print(f"Rate limit reached: {e}. Waiting for {wait_seconds} seconds before retrying...")
            time.sleep(wait_seconds)

In [5]:
import pandas as pd

# Workbook holding the source passages; the "text" column is the only one used here.
corpus_file = "./Source Data/Ophthalmology Textbook Pretraining Dataset.xlsx"

# Read the sheet, then flatten the "text" column into a plain Python list
# so later cells can address passages by integer position.
corpus_df = pd.read_excel(corpus_file)
corpus_list = corpus_df.loc[:, "text"].values.tolist()
print(len(corpus_list))

10796


In [6]:
# Generate one completion per passage for corpus positions 2180 through 5389.
# `count` starts at the first position and is bumped after every passage, so
# the progress line reports a position within the whole corpus.
answer_list = []
count = 2180
for idx in range(2180, 5390):
    answer_list.append(call_openai_api(Generation_prompt(corpus_list[idx])))
    count += 1
    if count % 10:
        continue
    # Reached a multiple of ten: report progress.
    print(f"{count} / {len(corpus_list)}")

2190 / 10796
2200 / 10796
2210 / 10796
2220 / 10796
2230 / 10796
2240 / 10796
2250 / 10796
2260 / 10796
2270 / 10796
2280 / 10796
2290 / 10796
2300 / 10796
2310 / 10796
2320 / 10796
2330 / 10796
2340 / 10796
2350 / 10796
2360 / 10796
2370 / 10796
2380 / 10796
2390 / 10796
2400 / 10796
2410 / 10796
2420 / 10796
2430 / 10796
2440 / 10796
2450 / 10796
2460 / 10796
2470 / 10796
2480 / 10796
2490 / 10796
2500 / 10796
2510 / 10796
2520 / 10796
2530 / 10796
2540 / 10796
2550 / 10796
2560 / 10796
2570 / 10796
2580 / 10796
2590 / 10796
2600 / 10796
2610 / 10796
2620 / 10796
2630 / 10796
2640 / 10796
2650 / 10796
2660 / 10796
2670 / 10796
2680 / 10796
2690 / 10796
2700 / 10796
2710 / 10796
2720 / 10796
2730 / 10796
2740 / 10796
2750 / 10796
2760 / 10796
2770 / 10796
2780 / 10796
2790 / 10796
2800 / 10796
2810 / 10796
2820 / 10796
2830 / 10796
2840 / 10796
2850 / 10796
2860 / 10796
2870 / 10796
2880 / 10796
2890 / 10796
2900 / 10796
2910 / 10796
2920 / 10796
2930 / 10796
2940 / 10796
2950 / 10796

In [None]:
# Spot-check: show the generated completion stored at index 3 of answer_list.
print(answer_list[3])

In [7]:
import os

import pandas as pd

# Destination workbook for this batch of generated completions.
output_path = "./Generated Ophthalmology QA/Generated Ophthalmology QA_2180_5390.xlsx"

# to_excel does not create missing folders; make sure the target folder
# exists so the export cannot fail on that after the generation run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# One row per completion, in generation order, under a single "answer" column.
df = pd.DataFrame(answer_list, columns=["answer"])
df.to_excel(
    output_path,
    index=False,
)