In [1]:
import openai
from openai.error import RateLimitError
import time
from dotenv import load_dotenv
import os

# Read variables from a local .env file into the process environment, then
# hand the OPENAI_API_KEY value (None when it is unset) to the OpenAI client.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
def Generation_prompt(ophthalcontents):
    """Wrap a textbook passage in the text/result prompt template.

    Args:
        ophthalcontents: Passage to embed. It is interpolated through an
            f-string, so non-string values are rendered with their default
            formatting.

    Returns:
        str: A "text:" line, the passage, a blank line, and a trailing
        "result:" line ending in a newline.
    """
    header = "text:\n"
    footer = "\n\nresult:\n"
    return f"{header}{ophthalcontents}{footer}"

In [3]:
# System message passed as the "system" role in call_openai_api. It tells the
# model to summarize and restructure extracted textbook text into corpus-style
# prose, dropping non-textual items, identifying information, and structural
# markers such as chapter, section, and page references.
# NOTE: this string is sent to the API verbatim; editing its wording changes
# the model's instructions.
GPT_system_prompt = \
"""You are an expert ophthalmologist and medical editor. You have been given text extracted from an ophthalmology textbook. Your job is to summarize, and structure this text into a corpus format suitable for training a large language model (LLM) specialized in ophthalmology.
When reorganizing, please remove all non-textual data, such as pictures, diagrams, etc. Also, please remove all personally identifiable information and unique document information, and reorganize in natural writing without bullet points as much as possible. Remove any references to chapter numbers, section headings, page numbers, or other structural details from the original source. Only include the core ophthalmology content itself, excluding any metadata or documentation-specific markers that do not directly contribute to the medical information. Also, all sentences should end."""

In [4]:
def call_openai_api(prompt, model="gpt-4o", retry_delay=10, max_retries=None):
    """Send ``prompt`` to the chat completion endpoint and return the reply text.

    The request carries ``GPT_system_prompt`` as the system message and
    ``prompt`` as the user message, with ``temperature=0.0`` and a single
    completion (``n=1``).

    Args:
        prompt: User message content.
        model: Chat model name passed to the API.
        retry_delay: Seconds to sleep after a ``RateLimitError`` before
            trying again.
        max_retries: Maximum number of retries after rate-limit errors.
            ``None`` (the default) retries indefinitely, which matches the
            previous behaviour of this function.

    Returns:
        The ``content`` field of the first choice's message.

    Raises:
        RateLimitError: Only when ``max_retries`` is set and has been
            exhausted. Any other exception from the API call is not caught
            here and propagates to the caller.
    """
    retries = 0
    while True:  # retry loop
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": GPT_system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                # max_tokens=4096,  (not passed; kept for reference)
                n=1,
                stop=None,
            )
            return response['choices'][0]['message']['content']
        except RateLimitError as e:
            retries += 1
            # Give up only when the caller asked for a bounded number of retries.
            if max_retries is not None and retries > max_retries:
                raise
            print(f"Rate limit reached: {e}. Waiting for {retry_delay} seconds before retrying...")
            time.sleep(retry_delay)

In [5]:
import pandas as pd

# Spreadsheet holding the parsed textbook passages to be refined.
corpus_file = "./Parsed Ophthalmology Textbooks/BCSC_section12_parsed.xlsx"

# Load the sheet, then flatten its "contents" column into a plain Python list
# with one entry per spreadsheet row.
corpus_df = pd.read_excel(io=corpus_file)
corpus_list = corpus_df.loc[:, "contents"].values.tolist()

In [6]:
# Refine every passage in order, collecting the model replies in answer_list.
answer_list = []
count = 0  # stays 0 when corpus_list is empty
total = len(corpus_list)
for count, ophthal_content in enumerate(corpus_list, start=1):
    answer_list.append(call_openai_api(Generation_prompt(ophthal_content)))
    # Report progress after every tenth passage.
    if not count % 10:
        print(f"{count} / {total}")

10 / 891
20 / 891
30 / 891
40 / 891
50 / 891
60 / 891
70 / 891
80 / 891
90 / 891
100 / 891
110 / 891
120 / 891
130 / 891
140 / 891
150 / 891
160 / 891
170 / 891
180 / 891
190 / 891
200 / 891
210 / 891
220 / 891
230 / 891
240 / 891
250 / 891
260 / 891
270 / 891
280 / 891
290 / 891
300 / 891
310 / 891
320 / 891
330 / 891
340 / 891
350 / 891
360 / 891
370 / 891
380 / 891
390 / 891
400 / 891
410 / 891
420 / 891
430 / 891
440 / 891
450 / 891
460 / 891
470 / 891
480 / 891
490 / 891
500 / 891
510 / 891
520 / 891
530 / 891
540 / 891
550 / 891
560 / 891
570 / 891
580 / 891
590 / 891
600 / 891
610 / 891
620 / 891
630 / 891
640 / 891
650 / 891
660 / 891
670 / 891
680 / 891
690 / 891
700 / 891
710 / 891
720 / 891
730 / 891
740 / 891
750 / 891
760 / 891
770 / 891
780 / 891
790 / 891
800 / 891
810 / 891
820 / 891
830 / 891
840 / 891
850 / 891
860 / 891
870 / 891
880 / 891
890 / 891


In [7]:
import pandas as pd

# Write the refined passages to Excel, one per row, in a single "answer" column.
output_file = './Refined Ophthalmology Textbooks/BCSC_section12_refined.xlsx'
# to_excel raises OSError when the parent folder is missing, so create it first;
# otherwise the save at the end of a long API run fails.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df = pd.DataFrame(answer_list, columns=['answer'])
df.to_excel(output_file, index=False)