In [1]:
import openai
from openai.error import RateLimitError
import time
from dotenv import load_dotenv
import os

# Run python-dotenv's loader first so that (presumably) values from a .env file
# are in the environment before the key is read. The key is None when the
# OPENAI_API_KEY variable is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
def Generation_prompt(ophthalcontents):
    """Build the user-message text for one source passage.

    The passage is placed after a ``given content:`` label, followed by a blank
    line and a trailing ``corpus data: `` cue.

    Args:
        ophthalcontents: The passage to embed; it is formatted with its default
            string conversion.

    Returns:
        The assembled prompt string.
    """
    template = "given content: {}\n\ncorpus data: "
    return template.format(ophthalcontents)

In [3]:
# Instruction text that defines the model's role and the rules for turning a
# passage into plain-prose corpus data. Adjacent literals are concatenated into
# one single-line string; each piece except the last ends with a space.
GPT_system_prompt = (
    "You are an experienced ophthalmologist and medical editor. "
    "Please create corpus data suitable for pre-training of LLM using the given content. "
    "You cannot create content that is not included in the given content, and you must use only content that is related to ophthalmology. "
    "All non-textual data such as images, tables, and graphs, as well as personal identifiers or document-specific markers, must be removed. "
    "It must not contain author names, affiliations, unique references, or metadata not related to the ophthalmic content. "
    "Do not use headings or bullet points, present the content in natural language, and ensure that each sentence ends with a full stop. "
    "If there is no content that meets these criteria, display “No Content.”"
)

In [4]:
def call_openai_api(prompt, model="gpt-4o", retry_delay=10, max_retries=None):
    """Send one user prompt to the chat model and return the reply text.

    The request pairs the module-level ``GPT_system_prompt`` (system role) with
    ``prompt`` (user role), uses temperature 0.0 and asks for one completion.
    When the API raises ``RateLimitError`` the call sleeps and tries again.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name. Defaults to "gpt-4o", the value that used to be
            hard-coded.
        retry_delay: Seconds to sleep after a rate-limit error (default 10).
        max_retries: Maximum number of retries after rate-limit errors.
            ``None`` (the default) keeps retrying without limit, as before.

    Returns:
        The ``content`` field of the first choice's message.

    Raises:
        RateLimitError: Re-raised once ``max_retries`` retries have been used.
            Any other exception raised by the API call is not caught here.
    """
    retries_used = 0
    while True:  # retry loop; only left by returning or raising
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": GPT_system_prompt},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                # max_tokens is intentionally left unset.
                n=1,
                stop=None,
            )
        except RateLimitError as e:
            # Without a cap, a rate-limit condition that never clears would
            # keep this loop spinning forever; callers can now bound it.
            if max_retries is not None and retries_used >= max_retries:
                raise
            retries_used += 1
            print(f"Rate limit reached: {e}. Waiting for {retry_delay} seconds before retrying...")
            time.sleep(retry_delay)
        else:
            return response['choices'][0]['message']['content']

In [6]:
import os
import pandas as pd
# Note: the earlier per-file timer / 60-second cutoff logic has been removed,
# so every entry of every file is processed.

# Directory containing the parsed Excel files, and where refined files go.
directory_path = "./Parsed Ophthalmology PubMed/"
output_directory = "./Refined Ophthalmology PubMed"
os.makedirs(output_directory, exist_ok=True)

# Refine each Excel file in the input directory; one output file per input.
# Sorting makes the processing order deterministic (os.listdir order is arbitrary).
count = 0  # number of .xlsx inputs encountered so far, skipped ones included
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith(".xlsx"):
        continue

    corpus_file = os.path.join(directory_path, filename)
    count += 1

    # Output path: swap only the trailing extension, and join with os.path so
    # the "./" already present in output_directory is not prefixed a second time.
    stem, _ext = os.path.splitext(filename)
    output_file = os.path.join(output_directory, f"{stem}_refined.xlsx")

    # An existing refined file means this input was finished in an earlier run.
    if os.path.exists(output_file):
        print(f"Skipping {filename} as refined file already exists.")
        continue

    corpus_df = pd.read_excel(corpus_file)
    corpus_list = corpus_df["contents"].values.tolist()
    print(f"Processing {filename}: {len(corpus_list)} entries")

    # One API call per row of the "contents" column, kept in input order.
    answer_list = []
    for ophthal_content in corpus_list:
        prompt_ = Generation_prompt(ophthal_content)
        answer_ = call_openai_api(prompt_)
        answer_list.append(answer_)

    # The file is written only after every entry has been answered.
    df = pd.DataFrame(answer_list, columns=['answer'])
    df.to_excel(output_file, index=False)
    # Report the number of entries written; `count` is the file counter and
    # was previously printed here by mistake.
    print(f"Processed {len(answer_list)} entries for {filename}")

Skipping 10873994.xlsx as refined file already exists.
Skipping 12543742.xlsx as refined file already exists.
Skipping 14189705.xlsx as refined file already exists.
Skipping 15318671.xlsx as refined file already exists.
Skipping 16782956.xlsx as refined file already exists.
Skipping 17804924.xlsx as refined file already exists.
Skipping 18974526.xlsx as refined file already exists.
Skipping 19789662.xlsx as refined file already exists.
Skipping 20157413.xlsx as refined file already exists.
Skipping 20321588.xlsx as refined file already exists.
Skipping 20514260.xlsx as refined file already exists.
Skipping 20824865.xlsx as refined file already exists.
Skipping 21609425.xlsx as refined file already exists.
Skipping 21629575.xlsx as refined file already exists.
Skipping 22606489.xlsx as refined file already exists.
Skipping 23412528.xlsx as refined file already exists.
Skipping 23511997.xlsx as refined file already exists.
Skipping 23633345.xlsx as refined file already exists.
Skipping 2