## import

In [1]:
import base64
import json
import requests
import time
import os

## Install pdf

## API setting

In [4]:
from dotenv import load_dotenv

# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Read the Upstage API key from the environment.
API_KEY = os.getenv("UPSTAGE_API_KEY")
if not API_KEY:
    # Fail fast: os.getenv returns None when the variable is missing, and the
    # f-string below would then build the literal header "Bearer None", which
    # would only surface later as a confusing request failure.
    raise RuntimeError(
        "UPSTAGE_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Endpoints used by this notebook: schema generation first, then extraction.
SCHEMA_API_URL = "https://api.upstage.ai/v1/information-extraction/schema-generation"
EXTRACTION_API_URL = "https://api.upstage.ai/v1/information-extraction"

# Headers shared by both requests: bearer-token auth and a JSON body.
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

## Input Data

In [5]:
# Sample chest X-ray report used as the input document for this notebook.
# The "XXXX" tokens are part of the text exactly as given here; they are
# passed through to the PDF unchanged.
user_input = """Chest PA-Lat XR
 Imaging Study
 Xray Chest PA and Lateral
 Exam: 2 views of the chest XXXX/XXXX.
 Comparison: None.
 Indication: Positive TB test
 Findings:
 The cardiac silhouette and mediastinum size are within normal limits.
 There is no pulmonary edema. There is no focal consolidation. There
 are no XXXX of a pleural effusion. There is no evidence of
 pneumothorax.
 Impression:
 Normal chest x-XXXX.
 This examination and reported findings have been reviewed and
 confirmed by the undersigned"""

## Data processing (PDF)

In [6]:
from fpdf import FPDF
import textwrap

def text_to_pdf(text, pdf_path="user_input.pdf", width=100):
    """Render plain text into a PDF, writing one wrapped line per cell.

    Every input line is wrapped with ``textwrap.wrap`` to at most ``width``
    characters. Lines that wrap to nothing (empty or whitespace-only) are
    kept as a blank row so the vertical spacing of the text is preserved.

    Args:
        text: Text to render; newline characters separate the lines.
        pdf_path: Path the PDF file is written to.
        width: Maximum characters per rendered line. Defaults to 100, the
            value that was previously hard-coded.

    Returns:
        ``pdf_path``, so the call can be chained into the next step.
    """
    # Local import keeps this fix self-contained within the function.
    from fpdf.enums import XPos, YPos

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Helvetica", size=12)

    for raw_line in text.split("\n"):
        # textwrap.wrap returns [] for an empty line; substitute a single
        # space so the blank line still occupies a row in the PDF.
        for line in textwrap.wrap(raw_line, width=width) or [" "]:
            # new_x/new_y are the fpdf2 replacement for the deprecated
            # ln=True: return to the left margin and move to the next line.
            pdf.cell(0, 10, line, new_x=XPos.LMARGIN, new_y=YPos.NEXT)

    pdf.output(pdf_path)
    return pdf_path


In [7]:
import base64

def encode_pdf_to_base64(pdf_path):
    """Return the contents of the file at ``pdf_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a JSON payload.
    """
    with open(pdf_path, "rb") as pdf_file:
        encoded_bytes = base64.b64encode(pdf_file.read())
    return encoded_bytes.decode("utf-8")


In [8]:
# Write the report text to a PDF on disk, then base64-encode that file so it
# can be embedded in the request payloads built in the cells below.
pdf_path = text_to_pdf(user_input)
base64_pdf = encode_pdf_to_base64(pdf_path)


  pdf.cell(0, 10, line, ln=True)


## Automatic schema generation

In [9]:
# Request body for automatic schema generation: the PDF is sent inline as a
# base64 data URL inside a single user message.
schema_payload = {
    "model": "information-extract",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    # Declare the file's MIME type as application/pdf.
                    "image_url": {"url": f"data:application/pdf;base64,{base64_pdf}"}
                }
            ]
        }
    ]
}

print("자동 스키마 생성 요청 중...")
# requests applies no timeout by default, so a stalled connection would block
# this cell forever. 120 s is a generous upper bound - adjust if needed.
schema_response = requests.post(
    SCHEMA_API_URL,
    headers=HEADERS,
    json=schema_payload,
    timeout=120,
)

자동 스키마 생성 요청 중...


In [10]:
# Interpret the schema-generation response: report a failure first, otherwise
# parse and display the generated schema.
if schema_response.status_code != 200:
    print(f"스키마 생성 실패, 상태 코드: {schema_response.status_code}")
    print(schema_response.text)
    # Downstream cells check this value to decide whether to run extraction.
    schema_generated = None
else:
    schema_json = schema_response.json()
    # The response resembles an OpenAI Chat Completion object; the generated
    # schema is a JSON-encoded string stored at choices[0].message.content.
    schema_str = schema_json["choices"][0]["message"]["content"]
    schema_generated = json.loads(schema_str)
    print("자동 생성된 스키마:")
    print(json.dumps(schema_generated, indent=2, ensure_ascii=False))

자동 생성된 스키마:
{
  "type": "json_schema",
  "json_schema": {
    "name": "document_schema",
    "schema": {
      "type": "object",
      "properties": {
        "studyType": {
          "type": "string",
          "description": "Type of imaging study performed."
        },
        "examDescription": {
          "type": "string",
          "description": "Description of the exam conducted."
        },
        "comparison": {
          "type": "string",
          "description": "Comparison with previous studies, if any."
        },
        "indication": {
          "type": "string",
          "description": "Reason for the imaging study."
        },
        "findings": {
          "type": "string",
          "description": "Detailed findings from the imaging study."
        },
        "impression": {
          "type": "string",
          "description": "Summary impression based on the findings."
        },
        "reviewedBy": {
          "type": "boolean",
          "description": "Indi

## Request

In [11]:
# Run the extraction only when a schema was generated successfully.
if schema_generated:
    extraction_payload = {
        "model": "information-extract",  # same model as the schema-generation call
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:application/pdf;base64,{base64_pdf}"}
                    }
                ]
            }
        ],
        "response_format": schema_generated  # apply the auto-generated schema
    }

    print("정보 추출 요청 중...")
    # requests applies no timeout by default, so a stalled connection would
    # block this cell forever. 120 s is a generous upper bound - adjust if needed.
    extraction_response = requests.post(
        EXTRACTION_API_URL,
        headers=HEADERS,
        json=extraction_payload,
        timeout=120,
    )

    if extraction_response.status_code == 200:
        extraction_json = extraction_response.json()
        # The extracted fields arrive as a JSON-encoded string in
        # choices[0].message.content, so decode it before display.
        extraction_str = extraction_json["choices"][0]["message"]["content"]
        extracted_info = json.loads(extraction_str)
        print("추출된 정보:")
        print(json.dumps(extracted_info, indent=2, ensure_ascii=False))
    else:
        print(f"정보 추출 요청 실패, 상태 코드: {extraction_response.status_code}")
        print(extraction_response.text)
else:
    print("자동 스키마 생성이 실패하여 정보 추출을 진행할 수 없습니다.")

정보 추출 요청 중...
추출된 정보:
{
  "studyType": "Xray Chest PA and Lateral",
  "examDescription": "2 views of the chest X x.",
  "comparison": "None.",
  "indication": "Positive TB test",
  "findings": "The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.",
  "impression": "Normal chest x-XXXX.",
  "reviewedBy": true
}
