In [1]:
import os
import json
import requests
import re
import tempfile
from pdf2image import convert_from_path
import img2pdf
import fitz
from PIL import Image
import io
import pandas as pd
import time
from langchain_upstage import UpstageLayoutAnalysisLoader
# from rainbow_html_transformer_old import HTMLToTextWithMarkdownTables
from rainbow_html_transformer import HTMLToTextWithMarkdownTables
from tqdm import tqdm

In [None]:
# Configuration so the API key is managed through environment variables
# instead of being written into the notebook.
from dotenv import load_dotenv

# Load the API key settings (python-dotenv's load_dotenv).
load_dotenv()

# Read the API key from the environment; os.getenv returns None when the
# variable UPSTAGE_API_KEY is not set.
API_KEY = os.getenv("UPSTAGE_API_KEY")

In [2]:
def get_pdf_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    The document is opened with PyMuPDF (``fitz``) and closed again before
    returning, so repeated calls do not accumulate open file handles in the
    kernel.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        int: The page count, i.e. ``len()`` of the opened document.
    """
    # The document is used as a context manager so it is closed on exit,
    # including when len() raises.
    with fitz.open(pdf_path) as doc:
        return len(doc)

In [3]:
def extract_text_with_ocr(pdf_path, max_pages=3):
    """OCR the leading pages of a PDF and return the text as a single string.

    Each page is rendered to an image, written to a temporary PNG file, loaded
    through ``UpstageLayoutAnalysisLoader`` with OCR enabled, and every returned
    document is passed through ``HTMLToTextWithMarkdownTables``. Each page that
    yields text is followed by an ``--- End of Page N ---`` marker.

    Errors are reported with ``print`` and are not raised. A page that fails at
    any step (including writing its temporary image) is skipped and the
    remaining pages are still processed; its temporary file is removed in all
    cases. A failure while rendering the PDF itself ends processing and returns
    whatever was collected.

    Args:
        pdf_path: Path of the PDF file to process.
        max_pages: Passed to ``convert_from_path`` as ``last_page`` (rendering
            starts at page 1).

    Returns:
        str: The concatenated page texts with page markers, or ``""`` when no
        text was extracted.
    """
    extracted_text = []
    html_transformer = HTMLToTextWithMarkdownTables()

    try:
        images = convert_from_path(pdf_path, first_page=1, last_page=max_pages)

        print(f"Processing {len(images)} pages...")
        for i, img in enumerate(tqdm(images, desc="Processing pages", unit="page")):
            # Stays None until a temporary file actually exists, so the cleanup
            # below only touches files this iteration created.
            temp_file_path = None
            try:
                # delete=False: the file must outlive this `with` block because
                # the loader below is handed its path, not the open handle.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
                    temp_file_path = temp_file.name
                    img.save(temp_file, format="PNG")

                loader = UpstageLayoutAnalysisLoader(
                    file_path=temp_file_path,
                    use_ocr=True
                )
                documents = loader.load()

                if not documents:
                    print(f"Warning: No content extracted from page {i+1}")
                    continue

                page_content = ""
                for doc in documents:
                    if doc is None or not doc.page_content:
                        print(f"Warning: Empty document encountered for page {i+1}")
                        continue
                    transformed_doc = html_transformer.transform_document(doc)
                    page_content += transformed_doc.page_content

                # Tidy the page text and append the page marker.
                page_content = page_content.strip()
                if page_content:
                    page_content += f"\n\n--- End of Page {i+1} ---\n\n"
                    extracted_text.append(page_content)
                else:
                    print(f"Warning: No content extracted from page {i+1}")
            except Exception as e:
                print(f"Error processing page {i+1}: {e}")
            finally:
                if temp_file_path is not None:
                    try:
                        os.unlink(temp_file_path)
                    except OSError as cleanup_error:
                        # A cleanup failure must not abort the remaining pages.
                        print(f"Warning: Could not remove temporary file {temp_file_path}: {cleanup_error}")

    except Exception as e:
        print(f"Error processing {pdf_path} with OCR: {e}")

    if not extracted_text:
        print("Warning: No text was extracted from the document.")

    return ''.join(extracted_text)

In [None]:
def extract_text_with_async_ocr(pdf_path, max_pages=1000, poll_interval=10, request_timeout=60):
    """Submit a PDF to the asynchronous layout-analysis endpoint and return its text.

    The file is uploaded with OCR enabled, the request status is polled until
    it is ``"completed"`` or ``"failed"``, and every result batch is downloaded.
    The text of each page's blocks is joined with newlines and followed by an
    ``--- End of Page N ---`` marker.

    NOTE(review): the result schema read here (``batches`` -> ``download_url``,
    ``pages`` -> ``blocks`` -> ``text``, ``page_num``) is taken as-is from the
    original code; confirm it against the current API reference.

    Args:
        pdf_path: Path of the PDF file to upload.
        max_pages: Maximum number of pages to include in the returned text, in
            the order the batches list them. ``None`` disables the limit. The
            whole file is still uploaded; only the returned text is capped.
        poll_interval: Seconds to sleep between status polls.
        request_timeout: Timeout handed to every ``requests`` call so a stalled
            connection raises instead of blocking forever.

    Returns:
        str: The concatenated page texts with page markers.

    Raises:
        RuntimeError: If the service reports the request as failed.
        requests.RequestException: On HTTP errors (``raise_for_status``),
            connection errors, or timeouts.
        KeyError: If a response lacks one of the expected keys.
    """
    url = "https://api.upstage.ai/v1/document-ai/async/layout-analysis"
    headers = {
        "Authorization": f"Bearer {API_KEY}"
    }

    # Submit the inference request; the file only needs to stay open for the upload.
    with open(pdf_path, "rb") as file:
        response = requests.post(
            url,
            headers=headers,
            files={"document": file},
            data={"ocr": "true"},
            timeout=request_timeout,
        )
    response.raise_for_status()
    request_id = response.json()["request_id"]

    # Poll until the request reaches a terminal state.
    status_url = f"https://api.upstage.ai/v1/document-ai/requests/{request_id}"
    while True:
        response = requests.get(status_url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        status_data = response.json()

        status = status_data["status"]
        if status == "completed":
            break
        if status == "failed":
            # .get(): a missing message must not mask the failure with a KeyError.
            failure = status_data.get("failure_message", "no failure message provided")
            raise RuntimeError(f"Processing failed: {failure}")

        time.sleep(poll_interval)  # Wait before polling again

    # Download and assemble the results, stopping once max_pages pages are collected.
    extracted_text = []
    pages_emitted = 0
    for batch in status_data["batches"]:
        if max_pages is not None and pages_emitted >= max_pages:
            break
        # The download URL is fetched without the Authorization header, as in
        # the original code (presumably a pre-signed URL; verify).
        batch_response = requests.get(batch["download_url"], timeout=request_timeout)
        batch_response.raise_for_status()
        batch_data = batch_response.json()

        for page in batch_data["pages"]:
            if max_pages is not None and pages_emitted >= max_pages:
                break
            page_content = "\n".join(block["text"] for block in page["blocks"])
            extracted_text.append(page_content)
            extracted_text.append(f"\n\n--- End of Page {page['page_num']} ---\n\n")
            pages_emitted += 1

    return "".join(extracted_text)

In [4]:
# Example usage:
# Original author's note: documents at or above the Layout Analysis API's
# 100-page maximum cannot be processed (limit not verifiable from this notebook).
folder_path = "./raw_docs"
output_folder_path = "./processed_txt"
# Alternative sample inputs, kept commented out:
# pdf_file_name = "판매약관_ThePride신한참좋은치아보험PlusⅡ(무배당, 갱신형)_20240401_P252.pdf"
# pdf_file_name = "SHL0165_The안심VIP저축보험Ⅱ(무배당)_P116.pdf"
# pdf_file_name = "DB자산관리_약관_20240401_P22.pdf"
# pdf_file_name = "IRP자산관리_약관(기업형)_20240401_P24.pdf"
pdf_file_name = "./상품1/판매약관_신한(간편가입)모아더드림종신보험(무배당, 해약환급금 일부지급형)_20240610.pdf"
print(f"Processing file : {pdf_file_name}")
pdf_path = os.path.join(folder_path, pdf_file_name)

# Use the document's own page count as the page limit, i.e. OCR every page.
max_pages = get_pdf_pages(pdf_path)
ocr_text = extract_text_with_ocr(pdf_path, max_pages)

# Save as a text file: the output name is the PDF's base name with a .txt
# extension, so the input's sub-folder is not reproduced under the output folder.
output_file_name = os.path.splitext(os.path.basename(pdf_file_name))[0] + ".txt"
output_path = os.path.join(output_folder_path, output_file_name)

# Create the output directory if it does not exist.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write(ocr_text)

print(f"OCR text saved to: {output_path}")

# Print the saved text (optional).
print("Extracted text:")
print(ocr_text[:500] + "...")  # print only the first 500 characters

Processing file : ./상품1/판매약관_신한(간편가입)모아더드림종신보험(무배당, 해약환급금 일부지급형)_20240610.pdf
Processing 248 pages...


Processing pages:   2%|▏         | 6/248 [00:59<34:38,  8.59s/page]



Processing pages:   4%|▍         | 10/248 [01:15<19:28,  4.91s/page]



Processing pages:  11%|█         | 27/248 [10:58<8:36:15, 140.16s/page]

Error processing page 27: Failed to send request: HTTPSConnectionPool(host='api.upstage.ai', port=443): Max retries exceeded with url: /v1/document-ai/layout-analysis (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:992)')))


Processing pages:  27%|██▋       | 66/248 [19:34<31:22, 10.34s/page]   

Error processing page 66: Failed to send request: HTTPSConnectionPool(host='api.upstage.ai', port=443): Max retries exceeded with url: /v1/document-ai/layout-analysis (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f4619a1be50>: Failed to resolve 'api.upstage.ai' ([Errno -3] Temporary failure in name resolution)"))


Processing pages:  27%|██▋       | 68/248 [36:08<1:35:38, 31.88s/page]


KeyboardInterrupt: 