In [1]:
import os
import json
import re
import tempfile
from pdf2image import convert_from_path
import img2pdf
import fitz
from PIL import Image
import io
import pandas as pd
import time
from langchain_upstage import UpstageLayoutAnalysisLoader
from rainbow_html_transformer import HTMLToTextWithMarkdownTables
from tqdm import tqdm

In [2]:
def get_pdf_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    The document is opened only long enough to read its length and is closed
    before returning, so repeated calls in a long-lived kernel do not
    accumulate open file handles.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The page count reported by ``len()`` on the opened document.
    """
    # The context manager closes the document even if len() raises.
    with fitz.open(pdf_path) as doc:
        return len(doc)

In [3]:
def extract_text_with_ocr(pdf_path, max_pages=3):
    """OCR the first ``max_pages`` pages of a PDF and return their text.

    Each page is rasterized, written to a temporary PNG, sent through
    ``UpstageLayoutAnalysisLoader`` with OCR enabled, and the returned
    documents are passed through ``HTMLToTextWithMarkdownTables`` before their
    ``page_content`` is collected.

    Args:
        pdf_path: Path of the PDF file to process.
        max_pages: Last page (1-based, inclusive) to convert; conversion
            always starts at page 1.

    Returns:
        The collected text fragments joined with single spaces. If any step
        raises, the error is printed and whatever was collected up to that
        point is returned (possibly an empty string).
    """
    extracted_text = []
    try:
        # Convert the requested PDF pages to images.
        images = convert_from_path(pdf_path, first_page=1, last_page=max_pages)

        print(f"Processing {len(images)} pages...")

        # One transformer is reused for every page instead of being rebuilt
        # per iteration (assumes the transformer keeps no per-call state —
        # TODO confirm).
        html_transformer = HTMLToTextWithMarkdownTables()

        for img in tqdm(images, desc="Processing pages", unit="page"):
            temp_file_path = None
            try:
                # delete=False so the file survives the `with` block and can
                # be re-opened by path by the loader; it is removed manually
                # in `finally`.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
                    # Record the path before saving so a failed save still
                    # gets its temp file cleaned up.
                    temp_file_path = temp_file.name
                    img.save(temp_file, format="PNG")

                loader = UpstageLayoutAnalysisLoader(
                    file_path=temp_file_path,
                    use_ocr=True
                )
                documents = loader.load()

                for doc in documents:
                    # Strip HTML tags from the loader output.
                    transformed_doc = html_transformer.transform_documents([doc])[0]
                    extracted_text.append(transformed_doc.page_content)
            finally:
                if temp_file_path is not None:
                    os.unlink(temp_file_path)

    except Exception as e:
        # Best effort: report the failure and fall through to return the text
        # gathered so far.
        print(f"Error processing {pdf_path} with OCR: {e}")

    return ' '.join(extracted_text)

In [4]:
# Usage example:
# The Layout Analysis API cannot process documents beyond its 100-page limit.
folder_path = "./raw_docs"
output_folder_path = "./processed_txt"
pdf_file_name = "판매약관_ThePride신한참좋은치아보험PlusⅡ(무배당, 갱신형)_20240401_P252.pdf"

print(f"Processing file : {pdf_file_name}")
pdf_path = os.path.join(folder_path, pdf_file_name)

# Create the output directory up front so a missing folder is not discovered
# only after the OCR run has finished.
os.makedirs(output_folder_path, exist_ok=True)

# Process every page of the document.
max_pages = get_pdf_pages(pdf_path)
ocr_text = extract_text_with_ocr(pdf_path, max_pages)

# Save the result as a text file named after the PDF.
output_file_name = os.path.splitext(pdf_file_name)[0] + ".txt"
output_path = os.path.join(output_folder_path, output_file_name)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write(ocr_text)

print(f"OCR text saved to: {output_path}")

# Print the saved text (optional).
print("Extracted text:")
print(ocr_text[:500] + "...")  # show only the first 500 characters

Processing file : 판매약관_ThePride신한참좋은치아보험PlusⅡ(무배당, 갱신형)_20240401_P252.pdf
