In [1]:
import re
import os
import click
import pdfplumber
import tempfile
from pdf2image import convert_from_path
import img2pdf
from PIL import Image
import io
import pandas as pd
from collections import defaultdict
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain.schema import Document
from rainbow_html_transformer import HTMLToTextWithMarkdownTables


In [2]:
def extract_text_with_page_info(pdf_path):
    """Return ``(page_number, text)`` pairs for every page of a PDF.

    The file is opened with pdfplumber. Pages are numbered starting at 1 and
    the text of each pair is whatever ``page.extract_text()`` returns for
    that page.
    """
    with pdfplumber.open(pdf_path) as document:
        # Build the list while the PDF is still open.
        return [
            (number, page.extract_text())
            for number, page in enumerate(document.pages, start=1)
        ]


In [3]:
# def extract_text_with_page_info(pdf_path):
#     """UpstageLayoutAnalysisLoader를 사용하여 PDF에서 페이지 정보를 포함한 텍스트를 추출합니다."""
#     loader = UpstageLayoutAnalysisLoader(pdf_path, split="page")
#     documents = loader.load()
    
#     text_with_page_info = []
#     # HTML 문서를 텍스트로 변환하는 변환기 생성
#     html_transformer = HTMLToTextWithMarkdownTables()
        
#     for doc in documents:
#         transformed_doc = html_transformer.transform_documents([doc])[0]
#         page_number = transformed_doc.metadata['page']  # 여기서 페이지 번호를 추출합니다
#         text_with_page_info.append((page_number, transformed_doc.page_content))
#         print(f"Processing page {page_number}")  # 디버깅을 위한 출력

    
#     return text_with_page_info
def convert_pdf_to_pdf(input_path, output_path):
    """Rasterize a PDF into page images and write them back out as a new PDF.

    Args:
        input_path: Path of the PDF to rasterize.
        output_path: Path of the PDF file to write.
    """
    # Render every page of the source PDF to an image.
    page_images = convert_from_path(input_path)

    # Encode each page image as PNG bytes in memory.
    png_pages = []
    for page_image in page_images:
        buffer = io.BytesIO()
        page_image.save(buffer, format='PNG')
        png_pages.append(buffer.getvalue())

    # Build the PDF before opening the destination so that a failed
    # conversion does not leave a truncated, empty output file behind.
    pdf_bytes = img2pdf.convert(png_pages)
    with open(output_path, "wb") as f:
        f.write(pdf_bytes)

def _load_upstage_documents(path):
    """Load a PDF page by page with UpstageLayoutAnalysisLoader (OCR enabled)."""
    loader = UpstageLayoutAnalysisLoader(
        path,
        split="page",
        use_ocr=True,  # enable OCR
        # ocr_languages=["eng", "kor"],  # OCR language override, left disabled
        exclude=["annotations"],
    )
    return loader.load()


def extract_text_with_page_info(pdf_path, output_dir="./processed_txt"):
    """Extract per-page text from a PDF with UpstageLayoutAnalysisLoader and save it.

    If loading the PDF raises ``KeyError``, the PDF is rasterized and rebuilt
    with ``convert_pdf_to_pdf`` into a temporary file and loading is retried
    on that copy. The temporary file is always removed afterwards.

    For every loaded page two files are written into ``output_dir``:
    ``<pdf name>_Page_<n>.html`` with the loader's raw page content and
    ``<pdf name>_Page_<n>.txt`` with the text produced by
    ``HTMLToTextWithMarkdownTables``.

    Args:
        pdf_path: Path of the PDF to process.
        output_dir: Directory that receives the per-page files. It is created
            if it does not exist. Defaults to ``"./processed_txt"``.

    Returns:
        A list of ``(page_number, text)`` tuples, or an empty list when the
        retry on the converted PDF fails as well.

    Raises:
        Exception: Anything other than ``KeyError`` raised by the first load
            attempt, and any error raised by ``convert_pdf_to_pdf``,
            propagates to the caller.
    """
    try:
        documents = _load_upstage_documents(pdf_path)
    except KeyError as exc:
        print(f"Error processing {pdf_path}: {exc}")
        print("Attempting to convert and reprocess the PDF...")

        # Reserve a temporary file name; the file is closed right away so
        # that convert_pdf_to_pdf can open and overwrite it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
            temp_pdf_path = temp_file.name

        try:
            convert_pdf_to_pdf(pdf_path, temp_pdf_path)
            try:
                documents = _load_upstage_documents(temp_pdf_path)
            except Exception as retry_exc:
                print(f"Error processing converted PDF: {retry_exc}")
                return []
        finally:
            # Remove the temporary PDF on every path, including a failed
            # conversion, so it never leaks.
            os.unlink(temp_pdf_path)

    text_with_page_info = []
    html_transformer = HTMLToTextWithMarkdownTables()

    # PDF file name without its extension, used as the output file prefix.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    os.makedirs(output_dir, exist_ok=True)

    for doc in documents:
        transformed_doc = html_transformer.transform_documents([doc])[0]
        page_number = transformed_doc.metadata['page']
        text_content = transformed_doc.page_content

        # Save the loader's raw page content as an HTML file.
        output_html_filename = f"{pdf_name}_Page_{page_number}.html"
        output_html_path = os.path.join(output_dir, output_html_filename)
        with open(output_html_path, 'w', encoding='utf-8') as file:
            file.write(doc.page_content)

        # Save the transformed text as a text file.
        output_filename = f"{pdf_name}_Page_{page_number}.txt"
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text_content)

        text_with_page_info.append((page_number, text_content))
        print(f"Processed and saved page {page_number} to {output_filename}")

    return text_with_page_info

In [4]:
def split_text_into_sections_with_metadata(text_with_page_info):
    """Split page texts into section / subsection records.

    A line of the form ``"<digits>. <title>"`` starts a new section and a line
    of the form ``"<digits>.<digits>. <title>"`` starts a new subsection of
    the current section. Every other line is appended (stripped, preceded by
    a single space) to the ``Content`` of the most recent record, which may
    have been started on an earlier page. Lines that appear before the first
    section heading are dropped.

    Args:
        text_with_page_info: Iterable of ``(page_number, text)`` pairs. Pages
            whose text is ``None`` or empty are skipped.

    Returns:
        A list of dicts with the keys ``Page``, ``Section``, ``Subsection``
        and ``Content``. ``Page`` is the page on which the heading was found
        and ``Subsection`` is ``''`` for section-level records.
    """
    section_pattern = re.compile(r'^\d+\.\s[^\n]+', re.MULTILINE)
    subsection_pattern = re.compile(r'^\d+\.\d+\.\s[^\n]+', re.MULTILINE)

    sections = []
    current_section = None

    for page_num, text in text_with_page_info:
        # A page can come back with no text at all; there is nothing to split.
        if not text:
            continue

        for line in text.splitlines():
            section_match = section_pattern.match(line)
            if section_match:
                current_section = section_match.group().strip()
                sections.append({
                    'Page': page_num,
                    'Section': current_section,
                    'Subsection': '',
                    'Content': ''
                })
                continue

            # Anything below needs an enclosing section to attach to.
            if not current_section:
                continue

            subsection_match = subsection_pattern.match(line)
            if subsection_match:
                sections.append({
                    'Page': page_num,
                    'Section': current_section,
                    'Subsection': subsection_match.group().strip(),
                    'Content': ''
                })
            else:
                # Body text always belongs to the most recently opened record.
                sections[-1]['Content'] += " " + line.strip()

    return sections

In [5]:
def sections_to_dataframe_with_metadata(sections, file_name):
    """Build a DataFrame from section records, tagging each row with a file name.

    Each record contributes one row with the columns ``File``, ``Page``,
    ``Section``, ``Subsection`` and ``Content``; the content is stripped of
    surrounding whitespace.
    """
    columns = ["File", "Page", "Section", "Subsection", "Content"]
    rows = [
        [
            file_name,
            record['Page'],
            record['Section'],
            record['Subsection'],
            record['Content'].strip(),
        ]
        for record in sections
    ]
    return pd.DataFrame(rows, columns=columns)

def save_sections_to_excel(df, output_path):
    """Write the DataFrame to an Excel file at ``output_path`` without the index column."""
    df.to_excel(excel_writer=output_path, index=False)


In [6]:
def process_pdfs_in_directory(pdf_directory, output_excel_path):
    """Process every PDF in a directory and save the combined sections to Excel.

    Each PDF is run through ``extract_text_with_page_info``,
    ``split_text_into_sections_with_metadata`` and
    ``sections_to_dataframe_with_metadata``; the resulting frames are
    concatenated and written with ``save_sections_to_excel``.

    Args:
        pdf_directory: Directory to scan (not recursive). Files are matched by
            a case-insensitive ``.pdf`` extension and processed in sorted
            name order so the output row order is reproducible.
        output_excel_path: Path of the Excel file to write. When the directory
            contains no PDFs, an Excel file with only the header row is
            written.
    """
    all_dataframes = []
    for file_name in sorted(os.listdir(pdf_directory)):
        if not file_name.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_directory, file_name)
        text_with_page_info = extract_text_with_page_info(pdf_path)
        sections = split_text_into_sections_with_metadata(text_with_page_info)
        all_dataframes.append(sections_to_dataframe_with_metadata(sections, file_name))

    if all_dataframes:
        # Merge all per-file frames into one.
        final_df = pd.concat(all_dataframes, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; fall back to an empty
        # frame that still carries the expected columns.
        final_df = sections_to_dataframe_with_metadata([], "")

    save_sections_to_excel(final_df, output_excel_path)

In [7]:
# Example usage
pdf_directory = "./raw_docs"  # directory that holds the PDF files to process
output_excel_path = "./output_sections_with_metadata.xlsx"  # path of the Excel file to write

process_pdfs_in_directory(pdf_directory, output_excel_path)

Processed and saved page 1 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_1.txt
Processed and saved page 2 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_2.txt
Processed and saved page 3 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_3.txt
Processed and saved page 4 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_4.txt
Processed and saved page 5 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_5.txt
Processed and saved page 6 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_6.txt
Processed and saved page 7 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_7.txt
Processed and saved page 8 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_8.txt
Processed and saved page 9 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_240401_v2_문구수정(1)_P11_Page_9.txt
Processed and saved page 10 to 027.사업방법서_신한큐브종합건강보장보험(무배당, 해약환급금 미지급형)_24

  table.replace_with(BeautifulSoup(markdown_table, 'html.parser'))


Error processing ./raw_docs/DB자산관리_약관_20240401_P22.pdf: '/S'
Attempting to convert and reprocess the PDF...
Processed and saved page 1 to DB자산관리_약관_20240401_P22_Page_1.txt
Processed and saved page 2 to DB자산관리_약관_20240401_P22_Page_2.txt
Processed and saved page 3 to DB자산관리_약관_20240401_P22_Page_3.txt
Processed and saved page 4 to DB자산관리_약관_20240401_P22_Page_4.txt
Processed and saved page 5 to DB자산관리_약관_20240401_P22_Page_5.txt
Processed and saved page 6 to DB자산관리_약관_20240401_P22_Page_6.txt
Processed and saved page 7 to DB자산관리_약관_20240401_P22_Page_7.txt
Processed and saved page 8 to DB자산관리_약관_20240401_P22_Page_8.txt
Processed and saved page 9 to DB자산관리_약관_20240401_P22_Page_9.txt
Processed and saved page 10 to DB자산관리_약관_20240401_P22_Page_10.txt
Processed and saved page 11 to DB자산관리_약관_20240401_P22_Page_11.txt
Processed and saved page 12 to DB자산관리_약관_20240401_P22_Page_12.txt
Processed and saved page 13 to DB자산관리_약관_20240401_P22_Page_13.txt
Processed and saved page 14 to DB자산관리_약관_20240401_P2