In [24]:
import os
import csv
from tqdm import tqdm
import pdfplumber

In [25]:
def is_within_table(word_bbox, table_areas):
    """Return True when ``word_bbox`` lies entirely inside any table box.

    Both the word box and every entry of ``table_areas`` are indexed as
    ``(x0, top, x1, bottom)``. Containment is inclusive: a word whose edges
    coincide with the table's edges counts as inside.

    Args:
        word_bbox: Indexable 4-item bounding box of a single word.
        table_areas: Iterable of indexable 4-item table bounding boxes.

    Returns:
        True if at least one table box fully contains the word box,
        otherwise False (including when ``table_areas`` is empty).
    """
    return any(
        word_bbox[0] >= area[0]
        and word_bbox[2] <= area[2]
        and word_bbox[1] >= area[1]
        and word_bbox[3] <= area[3]
        for area in table_areas
    )

def extract_word_and_tables(page, doc_id):
    """Flatten one page into a single text chunk with tables kept in place.

    Words that fall inside a detected table are dropped from the running
    text; each table is instead rendered as pipe-separated rows at the
    vertical position of its bounding box.

    Args:
        page: Object exposing ``find_tables()`` (each table having ``bbox``
            and ``extract()``) and ``extract_words()`` (each word being a
            mapping with ``x0``, ``top``, ``x1``, ``bottom`` and ``text``).
            Presumably a pdfplumber page or cropped page.
        doc_id: String prefixed to the chunk so it carries the document
            title.

    Returns:
        The chunk as one string: ``doc_id``, then words separated by single
        spaces, with each table introduced by a fixed marker line and
        followed by its rows joined with ``' | '``.
    """
    tables = page.find_tables()
    table_areas = [table.bbox for table in tables]

    # Each item is (kind, bbox, payload, order). ``order`` is the word's
    # index in extraction order and is None for tables.
    items = []
    for order, word_info in enumerate(page.extract_words()):
        word_bbox = (
            word_info['x0'],
            word_info['top'],
            word_info['x1'],
            word_info['bottom'],
        )
        if not is_within_table(word_bbox, table_areas):
            items.append(('text', word_bbox, word_info['text'], order))

    # Record each table together with its position.
    for table in tables:
        items.append(('table', table.bbox, table.extract(), None))

    # Order by top coordinate. The middle key element keeps an int order
    # from ever being compared with a table's None when a word and a table
    # share the same top (that comparison raises TypeError); on such a tie
    # the word is placed before the table.
    items.sort(key=lambda item: (item[1][1], item[3] is None, item[3] or 0))

    parts = [doc_id + ' ']  # prefix the chunk with the document title
    pending_words = []

    def flush_words():
        # Top coordinates of words on one visual line can differ slightly
        # and scramble them, so words between two tables are emitted in
        # their original extraction order instead.
        pending_words.sort(key=lambda item: item[3])
        parts.extend(item[2] + ' ' for item in pending_words)
        pending_words.clear()

    for item in items:
        if item[0] == 'text':
            pending_words.append(item)
            continue

        flush_words()
        parts.append('이와 관련된 표.\n')
        for row in item[2]:
            # Newlines inside a cell become spaces; missing or empty cells
            # are shown as 'x'.
            cells = [(cell or '').replace('\n', ' ') or 'x' for cell in row]
            parts.append(' | '.join(cells))
            parts.append(" \n ")
        parts.append(" \n ")

    flush_words()
    return ''.join(parts)

In [26]:
# Convert every PDF under ./data/train_source/ into rows of
# ./processed_data/train_data.csv, one row per page (or per half-page for
# wide pages).
cur_path = os.getcwd()
data_dir = os.path.join(cur_path, './data/train_source/')
pdf_files = [f for f in os.listdir(data_dir) if f.endswith('.pdf')]

csv_file_path = os.path.join(cur_path, './processed_data/train_data.csv')
# Create the output directory up front so opening the CSV cannot fail on a
# fresh checkout.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

# The CSV is opened once for the whole run instead of being reopened in
# append mode for every page.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['doc_id', 'doc'])

    for pdf_file in pdf_files:
        file_name, _ = os.path.splitext(pdf_file)
        print(file_name)
        with pdfplumber.open(os.path.join(data_dir, pdf_file)) as pdf:
            for page_num, page in enumerate(tqdm(pdf.pages)):
                page_width = page.width

                if page_width > 600:
                    # Pages wider than 600pt are split down the middle and
                    # each half becomes its own row: id suffix '1' for the
                    # left half, '2' for the right half.
                    mid_x = page_width / 2
                    left_clip = (0, 0, mid_x, page.height)
                    right_clip = (mid_x, 0, page_width, page.height)

                    left_page = page.within_bbox(left_clip)
                    right_page = page.within_bbox(right_clip)

                    left_doc = extract_word_and_tables(left_page, file_name)
                    left_doc_id = file_name + str(page_num) + '1'

                    right_doc = extract_word_and_tables(right_page, file_name)
                    right_doc_id = file_name + str(page_num) + '2'

                    writer.writerow([left_doc_id, left_doc])
                    writer.writerow([right_doc_id, right_doc])
                else:
                    # Narrow pages are written whole with id suffix '0'.
                    doc = extract_word_and_tables(page, file_name)
                    doc_id = file_name + str(page_num) + '0'
                    writer.writerow([doc_id, doc])



「FIS 이슈 & 포커스」 23-3호 《조세지출 연계관리》


100%|██████████| 9/9 [00:02<00:00,  4.01it/s]


국토교통부_소규모주택정비사업


100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


국토교통부_전세임대(융자)


100%|██████████| 4/4 [00:01<00:00,  2.89it/s]


보건복지부_생계급여


100%|██████████| 4/4 [00:00<00:00,  5.84it/s]


2024 나라살림 예산개요


100%|██████████| 314/314 [01:37<00:00,  3.22it/s]


2024년도 성과계획서(총괄편)


100%|██████████| 345/345 [01:40<00:00,  3.44it/s]


월간 나라재정 2023년 12월호


100%|██████████| 68/68 [00:34<00:00,  1.99it/s]


재정통계해설


100%|██████████| 164/164 [00:18<00:00,  8.67it/s]


「FIS 이슈 & 포커스」 22-3호 《재정융자사업》


100%|██████████| 9/9 [00:02<00:00,  3.18it/s]


고용노동부_청년일자리창출지원


100%|██████████| 3/3 [00:01<00:00,  2.17it/s]


1-1 2024 주요 재정통계 1권


100%|██████████| 137/137 [00:11<00:00, 12.16it/s]


보건복지부_노인일자리 및 사회활동지원


100%|██████████| 5/5 [00:00<00:00,  5.10it/s]


국토교통부_민간임대(융자)


100%|██████████| 3/3 [00:01<00:00,  2.14it/s]


고용노동부_내일배움카드(일반)


100%|██████████| 4/4 [00:00<00:00,  6.16it/s]


중소벤처기업부_창업사업화지원


100%|██████████| 2/2 [00:00<00:00,  3.59it/s]


고용노동부_조기재취업수당


100%|██████████| 3/3 [00:01<00:00,  2.69it/s]
