In [13]:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen
from konlpy.tag import Mecab
import numpy as np
import math
import re
import os

In [14]:
def read_pdf_file(pdfFile):
    pdfrm = PDFResourceManager()
    strio = StringIO()
    lapa = LAParams()
    device = TextConverter(pdfrm, strio, laparams = lapa)
    
    process_pdf(pdfrm, device, pdfFile)
    device.close()
    
    content = strio.getvalue()
    strio.close()
    return content

def token (id, name):
    mecab = Mecab()
    stop = ["아니"]

    tokens = [token for token in mecab.pos(id)]
    nodes = [t[0] for t in tokens]
    vocab = [t[0] for t in tokens if t[0] not in stop if t[1] in ['NNG', 'NNP'] and len(t[0]) > 1]

    vocab = list(set(vocab))
    vocab2idx = {vocab[i]:i for i in range(len(vocab))}
    idx2vocab = {i:vocab[i] for i in range(len(vocab))}
    vocab_len = len(vocab2idx)

    # 토큰별로 그래프 edge를 Matrix 형태로 생성
    weighted_edge = np.zeros((vocab_len,vocab_len),dtype=np.float32)

    # 각 토큰 노드별로 스코어 1로 초기화
    score = np.ones((vocab_len),dtype=np.float32)

    # coocurrence를 판단하기 위한 window 사이즈 설정
    window_size = 4
    covered_coocurrences = []

    for window_start in range(len(nodes) - window_size + 1):
        window = nodes[window_start:window_start+window_size]
        for i in range(window_size):
            for j in range(i+1, window_size):
                if window[i] in vocab and window[j] in vocab:
                    index_i = window_start + i
                    index_j = window_start + j

                    if (index_i, index_j) not in covered_coocurrences:
                        weighted_edge[vocab2idx[window[i]]][vocab2idx[window[j]]] = 1
                        weighted_edge[vocab2idx[window[j]]][vocab2idx[window[i]]] = 1
                        covered_coocurrences.append((index_i, index_j))

    for i in range(vocab_len):
        row_sum = weighted_edge[i].sum()
        weighted_edge[i] = weighted_edge[i]/row_sum if row_sum > 0 else 0

    MAX_ITERATIONS = 50
    d=0.85
    threshold = 0.0001 #convergence threshold

    for iter in range(MAX_ITERATIONS):
        prev_score = np.copy(score)

        for i in range(vocab_len):
            summation = 0
            for j in range(vocab_len):
                if weighted_edge[j][i] != 0:
                    summation += weighted_edge[j][i] * prev_score[j]

            score[i] = (1 - d) * d*summation

        if np.sum(np.fabs(prev_score -  score)) <= threshold:
            break


    sorted_index = np.flip(np.argsort(score), 0)

    n = 100

    name = []
    print("\n=== 핵심키워드 ===")
    for i in range(0,n):
        name.append(str(idx2vocab[sorted_index[i]]))
    
    return name

In [15]:
pdf_BNK = open("ESG_StoA/BNK금융.pdf", "rb")
a = read_pdf_file(pdf_BNK)
pdf_BNK.close() 

token (a, BNK)




=== 핵심키워드 ===


['금융',
 '그룹',
 '지원',
 '지역',
 '경영',
 '고객',
 '사회',
 '관리',
 '기업',
 '운영',
 '활동',
 '정보',
 '강화',
 '평가',
 '환경',
 '교육',
 '투자',
 '리스크',
 '대상',
 '확대',
 '사업',
 '업무',
 '디지털',
 '부산은행',
 '대응',
 '다양',
 '관련',
 '시스템',
 '지속',
 '대출',
 '서비스',
 '추진',
 '이사',
 '실시',
 '가능',
 '친환경',
 '임직원',
 '전략',
 '기술',
 '상품',
 '경남은행',
 '주요',
 '체계',
 '은행',
 '보호',
 '문화',
 '조직',
 '구축',
 '변화',
 '제도',
 '개선',
 '정책',
 '항목',
 '성과',
 '신용',
 '부산',
 '혁신',
 '영업',
 '제공',
 '사항',
 '직원',
 '기후',
 '자금',
 '내부',
 '위원회',
 '윤리',
 '인권',
 '개인',
 '제고',
 '현황',
 '기준',
 '성장',
 '산업',
 '예방',
 '참여',
 '자산',
 '보안',
 '에너지',
 '개발',
 '데이터',
 '시장',
 '경제',
 '대표',
 '발생',
 '기반',
 '분야',
 '선정',
 '적극',
 '중소기업',
 '계열사',
 '과정',
 '이해',
 '사외',
 '글로벌',
 '실적',
 '여신',
 '배출',
 '프로그램',
 '상담',
 '전문']