### **필요한 라이브러리 설치**
  

> jpype1 : konlpy 설치를 위해 필요한 라이브러리  
>  konlpy : 한글 텍스트 전처리 라이브러리  
>  scikit-learn : TF-IDF 테이블 생성을 위해 필요한 라이브러리

In [1]:
!pip install jpype1
!pip install konlpy
!pip install scikit-learn

Collecting jpype1
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |▊                               | 10kB 23.2MB/s eta 0:00:01[K     |█▍                              | 20kB 10.8MB/s eta 0:00:01[K     |██▏                             | 30kB 8.3MB/s eta 0:00:01[K     |██▉                             | 40kB 7.5MB/s eta 0:00:01[K     |███▋                            | 51kB 4.3MB/s eta 0:00:01[K     |████▎                           | 61kB 4.8MB/s eta 0:00:01[K     |█████                           | 71kB 4.9MB/s eta 0:00:01[K     |█████▊                          | 81kB 5.3MB/s eta 0:00:01[K     |██████▌                         | 92kB 5.7MB/s eta 0:00:01[K     |███████▏                        | 102kB 4.2MB/s eta 0:00:01[K     |███████▉                        | 112kB 4.2MB/s eta 0:00:01[K     |████████▋                       | 12

In [2]:
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import os
import re
import nltk
import os
from konlpy.tag import Okt
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from shutil import rmtree

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### 경로 설정

BASE_DIR : 원본 데이터, 전처리 데이터를 저장하는 상위 디렉토리 경로  
TARGET_PATH : 전처리 데이터를 저장하는 경로  
ARTICLE_MEDIA_PATH : 원본 데이터를 저장하는 경로  
SWORDS_FILE_PATH : 불용어 리스트를 저장하는 텍스트 파일 경로  

In [3]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the data
# directories configured for this notebook are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Parent directory that holds both the raw and the preprocessed data.
BASE_DIR = "/content/gdrive/My Drive/Colab Notebooks/Text-preprocessing-Data/"
# Directory the preprocessed articles are written to.
TARGET_PATH = os.path.join(BASE_DIR,"preprocessed")
# Directory the raw articles are read from (one sub-directory per media outlet).
ARTICLE_MEDIA_PATH = os.path.join(BASE_DIR,"articles")
# Text file containing the stop-word list, one word per line.
SWORDS_FILE_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

디렉토리 생성 및 삭제 함수

In [5]:
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    A directory that already exists is not an error.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, for example because
            ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def del_folder(path):
    """Recursively delete the directory tree at ``path`` on a best-effort basis.

    Filesystem errors (missing path, permission problems, ...) are ignored on
    purpose so the function can be used as "remove it if it is there".

    Args:
        path: Directory to remove.
    """
    try:
        rmtree(path)
    except OSError:
        # Only filesystem failures are swallowed; KeyboardInterrupt,
        # SystemExit and programming errors still propagate.
        pass


In [6]:
def text2Sentences(text):
    """Split a '/'-delimited text into the list of its sentences."""
    delimiter = '/'
    sentences = text.split(delimiter)
    return sentences

def sentences2Text(sentences):
    """Join sentences into a single '/'-delimited text.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        The sentences concatenated with '/' between them.
    """
    return '/'.join(sentences)

In [7]:
def readArticle(filename):
    """Read the title, body and media name from an article file.

    The first three lines of the UTF-8 file are read as title, body and
    media name, in that order.

    Args:
        filename: Path of the article file.

    Returns:
        A ``(title, media, content)`` tuple. Note that ``media`` comes before
        ``content`` in the result although it comes after it in the file.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(filename, 'r', encoding='utf-8') as f:
        # Strip only the line terminator; a last line without a trailing
        # newline must not lose its final character.
        title = f.readline().rstrip('\n')
        content = f.readline().rstrip('\n')
        media = f.readline().rstrip('\n')

    return title, media, content

def cleanContent(content, media):
    """Normalise an article body before sentence splitting.

    Args:
        content: Raw article body.
        media: Media outlet name; every occurrence is removed from the body.

    Returns:
        The body with whitespace runs collapsed to a single space,
        parenthesised text removed and the media name stripped out.
    """
    # Collapse runs of spaces, tabs and newlines into one space.
    content = re.sub(r'\s+', ' ', content)
    # Drop every "(...)" group, whatever it contains (not only numbers).
    content = re.sub(r'\([^)]*\)', '', content)
    # Remove the media outlet name.
    content = content.replace(media, '')

    return content

def removeSpecialChar(text):
    """Remove punctuation and other non-word characters from ``text``.

    Args:
        text: Input string.

    Returns:
        The runs of word characters (letters, digits, underscore) found in
        ``text``, joined by single spaces.
    """
    # Same pattern as before, now a raw string so "\w" is not an invalid
    # string escape; re.findall returns the same matches the pattern-based
    # tokenizer produced, without building a tokenizer on every call.
    return ' '.join(re.findall(r"[\w]+", text))

def getStopWord(swords_filename):
    """Load the stop-word list from a text file.

    Args:
        swords_filename: Path of a UTF-8 text file with one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace.
    """
    # Decode explicitly as UTF-8 (like the article files) instead of relying
    # on the platform default encoding.
    with open(swords_filename, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]

def delStopWord(sentence):
    """Tokenise ``sentence`` into morphemes and drop stop words.

    Args:
        sentence: Sentence to process.

    Returns:
        The remaining morphemes joined by single spaces, or ``None`` when
        ``sentence`` is empty. Morphemes of length 1 are always dropped.
    """
    # Compare by value/truthiness: identity against a string literal
    # (``is ''``) is not a reliable emptiness test.
    if not sentence:
        return None

    # NOTE(review): a new Okt tagger is built and the stop-word file is
    # re-read on every call; callers that loop over many sentences pay that
    # cost per sentence.
    okt = Okt()
    swords = set(getStopWord(SWORDS_FILE_PATH))  # set gives O(1) membership tests
    return ' '.join(
        word for word in okt.morphs(sentence)
        if len(word) > 1 and word not in swords
    )

def getRmSwordSentences(sentences):
    """Apply ``delStopWord`` to each sentence and keep the non-``None`` results.

    After every input sentence the number of sentences kept so far is printed.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        List of stop-word-filtered sentences, in input order.
    """
    kept = []
    for raw in sentences:
        cleaned = delStopWord(raw)
        if cleaned is not None:
            kept.append(cleaned)
        # Progress output: running count of kept sentences.
        print(len(kept))
    return kept

def getNouns(sentences):
    """Reduce each non-empty sentence to its stop-word-filtered tokens.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        One space-joined token string per non-empty input sentence, in input
        order. Empty sentences are skipped, so the result can be shorter than
        ``sentences``. Tokens of length 1 and stop words are dropped.
    """
    okt = Okt()
    swords = set(getStopWord(SWORDS_FILE_PATH))  # set gives O(1) membership tests

    nouns = []
    for sentence in sentences:
        # Test emptiness by truthiness; identity against a string literal
        # (``is not ''``) is not a reliable comparison.
        if not sentence:
            continue
        # NOTE(review): despite the function name this uses okt.morphs (all
        # morphemes), not okt.nouns - confirm which one is intended.
        tokens = [
            noun for noun in okt.morphs(sentence)
            if len(noun) > 1 and noun not in swords
        ]
        nouns.append(' '.join(tokens))
    return nouns


def savePreprocessedText(media, article, nouns, title=None):
    """Write a preprocessed article to ``TARGET_PATH/<media>/<article>``.

    The file receives the title immediately followed by every entry of
    ``nouns``, each terminated by '/'. The '/'-terminated part is also printed.

    Args:
        media: Media outlet name; used as the sub-directory name.
        article: File name of the article inside the media directory.
        nouns: Iterable of preprocessed sentence strings.
        title: Article title. When omitted, the module-level ``title``
            variable is used, which is what this function always relied on
            before the parameter existed.

    Raises:
        NameError: If ``title`` is omitted and no module-level ``title`` exists.
        OSError: If the directory or file cannot be created.
    """
    if title is None:
        # Backward compatibility for callers that set a notebook-level
        # ``title`` instead of passing it explicitly.
        try:
            title = globals()["title"]
        except KeyError:
            raise NameError("name 'title' is not defined") from None

    media_dir = os.path.join(TARGET_PATH, media)
    mkdir_p(media_dir)
    save_path = os.path.join(media_dir, article)

    preprocessed = "".join(noun + "/" for noun in nouns)

    # Write as UTF-8 explicitly so the output does not depend on the platform
    # default encoding (the article files are read as UTF-8 as well).
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(title)
        f.write(preprocessed)
        print(preprocessed+"\n")


In [8]:
class GraphMatrix(object):
    """Builds sentence-level and word-level weight matrices from sentences."""

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        # Last sentence graph built by build_sent_graph(); empty until then.
        self.graph_sentence = []

    def build_sent_graph(self, sentences):
        """Return the sentence-by-sentence matrix of TF-IDF dot products.

        The TF-IDF vectorizer is fitted on ``sentences``; entry (i, j) is the
        dot product of the TF-IDF rows of sentences i and j. The matrix is
        also stored on ``self.graph_sentence``.
        """
        weights = self.tfidf.fit_transform(sentences).toarray()
        similarity = np.dot(weights, weights.T)
        self.graph_sentence = similarity
        return self.graph_sentence

    def build_words_graph(self, sentence):
        """Return ``(word_graph, idx2word)`` for the given sentences.

        ``word_graph`` is the word-by-word product of the column-normalised
        count matrix with itself; ``idx2word`` maps each column index of the
        count matrix back to its vocabulary word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        scaled = normalize(counts, axis=0)

        word_to_idx = self.cnt_vec.vocabulary_
        idx_to_word = {idx: word for word, idx in word_to_idx.items()}
        return np.dot(scaled.T, scaled), idx_to_word

In [9]:
class Rank(object):
    """Scores the nodes of a weighted graph by solving a damped linear system."""

    def get_ranks(self, graph, d=0.85):
        """Compute a rank score for every node of ``graph``.

        The diagonal of the matrix is zeroed, every column with a non-zero
        sum is divided by that sum, and the system
        ``(I - d * M) @ ranks = (1 - d)`` is solved for ``ranks``, where ``M``
        is the resulting column-scaled matrix.

        Args:
            graph: Square 2-D array-like of edge weights. It is not modified.
            d: Damping factor.

        Returns:
            Dict mapping node index to its rank score.

        Raises:
            numpy.linalg.LinAlgError: If the resulting system is singular.
        """
        # Work on a private float copy: the caller's matrix must stay intact,
        # and the in-place division below needs a float dtype.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]
        for col in range(matrix_size):
            A[col, col] = 0  # ignore self-links when summing the column
            link_sum = np.sum(A[:, col])
            if link_sum != 0:
                A[:, col] /= link_sum
            A[:, col] *= -d
            A[col, col] = 1
        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}
  

In [10]:
class TextRank(object):
    """Ranks the sentences and words of a '/'-delimited text.

    Construction splits the text, tokenises every sentence, builds the
    sentence and word graphs and ranks both; ``summarize`` and ``keywords``
    then read from those precomputed rankings.
    """

    def __init__(self, text):
        """Build all graphs and rankings for ``text``.

        Args:
            text: Sentences joined by '/'.
        """
        # Drop empty sentences up front: getNouns() skips them, so keeping
        # them here would shift every graph index relative to self.sentences
        # and summarize() would return the wrong sentences.
        self.sentences = [s for s in text2Sentences(text) if s]
        print("Get Sentences")
        self.nouns = getNouns(self.sentences)

        print("Get Nouns")
        self.graph_matrix = GraphMatrix()
        print("Construct GraphMatrix")
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        print("Sent Graph")
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)
        print("Word Graph")
        self.rank = Rank()
        print("Construct Rank")
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(
            self.sent_rank_idx, key=self.sent_rank_idx.get, reverse=True)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        # Sort by each word's score, best first (same order as the sentence
        # ranking). The key must return the score of ``k``; returning the whole
        # dict makes sorted() raise TypeError for two or more words.
        self.sorted_word_rank_idx = sorted(
            self.word_rank_idx, key=self.word_rank_idx.get, reverse=True)

    def summarize(self, sent_num=3):
        """Return the ``sent_num`` best-ranked sentences in document order.

        Args:
            sent_num: Maximum number of sentences to return.

        Returns:
            List of sentences, ordered as they appear in the text.
        """
        top_indices = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in top_indices]

    def keywords(self, word_num=10):
        """Return the ``word_num`` best-ranked words, best first.

        Args:
            word_num: Maximum number of keywords to return.

        Returns:
            List of words ordered by descending rank score.
        """
        # Reuse the ranking computed in __init__ instead of solving the word
        # graph again on every call.
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]

In [None]:
# Pick the first media directory and the first article inside it.
# NOTE(review): os.listdir() returns entries in arbitrary order, so "first"
# is not guaranteed to be the same article on every run.
media_list = os.listdir(ARTICLE_MEDIA_PATH)
media_path= os.path.join(ARTICLE_MEDIA_PATH, media_list[0])

document = os.listdir(media_path)[0]
# Load the article and normalise its body.
title, media, content = readArticle(os.path.join(media_path, document))
content = cleanContent(content, media)

# Split into sentences, strip non-word characters from each, then re-join
# them into the '/'-delimited form that TextRank expects.
sentences = sent_tokenize(content)
sentences = [removeSpecialChar(sentence) for sentence in sentences]
text = sentences2Text(sentences)

# Build the sentence/word graphs and rankings for this article.
textrank = TextRank(text)

Get Sentences


In [None]:
# Print the two best-ranked sentences, each followed by a blank line.
for row in textrank.summarize(2):
  print(row+'\n')