<a href="https://colab.research.google.com/github/meteor79/TPS/blob/master/TextRank_doc_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install newspaper3k # URL crawling package
!pip install jpype1
!pip install konlpy # Korean morphological analyzer
!pip install scikit-learn # machine-learning package, used here for TF-IDF
!pip install html2text # HTML-to-text conversion used by getHtmlText

Collecting newspaper3k
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b9/51afecb35bb61b188a4b44868001de348a0e8134b4dfa00ffc191567c4b9/newspaper3k-0.2.8-py3-none-any.whl (211kB)
[K     |█▌                              | 10kB 16.1MB/s eta 0:00:01[K     |███                             | 20kB 1.7MB/s eta 0:00:01[K     |████▋                           | 30kB 2.2MB/s eta 0:00:01[K     |██████▏                         | 40kB 1.6MB/s eta 0:00:01[K     |███████▊                        | 51kB 1.8MB/s eta 0:00:01[K     |█████████▎                      | 61kB 2.1MB/s eta 0:00:01[K     |██████████▉                     | 71kB 2.3MB/s eta 0:00:01[K     |████████████▍                   | 81kB 2.5MB/s eta 0:00:01[K     |██████████████                  | 92kB 2.8MB/s eta 0:00:01[K     |███████████████▌                | 102kB 2.7MB/s eta 0:00:01[K     |█████████████████               | 112kB 2.7MB/s eta 0:00:01[K     |██████████████████▋             | 122kB 2.7MB/

In [0]:
# -*- coding: utf-8 -*-

from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import requests
import html2text


def getHtmlText(url, timeout=10):
    """Download `url` and return its HTML converted to plain text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to `requests.get`, so an unresponsive
            server cannot block the notebook indefinitely.

    Returns:
        The text produced by html2text with links and images dropped,
        or None when fetching, decoding or conversion raised. The
        exception is printed rather than propagated (best-effort).
    """
    try:
        headers = {'User-agent': 'Mozilla/5.0'}
        page = requests.get(url, headers=headers, timeout=timeout)
        html_code = page.content
        try:
            html = html_code.decode("utf-8")
        except UnicodeDecodeError:
            # Not every page is UTF-8; fall back to the encoding detected
            # by requests and replace bytes that still cannot be decoded.
            html = html_code.decode(page.apparent_encoding or "utf-8",
                                    errors="replace")
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        h.single_line_break = True
        return h.handle(html)
    except Exception as e:
        print(e)
        # The original returned the undefined name `null`, which raised
        # NameError instead of signalling the failure to the caller.
        return None

# Sentence extraction
class SentenceTokenizer(object):
    """Splits text into sentences with Kkma and extracts nouns with Okt."""

    def __init__(self):
        self.kkma = Kkma()
        self.Okt = Okt()

    def url2sentences(self, url):
        """Fetch `url` via getHtmlText and split the page text into sentences.

        Returns:
            The list of sentences reported by Kkma, or an empty list when
            the page yielded no text (for example because the download
            failed).
        """
        htmltext = getHtmlText(url)
        if not htmltext:
            return []
        return self.kkma.sentences(htmltext)

    def text2sentences(self, text, min_length=10):
        """Split `text` into sentences, gluing short fragments to their predecessor.

        Args:
            text: Raw text to split.
            min_length: A sentence of at most this many characters is
                treated as a fragment and appended to the previous
                sentence, separated by a single space.

        Returns:
            List of sentences without empty placeholders. A short fragment
            at the very beginning has no predecessor and is kept as its
            own entry.
        """
        if not text:
            return []
        merged = []
        for sentence in self.kkma.sentences(text):
            if not sentence:
                continue
            if len(sentence) <= min_length and merged:
                # Append to the last kept sentence. Indexing `idx - 1` on the
                # raw list wrapped around to the final sentence for idx == 0
                # and stranded consecutive fragments in blanked-out slots.
                merged[-1] += ' ' + sentence
            else:
                merged.append(sentence)
        return merged

    def get_nouns(self, sentences):
        """Run Okt part-of-speech tagging and collect every token tagged 'Noun'.

        Args:
            sentences: Iterable of sentence strings; empty strings are skipped.

        Returns:
            One flat list with the nouns of all sentences, in input order.
        """
        nouns = []
        for sentence in sentences:
            if not sentence:
                continue
            tagged = self.Okt.pos(sentence, norm=True, stem=True)
            nouns.extend(word for word, tag in tagged if tag == 'Noun')
        return nouns


# Build the TF-IDF model and the similarity graphs
class GraphMatrix(object):
    """Builds the sentence graph and the word graph from a list of documents."""

    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []

    def build_sent_graph(self, sentence):
        """Return the matrix of pairwise dot products between TF-IDF rows.

        `sentence` is the collection of documents handed to the TF-IDF
        vectorizer. The result is also stored on `self.graph_sentence`.
        """
        weights = self.tfidf.fit_transform(sentence).toarray()
        self.graph_sentence = np.dot(weights, weights.T)
        return self.graph_sentence

    def build_words_graph(self, sentence):
        """Return `(word_graph, index_to_word)` for the given documents.

        The count matrix is normalized column-wise (`axis=0`) before its
        transpose is multiplied with it, giving one row/column per
        vocabulary entry. `index_to_word` inverts the vectorizer's
        `vocabulary_` mapping.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        scaled = normalize(counts, axis=0)
        word_graph = np.dot(scaled.T, scaled)
        index_to_word = {
            position: term
            for term, position in self.cnt_vec.vocabulary_.items()
        }
        return word_graph, index_to_word



# Apply the TextRank algorithm
class Rank(object):
    """Computes TextRank scores for the nodes of a weighted graph."""

    def get_ranks(self, graph, d=0.85):
        """Solve the TextRank linear system for `graph`.

        Self-links are removed, every column that has any weight is scaled
        to sum to 1, and the system `(I - d * M) x = (1 - d) * 1` is solved
        directly with `np.linalg.solve`.

        Args:
            graph: Square 2-D array-like of edge weights. It is NOT
                modified; the computation runs on a float copy, so the
                same matrix can be ranked repeatedly and integer matrices
                are accepted.
            d: Damping factor.

        Returns:
            Dict mapping node index to its score; empty for an empty graph.
        """
        # np.array copies by default; the cast keeps the in-place true
        # division below valid when the caller passes an integer matrix.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]
        if matrix_size == 0:
            return {}

        np.fill_diagonal(A, 0.0)              # drop self-links
        link_sums = A.sum(axis=0)
        has_links = link_sums != 0            # leave all-zero columns alone
        A[:, has_links] /= link_sums[has_links]
        A *= -d
        np.fill_diagonal(A, 1.0)              # A is now I - d * M

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}


# TextRank class implementation
class TextRank(object):
    """Extractive summarizer and keyword extractor for a text or a URL.

    Attributes set by the constructor:
        sentences: Sentences of the input, in document order.
        nouns: Flat list of all nouns found in `sentences`.
        sent_graph / words_graph: Graphs built by GraphMatrix.
        idx2word: Maps a row index of `words_graph` to its word.
        sorted_sent_rank_idx / sorted_word_rank_idx: Node indices ordered
            from highest to lowest rank.
    """

    def __init__(self, text):
        """Tokenize `text` (or the page behind it, if it is an http(s) URL) and rank it."""
        self.sent_tokenize = SentenceTokenizer()

        if text.startswith(('http://', 'https://')):
            self.sentences = self.sent_tokenize.url2sentences(text)
        else:
            self.sentences = self.sent_tokenize.text2sentences(text)

        # Extract nouns one sentence at a time so that document i (and thus
        # row i of the sentence graph) corresponds to self.sentences[i].
        nouns_per_sentence = [
            self.sent_tokenize.get_nouns([sentence]) if sentence else []
            for sentence in self.sentences
        ]
        self.nouns = [noun for nouns in nouns_per_sentence for noun in nouns]
        documents = [' '.join(nouns) for nouns in nouns_per_sentence]

        self.graph_matrix = GraphMatrix()
        self.rank = Rank()

        if not self.nouns:
            # Nothing to vectorize (empty input or failed download): expose
            # empty results instead of failing inside the vectorizers.
            self.sent_graph = np.zeros((0, 0))
            self.words_graph = np.zeros((0, 0))
            self.idx2word = {}
            self.sent_rank_idx = {}
            self.word_rank_idx = {}
            self.sorted_sent_rank_idx = []
            self.sorted_word_rank_idx = []
            return

        self.sent_graph = self.graph_matrix.build_sent_graph(documents)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(documents)

        # Rank copies so the stored graphs stay intact even if the ranker
        # works in place.
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph.copy())
        self.sorted_sent_rank_idx = sorted(
            self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph.copy())
        self.sorted_word_rank_idx = sorted(
            self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)

    def summarize(self, sent_num=3):
        """Return the `sent_num` highest-ranked sentences in document order."""
        chosen = sorted(self.sorted_sent_rank_idx[:sent_num])
        return [self.sentences[idx] for idx in chosen]

    def keywords(self, word_num=10):
        """Return the `word_num` highest-ranked words, best first."""
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]


In [0]:
def run_text_rank(url, summarize_num):
    """Build a TextRank model for `url` and print its summary.

    Args:
        url: Value handed to the TextRank constructor.
        summarize_num: Number of sentences requested from `summarize`.
    """
    summary_rows = TextRank(url).summarize(summarize_num)
    print (summary_rows)

In [0]:
# Summarize the page behind this URL in 3 sentences; run_text_rank prints the result.
run_text_rank("http://www.diningcode.com/profile.php?rid=BpZCgY9DffCt", 3)


TypeError: ignored