In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Komoran
import networkx
import re
from shutil import rmtree
import os
import math

In [2]:
# Root directory of the article corpus. The historical location stays the
# default; set the TEXTRANK_BASE_DIR environment variable to run the notebook
# against another copy of the data without editing this cell.
BASE_DIR = os.environ.get("TEXTRANK_BASE_DIR", "/data/TestDir/sample_articles")

# Input trees: one sub-directory per media outlet, one text file per article.
PREPROCESSED_PATH = os.path.join(BASE_DIR, "Preprocessed-Data")
PRETTY_PATH = os.path.join(BASE_DIR, "Pretty-Data")
ORIGIN_PATH = os.path.join(BASE_DIR, "Origin-Data")

# Output tree for the generated summaries.
SUMMARY_PATH = os.path.join(BASE_DIR, "Summary-Data")

# Stop-word list file under the corpus root.
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [3]:
class RawTextReader:
    """Iterate over the text pieces of a UTF-8 encoded file.

    Each physical line of the file is split on ``rgxSplitter`` and every
    resulting piece is yielded in order. Pieces keep whatever trailing
    newline the line carried, and empty pieces are yielded as-is.

    Args:
        filepath: Path of the text file to read.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # NOTE(review): this matches the two literal characters "/" and "n",
        # not a newline. It looks like a typo for "\n", but changing it would
        # alter which pieces come out empty (and therefore how original and
        # preprocessed sentences pair up downstream) -- confirm against the
        # data files before touching it.
        self.rgxSplitter = re.compile("/n")

    def __iter__(self):
        # The with-block guarantees the handle is closed when iteration
        # finishes or the generator is discarded, instead of relying on
        # garbage collection.
        with open(self.filepath, encoding='utf-8') as source:
            for line in source:
                yield from self.rgxSplitter.split(line)


class Document:
    """Holds two parallel sentence lists: the original and the processed text.

    Falsy entries (empty strings, ``None``) are dropped from each iterable
    independently while the lists are materialised.
    """

    def __init__(self, originSentenceIter, procSentenceIter):
        self.originSents = [sentence for sentence in originSentenceIter if sentence]
        self.procSents = [sentence for sentence in procSentenceIter if sentence]

    def getOriginSet(self):
        """Return the list of retained original sentences."""
        return self.originSents

    def getSentsZip(self):
        """Return an iterator of ``(original, processed)`` pairs, matched by position."""
        paired = zip(self.originSents, self.procSents)
        return paired


class TextRank:
    """Rank the sentences of a document with PageRank over a TF-IDF similarity graph.

    Typical use is ``loadSents()`` -> ``build()`` -> ``rank()`` / ``summarize()``.

    Keyword Args:
        coef: Blend factor for edge weights (default ``1.0``). An edge with
            similarity ``n`` gets weight ``n * coef + (1 - coef)``.
        threshold: Minimum similarity for two sentences to be connected
            (default ``0.005``).

    Attributes:
        graph: ``networkx.Graph`` created by ``build()``; ``None`` before that.
        dictCount: Maps sentence index -> original sentence text.
        dictBiCount: Maps ``(i, j)`` with ``i < j`` -> similarity of sentences i and j.
        tfidf_matrix: Result of the most recent TF-IDF fit (``{}`` until one is made).
    """

    def __init__(self, **kargs):
        self.graph = None
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_matrix = {}

    def loadSents(self, document, tokenizer):
        """Register the usable sentences of ``document`` and their pairwise similarities.

        A sentence is kept only when ``tokenizer(processed_sentence)`` yields at
        least two distinct truthy tokens. TF-IDF is fitted on the original text
        of the kept sentences only, so matrix row ``i`` always corresponds to
        the sentence stored under the matching ``dictCount`` key. (Fitting on
        every sentence, as before, shifted the rows as soon as one sentence was
        skipped and linked the wrong sentences.)

        Args:
            document: Object providing ``getSentsZip()``, an iterable of
                ``(original, processed)`` sentence pairs.
            tokenizer: Callable mapping a processed sentence to an iterable
                of tokens.
        """
        # Offset for this call, so that a second call cannot overwrite earlier
        # entries or index past the freshly fitted matrix.
        base = len(self.dictCount)
        keptSents = []

        for origin, proc in document.getSentsZip():
            tagged = set(filter(None, tokenizer(proc)))
            if len(tagged) < 2:
                continue
            self.dictCount[base + len(keptSents)] = origin
            keptSents.append(origin)

        if not keptSents:
            # Nothing to vectorise; leave the graph inputs untouched.
            return

        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(keptSents)

        # Row-by-row dot products of the TF-IDF vectors. `@` is a matrix
        # product for both scipy sparse matrices and sparse arrays, whereas
        # `*` is element-wise on sparse arrays.
        similarities = (self.tfidf_matrix @ self.tfidf_matrix.T).toarray()

        kept = len(keptSents)
        for i in range(kept):
            for j in range(i + 1, kept):
                similarity = similarities[i, j]
                if similarity < self.threshold:
                    continue
                self.dictBiCount[base + i, base + j] = similarity

    def build(self):
        """Create the undirected sentence graph from the loaded sentences and similarities."""
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight=n * self.coef + (1 - self.coef))

    def rank(self):
        """Return ``networkx.pagerank`` scores of the graph, keyed by sentence index."""
        return networkx.pagerank(self.graph, weight='weight')

    def summarize(self, ratio=0.333):
        """Join the top-ranked sentences, in index order, with single spaces.

        Args:
            ratio: Fraction of ranked sentences to keep; the count is
                ``int(number_of_sentences * ratio)``, which may be zero.

        Returns:
            The selected original sentences joined by ``' '``.
        """
        r = self.rank()
        ks = sorted(r, key=r.get, reverse=True)[:int(len(r) * ratio)]
        return ' '.join(map(lambda k: self.dictCount[k], sorted(ks)))

In [4]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories (like ``mkdir -p``).

    An already existing directory is not an error.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If creation fails for another reason, e.g. ``path`` exists
            but is not a directory.
    """
    # exist_ok=True tolerates an existing directory and still raises when the
    # path exists as a non-directory, matching the previous hand-written
    # errno.EEXIST / isdir check.
    os.makedirs(path, exist_ok=True)

def del_folder(path):
    """Best-effort recursive removal of the directory tree at ``path``.

    Filesystem errors (missing path, permission problems, ...) are ignored
    on purpose so callers can use this to clear an output directory that may
    not exist yet.

    Args:
        path: Directory to remove.
    """
    try:
        rmtree(path)
    except OSError:
        # Only filesystem failures are swallowed; KeyboardInterrupt,
        # SystemExit and programming errors still propagate, unlike with
        # the previous bare ``except``.
        pass

In [8]:
def saveTextFile(baseDir, media, filename, sentence):
    """Write ``sentence`` to ``<baseDir>/<media>/<filename>``, creating the directory if needed.

    The file is written as UTF-8, the same encoding ``RawTextReader`` uses to
    read the articles, so the output does not depend on the platform's default
    encoding. An existing file is overwritten.

    Args:
        baseDir: Root output directory.
        media: Sub-directory name under ``baseDir``.
        filename: Name of the file to write.
        sentence: Text to store.
    """
    target_dir = os.path.join(baseDir, media)
    mkdir_p(target_dir)
    save_path = os.path.join(target_dir, filename)

    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(sentence)

In [None]:
if __name__ == '__main__':
    # Batch driver: for every article under PREPROCESSED_PATH/<media>/, rank its
    # sentences with TextRank and store the single top-ranked original sentence
    # under SUMMARY_PATH/<media>/<article>.

    # Start from a clean output tree.
    del_folder(SUMMARY_PATH)
    media_list = sorted(os.listdir(PREPROCESSED_PATH))

    # Build the tagger once: it is only used through tagger.pos(), so a single
    # instance can serve every article instead of being re-created per article.
    tagger = Komoran()
    keep_tags = ('NNG', 'NNP', 'VV', 'VA')

    def keep_tagged_tokens(sent):
        # Keep only the (token, tag) pairs whose tag is in keep_tags.
        return filter(lambda x: x[1] in keep_tags, tagger.pos(sent))

    for media_idx, media_name in enumerate(media_list):

        origin_media_dir = os.path.join(PRETTY_PATH, media_name)
        proc_media_dir = os.path.join(PREPROCESSED_PATH, media_name)
        proc_article_list = sorted(os.listdir(proc_media_dir))

        print("media idx : {idx}, meida_name : {name}".format(idx=media_idx, name=media_name))
        for article_idx, article_name in enumerate(proc_article_list):

            print("article name : {name}, ({ratio}%)".format(name=article_name,
                                                     ratio=(article_idx + 1)* 100 / len(proc_article_list)))

            origin_article_path = os.path.join(origin_media_dir, article_name)
            proc_article_path = os.path.join(proc_media_dir, article_name)

            try:
                tr = TextRank()
                tr.loadSents(Document(RawTextReader(origin_article_path), RawTextReader(proc_article_path)),
                             keep_tagged_tokens)
                tr.build()
                ranks = tr.rank()

                if not ranks:
                    # No sentence survived filtering, so there is nothing to write.
                    print(article_name, "no rankable sentence, skipped")
                    continue

                # Highest-scoring sentence; on ties the first one in rank order
                # wins, the same choice a stable descending sort would make.
                best_idx = max(ranks, key=ranks.get)
                summary_line = str(tr.dictCount[best_idx])

                saveTextFile(SUMMARY_PATH, media_name, article_name, summary_line)

            except FileNotFoundError as e:
                # An article that exists on one side only (e.g. missing from
                # the pretty set) is reported and skipped.
                print(article_name, e)
            except Exception as e:
                # The old handler read e.errno on every exception, which raised
                # AttributeError for non-OSError failures and aborted the whole
                # batch. Report the failure and move on to the next article.
                print(article_name, repr(e))

media idx : 0, meida_name : AP연합뉴스
article name : 1.txt, (3.125%)
article name : 12.txt, (6.25%)
article name : 14.txt, (9.375%)
article name : 16.txt, (12.5%)
article name : 17.txt, (15.625%)
article name : 18.txt, (18.75%)
article name : 19.txt, (21.875%)
article name : 20.txt, (25.0%)
article name : 21.txt, (28.125%)
article name : 22.txt, (31.25%)
article name : 29.txt, (34.375%)
article name : 30.txt, (37.5%)
article name : 32.txt, (40.625%)
article name : 33.txt, (43.75%)
article name : 35.txt, (46.875%)
article name : 36.txt, (50.0%)
article name : 39.txt, (53.125%)
article name : 40.txt, (56.25%)
article name : 42.txt, (59.375%)
article name : 43.txt, (62.5%)
article name : 45.txt, (65.625%)
article name : 46.txt, (68.75%)
article name : 49.txt, (71.875%)
article name : 5.txt, (75.0%)
article name : 52.txt, (78.125%)
article name : 56.txt, (81.25%)
article name : 57.txt, (84.375%)
article name : 58.txt, (87.5%)
article name : 6.txt, (90.625%)
article name : 7.txt, (93.75%)
arti