In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Komoran
import networkx
import re
from shutil import rmtree
import os
import math

In [26]:
BASE_DIR = "/data/TestDir/sample_articles"
PREPROCESSED_PATH = os.path.join(BASE_DIR,"Preprocessed-Data")
PRETTY_PATH = os.path.join(BASE_DIR,"Pretty-Data")
ORIGIN_PATH = os.path.join(BASE_DIR,"Origin-Data")
SUMMARY_PATH = os.path.join(BASE_DIR,"Summary-Data")
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [33]:
class RawTextReader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.rgxSplitter = re.compile("/n")

    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in ch:
                yield s


class Document:
    def __init__(self, originSentenceIter, procSentenceIter):
        self.originSents = list(filter(None, originSentenceIter))
        self.procSents = list(filter(None, procSentenceIter))

    def getOriginSet(self):
        return self.originSents

    def getSentsZip(self):
        return zip(self.originSents, self.procSents)


class TextRank:
    def __init__(self, **kargs):
        self.graph = None
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_matrix = {}

    def loadSents(self, document, tokenizer):

        sentSet = []
        
        for origin, proc in document.getSentsZip():
            tagged = set(filter(None, tokenizer(proc)))
            if len(tagged) < 2: continue
            self.dictCount[len(self.dictCount)] = origin
            sentSet.append(tagged)
        
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(document.getOriginSet())

        sents_distances = (self.tfidf_matrix * self.tfidf_matrix.T)
        sents_distances_matrix = sents_distances.toarray()

        for i in range(len(self.dictCount)):
            for j in range(i + 1, len(self.dictCount)):
                similarity = sents_distances_matrix[i, j]

                if similarity < self.threshold: continue
                self.dictBiCount[i, j] = similarity

    def build(self):
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight=n * self.coef + (1 - self.coef))

    def rank(self):
        return networkx.pagerank(self.graph, weight='weight')

    def summarize(self, ratio=0.333):
        r = self.rank()
        ks = sorted(r, key=r.get, reverse=True)[:int(len(r) * ratio)]
        return ' '.join(map(lambda k: self.dictCount[k], sorted(ks)))

In [15]:
def mkdir_p(path):
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def del_folder(path):
    try:
        rmtree(path)
    except:
        pass

In [23]:
def saveTextFile(baseDir, media, filename, sentence):

    mkdir_p(os.path.join(baseDir, media))
    save_path = os.path.join(os.path.join(baseDir, media), filename)

    with open(save_path, 'w') as f:
        f.write(sentence)

In [38]:
if __name__ == '__main__':

    del_folder(SUMMARY_PATH)
    media_list = os.listdir(PREPROCESSED_PATH)

    for media_idx, media in enumerate(media_list) :

        origin_article_list = os.listdir(os.path.join(PRETTY_PATH, media))
        proc_article_list = os.listdir(os.path.join(PREPROCESSED_PATH, media))

        print(media_idx, media)
        for article_name in origin_article_list :

            origin_article_path = os.path.join(os.path.join(PRETTY_PATH, media), article_name)
            proc_article_path = os.path.join(os.path.join(PREPROCESSED_PATH, media), article_name)
            
            try :
                tr = TextRank()
                tagger = Komoran()
                
                tr.loadSents(Document(RawTextReader(origin_article_path), RawTextReader(proc_article_path)),
                         lambda sent: filter(lambda x: x[1] in ('NNG', 'NNP', 'VV', 'VA'),
                                             tagger.pos(sent)))
                tr.build()
                ranks = tr.rank()
            
                #for k in sorted(ranks, key=ranks.get, reverse=True)[:100]:
                #    print("\t".join([str(k), str(ranks[k]), str(tr.dictCount[k])]))
                #summary = tr.summarize(0.2)
            
                rank_order = sorted(ranks, key=ranks.get, reverse=True)[:100]
                summary_line = str(tr.dictCount[rank_order[0]])
            
                saveTextFile(SUMMARY_PATH, media, article_name, summary_line)
                
            except Exception:
                print(article_name)
                pass

0 MBN
1246.txt
1247.txt
1248.txt
1249.txt
1353.txt
1354.txt
1355.txt
1356.txt
1357.txt
1358.txt
1359.txt
136.txt
1360.txt
1361.txt
1362.txt
1363.txt
1364.txt
1365.txt
1366.txt
1367.txt
1368.txt
1369.txt
137.txt
1370.txt
1371.txt
1372.txt
1373.txt
1374.txt
1375.txt
1376.txt
1377.txt
1378.txt
1379.txt
138.txt
1380.txt
1381.txt
1382.txt
1383.txt
1384.txt
1385.txt
1386.txt
1387.txt
1388.txt
1389.txt
139.txt
1390.txt
1391.txt
1392.txt
1393.txt
1394.txt
1395.txt
1396.txt
1397.txt
1398.txt
1399.txt
14.txt
140.txt
1400.txt
1401.txt
1402.txt
1403.txt
1404.txt
1405.txt
1406.txt
1407.txt
1408.txt
1409.txt
141.txt
1410.txt
1411.txt
1412.txt
1413.txt
1414.txt
1415.txt
1416.txt
1417.txt
1418.txt
1419.txt
142.txt
1420.txt
1421.txt
1422.txt
1423.txt
1424.txt
1425.txt
1426.txt
1427.txt
1428.txt
1429.txt
143.txt
1430.txt
1431.txt
1432.txt
1433.txt
1434.txt
1435.txt
1436.txt
1437.txt
1438.txt
1439.txt
144.txt
1440.txt
1441.txt
1442.txt
1443.txt
1444.txt
1445.txt
1446.txt
1447.txt
1448.txt
1449.txt
145.tx