In [14]:
# _*_ coding: utf-8 _*_

import re
import nltk
import os
from konlpy.tag import Okt
from konlpy.tag import Komoran
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from shutil import rmtree


# Fetch the NLTK 'punkt' package when the notebook starts. The recorded cell
# output shows the call is a no-op when the package is already up to date.
# NOTE(review): presumably required by the `sent_tokenize` import above, which
# is not called in the visible cells — confirm it is still needed.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories (like ``mkdir -p``).

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, including the case where
            ``path`` already exists but is not a directory.

    An already-existing directory is not an error.
    """
    # exist_ok=True is the standard-library form of "ignore EEXIST when the
    # existing entry is a directory", so no manual errno inspection is needed.
    os.makedirs(path, exist_ok=True)

def del_folder(path):
    """Recursively delete the directory tree at ``path`` on a best-effort basis.

    Args:
        path: Directory to remove.

    Filesystem failures (missing directory, permission problems, ...) are
    ignored so the caller can use this as an unconditional "start clean" step.
    Anything that is not an ``OSError`` — e.g. ``KeyboardInterrupt`` or a
    programming error — is allowed to propagate instead of being silently
    swallowed.
    """
    try:
        rmtree(path)
    except OSError:
        # Best effort: a tree that cannot be (fully) removed is not fatal here.
        pass


In [16]:
# Root directory under which every input and output path below is resolved.
BASE_DIR = "/data/TestDir/sample_articles"

# Input tree: one sub-directory per media outlet, one file per article.
SUMMARY_PATH = os.path.join(BASE_DIR, "Summary-Data")

# Output tree: written by the preprocessing run, mirroring the input layout.
SUMMARY_PREPROCESSED_PATH = os.path.join(BASE_DIR, "Summary-Preprocessed-Data")

# Stop-word list file handed to TextPreprocessor.loadSwords.
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [17]:
class TextPreprocessor:
    """Text clean-up helpers: whitespace/punctuation normalisation and stop-word removal.

    Attributes:
        retokenize: ``RegexpTokenizer`` that extracts runs of word characters.
        swords: List of ``(word, tag)`` stop-word entries loaded by ``loadSwords``.
        tokenizer: Callable ``sent -> iterable of (word, tag)`` that keeps only
            content tags and drops stop-words.
        tagger: Komoran POS tagger used for both stop-words and input text.
    """

    # POS tags retained by the tokenizer and when loading stop-words.
    # NOTE(review): presumably general/proper nouns, verbs and adjectives in
    # Komoran's tag set — confirm against the tagger documentation.
    CONTENT_TAGS = ('NNG', 'NNP', 'VV', 'VA')

    def __init__(self):
        # Raw string: "\w" in a normal string literal is an invalid escape sequence.
        self.retokenize = RegexpTokenizer(r"[\w]+")
        self.swords = []
        # Usable immediately; with no stop-words loaded it only filters by tag.
        self.tokenizer = self._tokenize
        self.tagger = Komoran()

    def _tokenize(self, sent):
        """Return the ``(word, tag)`` pairs of ``sent`` that carry a content tag and are not stop-words."""
        stop = set(self.swords)  # set membership instead of a list scan per token
        return [
            tagged
            for tagged in self.tagger.pos(sent)
            if tagged[1] in self.CONTENT_TAGS and tagged not in stop
        ]

    def removeDuplicateSpace(self, text):
        """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
        return re.sub(r'\s+', ' ', text)

    def removeSpecialChar(self, text):
        """Keep only the word-character runs of ``text``, joined by single spaces."""
        return ' '.join(self.retokenize.tokenize(text))

    def loadSwords(self, filename):
        """Load stop-words from a UTF-8 text file, one entry per line.

        Each non-blank line is POS-tagged and only the ``(word, tag)`` pairs
        whose tag is in ``CONTENT_TAGS`` are kept.

        Args:
            filename: Path of the stop-word file.

        Returns:
            The list stored in ``self.swords``.
        """
        with open(filename, 'r', encoding='utf-8') as f:
            entries = [line.strip() for line in f]

        # Iterate over the lines just read. The previous code looped over the
        # freshly emptied self.swords, so no stop-word was ever loaded.
        self.swords = [
            tagged
            for entry in entries
            if entry  # skip blank lines instead of sending them to the tagger
            for tagged in self.tagger.pos(entry)
            if tagged[1] in self.CONTENT_TAGS
        ]

        self.tokenizer = self._tokenize

        return self.swords

    def removeSwords(self, text):
        """Return the content words of ``text`` (stop-words removed), joined by single spaces."""
        return ' '.join(word for (word, pos) in self.tokenizer(text))

In [18]:
def saveTextFile(baseDir, media, filename, sentence):
    """Write ``sentence`` to ``<baseDir>/<media>/<filename>`` as UTF-8.

    The media directory is created when missing. The file is always created
    (and truncated); it is left empty when ``sentence`` is empty or ``None``.

    Args:
        baseDir: Root output directory.
        media: Sub-directory name under ``baseDir``.
        filename: Name of the file to write.
        sentence: Text to store.
    """
    target_dir = os.path.join(baseDir, media)
    mkdir_p(target_dir)
    save_path = os.path.join(target_dir, filename)

    with open(save_path, 'w', encoding='utf-8') as f:
        # Truthiness check replaces `is not ''`, which compared object identity
        # against a literal rather than testing for emptiness.
        if sentence:
            f.write(sentence)

In [19]:
class SummaryReader:
    """Reads the first line of one article file and strips attribution noise from it.

    Attributes:
        filepath: ``SUMMARY_PATH/<media_name>/<article_name>``.
        media_name: Media name; every occurrence is removed from the text.
        rgxSplitter: Compiled splitter pattern (not used by the methods below).
        content: Most recently read and cleaned text.
    """

    # Anything enclosed in (...) or [...].
    _BRACKETED = re.compile(r'(\([^)]*\)|\[[^]]*\])')
    # Reporter byline: 2-5 Hangul syllables, optional whitespace, then "기자".
    _REPORTER = re.compile(r'[가-힣]{2,5}\s?기자')
    # E-mail address.
    _EMAIL = re.compile(
        r'[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*@[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*\.[a-zA-Z]{2,3}'
    )

    def __init__(self, media_name, article_name):
        self.filepath = os.path.join(SUMMARY_PATH, media_name, article_name)
        self.media_name = media_name
        self.rgxSplitter = re.compile('([.!?:-](?:["\']|(?![0-9])))')
        self.content = ''

    def get_summary(self):
        """Read the first line of the article file, clean it, and return it.

        Only the first line is read; the trailing newline (if any) is kept.

        Returns:
            The cleaned text, also stored in ``self.content``.
        """
        with open(self.filepath, encoding='utf-8') as f:
            self.content = f.readline()

        self.del_personal_info(self.media_name)
        return self.content

    def del_personal_info(self, media):
        """Strip bracketed text, the media name, reporter bylines and e-mails from ``self.content``.

        Bracketed spans are deleted outright; the other matches are each
        replaced by a single space. The result is written back to
        ``self.content``; nothing is returned.

        Args:
            media: Media name to remove, matched as literal text.
        """
        text = self._BRACKETED.sub('', self.content)

        # Literal replacement: a media name is plain text, not a regex, so
        # metacharacters such as '.' or '(' must not be interpreted. An empty
        # name is skipped because replacing '' inserts a space between every
        # character.
        if media:
            text = text.replace(media, ' ')

        text = self._REPORTER.sub(' ', text)
        text = self._EMAIL.sub(' ', text)

        self.content = text


In [20]:
# Start from an empty output tree so results of earlier runs cannot linger.
del_folder(SUMMARY_PREPROCESSED_PATH)

preprocessor = TextPreprocessor()
preprocessor.loadSwords(SWORDS_PATH)

if __name__ == '__main__':

    # Sorted so the processing order is the same on every run.
    media_list = sorted(os.listdir(SUMMARY_PATH))

    for media_name in media_list:

        media_path = os.path.join(SUMMARY_PATH, media_name)
        # Stray files next to the media folders would make os.listdir() below
        # raise NotADirectoryError; skip anything that is not a directory.
        if not os.path.isdir(media_path):
            continue

        article_list = sorted(os.listdir(media_path))

        for article_name in article_list:

            # Only regular files can be opened as articles.
            if not os.path.isfile(os.path.join(media_path, article_name)):
                continue

            reader = SummaryReader(media_name, article_name)

            summary = reader.get_summary()

            # Normalise whitespace, drop punctuation, then remove stop-words.
            cleanLine = preprocessor.removeDuplicateSpace(summary)
            cleanLine = preprocessor.removeSpecialChar(cleanLine)

            rmSwordLine = preprocessor.removeSwords(cleanLine)

            saveTextFile(SUMMARY_PREPROCESSED_PATH, media_name, article_name, rmSwordLine)