In [1]:
# Install KoNLPy, which provides the Okt/Komoran taggers imported in the next cell.
# NOTE(review): the version is unpinned, and `!pip` runs whichever pip the shell
# resolves, which may not be the kernel's environment. Confirm and pin a version.
!pip install konlpy

[33mYou are using pip version 10.0.1, however version 20.3.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
# _*_ coding: utf-8 _*_

import re
import nltk
import os
from konlpy.tag import Okt
from konlpy.tag import Komoran
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from shutil import rmtree


nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def mkdir_p(path):
    """Create directory ``path`` together with any missing parents (like ``mkdir -p``).

    Args:
        path: Directory to create.

    Raises:
        OSError: If ``path`` exists but is not a directory, or if the
            directory cannot be created for any other reason.

    An already-existing directory is not an error.
    """
    # exist_ok=True gives the same contract as the former hand-written
    # "ignore EEXIST only when the path is a directory" check.
    os.makedirs(path, exist_ok=True)

def del_folder(path):
    """Recursively delete the directory tree at ``path`` on a best-effort basis.

    Args:
        path: Directory to remove. It does not have to exist.

    Removal failures (for example a missing directory) are ignored, so the
    function can be used to reset an output folder unconditionally.
    """
    # ignore_errors=True keeps the best-effort behaviour without a bare
    # ``except:``, which would also swallow KeyboardInterrupt and SystemExit.
    rmtree(path, ignore_errors=True)


In [11]:
# Root directory that contains the corpus, the generated outputs and the stop-word file.
BASE_DIR = "/data/TestDir/sample_articles"

# Input: one sub-directory per media outlet, listed by the driver cell.
ORIGIN_PATH = os.path.join(BASE_DIR, "Origin-Data")

# Output: sentences after cleaning and stop-word removal.
PREPROCESSED_PATH = os.path.join(BASE_DIR, "Preprocessed-Data")

# Output: the article sentences as read, before cleaning.
PRETTY_PATH = os.path.join(BASE_DIR, "Pretty-Data")

# Stop-word file handed to TextPreprocessor.loadSwords.
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [6]:
class TextPreprocessor:
    """Cleans sentences and reduces them to POS-filtered, stop-word-free tokens.

    Whitespace is collapsed, non-word characters are dropped, and the
    remaining text is tagged with a Komoran tagger so that only tokens with
    one of ``CONTENT_TAGS`` that are not stop words are kept.
    """

    # POS tags kept by the tokenizer and by the stop-word loader.
    # NOTE(review): presumably common/proper nouns, verbs and adjectives in
    # Komoran's tagset -- confirm against the KoNLPy tag table.
    CONTENT_TAGS = ('NNG', 'NNP', 'VV', 'VA')

    def __init__(self):
        # Raw string: "\w" inside a plain literal is an invalid escape sequence.
        self.retokenize = RegexpTokenizer(r"[\w]+")
        # (token, tag) pairs produced by tagging the stop-word file.
        self.swords = []
        self.tagger = Komoran()

    def removeDuplicateSpace(self, text):
        """Collapse every run of spaces, tabs and newlines into one space."""
        return re.sub(r'\s+', ' ', text)

    def removeSpecialChar(self, text):
        """Keep only runs of word characters, re-joined with single spaces."""
        return ' '.join(self.retokenize.tokenize(text))

    def tokenizer(self, sent):
        """Tag ``sent`` and return its (token, tag) pairs that pass the filters.

        A pair is kept when its tag is in ``CONTENT_TAGS`` and the pair is not
        in ``self.swords``. This is a regular method, so it is usable before
        ``loadSwords`` has been called (the stop-word list is then empty).
        """
        return [
            pair
            for pair in self.tagger.pos(sent)
            if pair[1] in self.CONTENT_TAGS and pair not in self.swords
        ]

    def loadSwords(self, filename):
        """Load the stop-word file and return the tagged stop words.

        Args:
            filename: UTF-8 text file with one stop word (or phrase) per line.

        Returns:
            The list stored in ``self.swords``: every (token, tag) pair the
            tagger produces for the file's lines whose tag is in
            ``CONTENT_TAGS``.
        """
        self.swords = []
        with open(filename, 'r', encoding='utf-8') as f:
            # Blank lines carry no stop word, so they never reach the tagger.
            words = [stripped for stripped in (line.strip() for line in f) if stripped]

        # Tag the words read from the file. The previous code iterated over
        # the freshly emptied self.swords, so nothing was ever loaded.
        self.swords = [
            pair
            for word in words
            for pair in self.tagger.pos(word)
            if pair[1] in self.CONTENT_TAGS
        ]
        return self.swords

    def removeSwords(self, text):
        """Return the tokens of ``text`` that survive ``tokenizer``, space-joined."""
        return ' '.join(token for token, _tag in self.tokenizer(text))

In [7]:
def saveTextFile(baseDir, media, filename, sentences):
    """Write the non-empty ``sentences`` to ``<baseDir>/<media>/<filename>``.

    Args:
        baseDir: Root output directory.
        media: Sub-directory name; created if missing.
        filename: Name of the file to (over)write.
        sentences: Iterable of strings; it is consumed once, so a generator
            is fine. Empty strings are skipped.

    The file is UTF-8 encoded with one sentence per line and no trailing
    newline after the last sentence.
    """
    target_dir = os.path.join(baseDir, media)
    mkdir_p(target_dir)
    save_path = os.path.join(target_dir, filename)

    with open(save_path, 'w', encoding='utf-8') as f:
        # Join with a real newline; the previous separator was the literal
        # two-character text '/n'. Compare by value, not identity.
        f.write('\n'.join(sentence for sentence in sentences if sentence != ''))


In [8]:
class Article:
    """Positional view over the fields of one article.

    Args:
        articleInfo: Sequence laid out as ``[title, media, sentence, ...]``.
            It must contain at least the title and the media name.
    """

    def __init__(self, articleInfo):
        self.title = articleInfo[0]
        self.media = articleInfo[1]
        # Everything after the two header fields is body text.
        self.content = articleInfo[2:]

    def readContent(self):
        """Yield the body sentences in order, skipping empty strings."""
        for line in self.content:
            # Value comparison; ``is ''`` tested object identity against a
            # literal, which is implementation-dependent.
            if line == '':
                continue
            yield line

In [22]:
class ArticleReader:
    """Iterates over one article file as ``title, media, sentence, ...``.

    The file is read as UTF-8 with the title on line 1, the whole body on
    line 2 and the media name on line 3. Iterating yields the title, then
    the media name, then the body split into sentences after personal
    information has been removed from it.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Splits after . ! ? : or - when followed by a quote (which is
        # consumed) or by anything that is not a digit, so "3.5" stays whole.
        # The capture group makes split() return the delimiters as well.
        self.rgxSplitter = re.compile('([.!?:-](?:["\']|(?![0-9])))')
        self.content = ''

    def __iter__(self):
        with open(self.filepath, encoding='utf-8') as f:
            title = self._read_field(f)
            self.content = self._read_field(f)
            media = self._read_field(f)

        yield title
        yield media

        self.del_personal_info(media)
        parts = self.rgxSplitter.split(self.content)

        if len(parts) == 1:
            # The body is a single sentence that the splitter did not split.
            yield parts[0]
            return

        # parts alternates text, delimiter, text, ..., text (odd length).
        # Stepping by two and joining each slice keeps the final text chunk
        # even though it has no delimiter after it; pairing the two slices
        # with map() used to drop it.
        for start in range(0, len(parts), 2):
            sentence = ''.join(parts[start:start + 2])
            if sentence:
                yield sentence

    @staticmethod
    def _read_field(handle):
        """Read one line and strip only its newline terminator."""
        # rstrip('\n') instead of [:-1]: a last line without a trailing
        # newline must not lose its final character.
        return handle.readline().rstrip('\n')

    def del_personal_info(self, media):
        """Strip bracketed text, the media name, reporter names and e-mails.

        Args:
            media: Media name to remove from the body; treated as literal
                text. An empty name is ignored.

        The cleaned text replaces ``self.content``.
        """
        # Remove anything inside (...) or [...], brackets included.
        text = re.sub(r'(\([^)]*\)|\[[^]]*\])', '', self.content)
        # Remove the media name. A literal replace is used because the name
        # is data, not a pattern: as a regex, an empty name matched between
        # every character and names with metacharacters raised re.error.
        if media:
            text = text.replace(media, ' ')
        # Remove reporter names: 2-5 Hangul syllables followed by '기자'.
        text = re.sub(r'[가-힣]{2,5}\s?기자', ' ', text)
        # Remove e-mail addresses.
        text = re.sub(
            r'[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*@[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*\.[a-zA-Z]{2,3}',
            ' ',
            text,
        )

        self.content = text


In [None]:
# Remove the outputs of any earlier run before regenerating them.
del_folder(PREPROCESSED_PATH)
del_folder(PRETTY_PATH)

preprocessor = TextPreprocessor()
preprocessor.loadSwords(SWORDS_PATH)

if __name__ == '__main__':

    # ORIGIN_PATH holds one directory per media outlet, each with article files.
    media_list = sorted(os.listdir(ORIGIN_PATH))

    for media_name in media_list:

        media_path = os.path.join(ORIGIN_PATH, media_name)
        article_list = sorted(os.listdir(media_path))

        for article_name in article_list:
            reader = ArticleReader(os.path.join(media_path, article_name))

            # The reader yields title, media, then sentences. Only the
            # sentences are filtered for emptiness: dropping an empty title
            # or media would shift the positional fields Article relies on.
            fields = list(reader)
            article = Article(fields[:2] + [sentence for sentence in fields[2:] if sentence])

            # Articles without body text are excluded from the training data.
            if len(article.content) == 0:
                continue

            preprocessedLine = []
            for line in article.readContent():
                cleanLine = preprocessor.removeDuplicateSpace(line)
                cleanLine = preprocessor.removeSpecialChar(cleanLine)

                rmSwordLine = preprocessor.removeSwords(cleanLine)
                # Skip sentences that have no tokens left after filtering.
                if rmSwordLine == '':
                    continue

                preprocessedLine.append(rmSwordLine)

            # Preprocessed output: filtered tokens; pretty output: the
            # sentences as produced by the reader.
            saveTextFile(PREPROCESSED_PATH, media_name, article_name, preprocessedLine)
            saveTextFile(PRETTY_PATH, media_name, article_name, article.readContent())