In [2]:
!pip install nltk
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 250kB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 7.5MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 58.4MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting colorama
  Download

In [35]:
import re
import nltk
import os
from konlpy.tag import Okt
from konlpy.tag import Komoran
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from shutil import rmtree


nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Mount Google Drive at /content/gdrive; BASE_DIR and every path derived from
# it live under this mount point, so this cell must run before any file access.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [69]:
def mkdir_p(path):
    """Create directory ``path`` and any missing parents, like ``mkdir -p``.

    An already-existing directory is not an error.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. ``path`` already
            exists but is not a directory.
    """
    # exist_ok=True is the standard-library equivalent of catching
    # errno.EEXIST and re-checking os.path.isdir by hand.
    os.makedirs(path, exist_ok=True)


def del_folder(path):
    """Best-effort recursive delete of the directory tree at ``path``.

    A missing directory is silently ignored. Any other filesystem error is
    reported on stdout but not raised, so the caller keeps running.

    Args:
        path: Directory to remove.
    """
    try:
        rmtree(path)
    except FileNotFoundError:
        # Nothing to delete: this is the expected case on a first run.
        pass
    except OSError as exc:
        # Stay best-effort, but do not hide the failure: leftover files from
        # an earlier run would otherwise mix into the new output unnoticed.
        # Only OSError is caught so KeyboardInterrupt/SystemExit still propagate.
        print("del_folder: could not remove %r: %s" % (path, exc))


In [70]:

# Root folder (on the mounted Google Drive) that holds all article data.
BASE_DIR = "/content/gdrive/My Drive/Colab Notebooks/Text-preprocessing-Data/articles"
# Input tree: the driver cell lists it as ORIGIN_PATH/<media>/<article file>.
ORIGIN_PATH = os.path.join(BASE_DIR,"Origin-Data")
# Output tree for the stop-word-filtered sentences of each article.
PREPROCESSED_PATH = os.path.join(BASE_DIR,"Preprocessed-Data")
# Output tree for the sentence-split but otherwise unmodified article text.
PRETTY_PATH = os.path.join(BASE_DIR,"Pretty-Data")
# Stop-word list file passed to TextPreprocessor.loadSwords.
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")


In [71]:
class TextPreprocessor:
    """Cleans article sentences and reduces them to content morphemes.

    Attributes:
        retokenize: NLTK ``RegexpTokenizer`` that extracts runs of word
            characters (used to drop punctuation and other symbols).
        swords: List of ``(morpheme, tag)`` stop-word tuples. Empty until
            ``loadSwords`` is called.
        tagger: KoNLPy ``Komoran`` part-of-speech tagger.
        tokenizer: Callable mapping a sentence to an iterable of the
            ``(morpheme, tag)`` pairs that survive filtering.
    """

    # POS tags that are kept; everything else is discarded. In the Komoran
    # (Sejong) tag set these are general nouns, proper nouns, verbs and
    # adjectives.
    KEPT_TAGS = ('NNG', 'NNP', 'VV', 'VA')

    def __init__(self):
        self.retokenize = RegexpTokenizer(r"[\w]+")
        self.swords = []
        self.tagger = Komoran()
        # Usable right away (with an empty stop-word list), so removeSwords
        # works even if loadSwords has not been called yet.
        self.tokenizer = self._tokenize

    def _tokenize(self, sent):
        """Return the ``(morpheme, tag)`` pairs of ``sent`` that have a kept tag and are not stop words."""
        return (
            tagged
            for tagged in self.tagger.pos(sent)
            if tagged[1] in self.KEPT_TAGS and tagged not in self.swords
        )

    def cleanContent(self, content, media):
        """Normalize whitespace and strip parenthesized text and the media name.

        Args:
            content: Raw sentence text.
            media: Media (news outlet) name to remove from the text.

        Returns:
            The cleaned string.
        """
        content = re.sub(r'\s+', ' ', content)  # collapse repeated spaces, tabs and newlines
        content = re.sub(r'\([^)]*\)', '', content)  # drop parenthesized text (any content, not only numbers)
        content = content.replace(media, '')  # drop the media name

        return content

    def removeSpecialChar(self, text):
        """Return ``text`` with everything except word characters removed, tokens joined by single spaces."""
        return ' '.join(self.retokenize.tokenize(text))

    def loadSwords(self, filename):
        """Load the stop-word list, one entry per line, from a UTF-8 file.

        Each non-blank line is POS-tagged and only morphemes with a kept tag
        are stored, so that stop words are compared in the same
        ``(morpheme, tag)`` form the tokenizer produces.

        Args:
            filename: Path to the stop-word file.

        Returns:
            The list of ``(morpheme, tag)`` stop-word tuples, also stored in
            ``self.swords``.
        """
        # Explicit UTF-8, matching how ArticleReader opens the article files.
        with open(filename, 'r', encoding='utf-8') as f:
            words = [line.strip() for line in f]

        # Iterate over the words just read. The previous code looped over
        # self.swords, which had just been reset to [], so the stop-word
        # list always ended up empty.
        self.swords = [
            tagged
            for word in words if word  # skip blank lines
            for tagged in self.tagger.pos(word)
            if tagged[1] in self.KEPT_TAGS
        ]
        self.tokenizer = self._tokenize

        return self.swords

    def removeSwords(self, text):
        """Return the kept, non-stop-word morphemes of ``text`` joined by spaces."""
        return ' '.join(morpheme for morpheme, _tag in self.tokenizer(text))

In [72]:
def saveTextFile(baseDir, media, filename, sentences):
    """Write ``sentences`` to ``baseDir/media/filename``, one per line.

    The ``baseDir/media`` directory is created if needed. Empty strings are
    skipped, and the file is written as UTF-8.

    Args:
        baseDir: Root output directory.
        media: Sub-directory name (the media the article belongs to).
        filename: Name of the file to write.
        sentences: Iterable of sentence strings (a list or a generator).
    """
    media_dir = os.path.join(baseDir, media)
    mkdir_p(media_dir)
    save_path = os.path.join(media_dir, filename)

    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(save_path, 'w', encoding='utf-8') as f:
        # Join with a real newline ('\n'); the previous '/n' wrote the two
        # literal characters "/n" between sentences. Compare by value
        # (!=) rather than by identity (is not).
        f.write('\n'.join(sentence for sentence in sentences if sentence != ''))


In [73]:
class Article:
    """One article: a title, a media name and its content sentences.

    Args:
        articleInfo: Sequence laid out as ``[title, media, sentence, ...]``.

    Raises:
        IndexError: If ``articleInfo`` has fewer than two elements.
    """

    def __init__(self, articleInfo):
        self.title = articleInfo[0]
        self.media = articleInfo[1]
        self.content = articleInfo[2:]

    def readContent(self):
        """Yield the content sentences in order, skipping empty strings."""
        for line in self.content:
            # Compare by value: "is ''" tests object identity, which only
            # works by accident of string interning.
            if line == '':
                continue
            yield line

In [74]:
class ArticleReader:
    """Iterates over one article file as ``title, media, sentence, ...``.

    The file is read as UTF-8 and is expected to hold the title on line 1,
    the whole article body on line 2 and the media name on line 3.

    Args:
        filepath: Path to the article file.
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Sentence terminator: one of . ! ? : followed either by a closing
        # quote (kept with the terminator) or by anything that is not a
        # digit, so decimals such as "3.14" are not split.
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        # Read the three header lines up front so the file is closed before
        # anything is yielded. rstrip('\n') removes only the line terminator;
        # slicing with [:-1] would cut a real character from a final line
        # that has no trailing newline.
        with open(self.filepath, encoding='utf-8') as f:
            title = f.readline().rstrip('\n')
            content = f.readline().rstrip('\n')
            media = f.readline().rstrip('\n')

        yield title
        yield media

        # With one capturing group, split() returns
        # [text, terminator, text, terminator, ..., trailing text].
        parts = self.rgxSplitter.split(content)
        for i in range(0, len(parts) - 1, 2):
            yield parts[i] + parts[i + 1]

        # The text after the last terminator has no partner to pair with.
        # Emit it too, unless it is empty or whitespace only, so that an
        # unterminated final sentence is not lost.
        if len(parts) % 2 == 1 and parts[-1].strip():
            yield parts[-1]
                


In [75]:
# Start from empty output trees so files from an earlier run cannot linger.
del_folder(PREPROCESSED_PATH)
del_folder(PRETTY_PATH)

preprocessor = TextPreprocessor()
preprocessor.loadSwords(SWORDS_PATH)

if __name__ == '__main__':
    media_list = os.listdir(ORIGIN_PATH)

    for media in media_list:

        media_path = os.path.join(ORIGIN_PATH, media)
        # Skip stray files in ORIGIN_PATH; only directories hold articles.
        if not os.path.isdir(media_path):
            continue
        article_list = os.listdir(media_path)

        for article_name in article_list:

            reader = ArticleReader(os.path.join(media_path, article_name))
            # Keep every field the reader yields. Filtering out falsy values
            # here would drop a blank title or media name and shift the
            # remaining fields into the wrong positions; empty sentences are
            # already skipped by Article.readContent().
            article = Article(list(reader))

            preprocessedLine = []
            for line in article.readContent():
                cleanLine = preprocessor.cleanContent(line, media)
                cleanLine = preprocessor.removeSpecialChar(cleanLine)

                rmSwordLine = preprocessor.removeSwords(cleanLine)

                preprocessedLine.append(rmSwordLine)

            # Preprocessed output: morphemes only. Pretty output: the
            # sentence-split text as read from the article.
            saveTextFile(PREPROCESSED_PATH, media, article_name, preprocessedLine)
            saveTextFile(PRETTY_PATH, media, article_name, article.readContent())