In [34]:
# _*_ coding: utf-8 _*_

import re
import nltk
import os
import csv
import pandas as pd
from glob import iglob
from konlpy.tag import Komoran
from nltk.tokenize import RegexpTokenizer

import import_ipynb
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.ArticleHandler import Article, ArticleReader


nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
BASE_DIR = "/data/ksb/TestSampleDir/articles"
SUMMARY_PATH = os.path.join(BASE_DIR,"Summary-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(BASE_DIR,"Summary-Preprocessed-Data")
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [36]:
class TextPreprocessor:
    def __init__(self):
        self.retokenize = RegexpTokenizer("[\w]+")
        self.swords = []
        self.tokenizer = {}
        self.tagger = Komoran()

    def removeDuplicateSpace(self, text):
        return re.sub('\s+', ' ', text)  # 중복 공백, 탭, 개행 제거
    
    def removeSpecialChar(self, text):
        return ' '.join(self.retokenize.tokenize(text))


    def loadSwords(self, filename):
        self.swords = []
        with open(filename, 'r', encoding='utf-8') as f:
            self.swords = f.readlines()
            self.swords = [sword.strip() for sword in self.swords]

        self.tokenizer = lambda sent: filter(lambda x:x not in self.swords, sent.split())

        return self.swords
        
    def removeSwords(self, text):
        return ' '.join([token for token in list(self.tokenizer(text))])
    
    def cleanLines(self, lines):
        result = []
        for line in lines :
            cleanLine = self.removeDuplicateSpace(line)
            cleanLine = self.removeSpecialChar(cleanLine)
                
            rmSwordLine = self.removeSwords(cleanLine)
            if rmSwordLine is '': continue
                    
            result.append(rmSwordLine)
        return result

In [37]:
class SummaryReader:
    def __init__(self, media_name, article_name):
        self.filepath = os.path.join(SUMMARY_PATH, media_name) + ".csv"
        self.media_name = media_name
        self.rgxSplitter = re.compile('([.!?:-](?:["\']|(?![0-9])))')
        self.content = ''

    def get_summary(self):
        with open(self.filepath, encoding='utf-8') as f:
            self.content = f.readline()
            self.del_personal_info(self.media_name)
            
            return self.content

    def del_personal_info(self, media):
        rmBracket = re.sub('(\([^)]*\)|\[[^]]*\])', '', self.content)  # 괄호 안 내용 제거
        rmMedia = re.sub(media, ' ', rmBracket)  # 언론사명 제거
        rmReporter = re.sub('[가-힣]{2,5}\s?기자', ' ', rmMedia) # 기자 이름 제거
        rmEmail = re.sub('[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*@[0-9a-zA-Z]([-_\.]?[0-9a-zA-Z])*\.[a-zA-Z]{2,3}', ' ', rmReporter) # 이메일 제거

        self.content = rmEmail


In [38]:
def saveCSVFile(baseDir, media, article_dist):
    save_path = os.path.join(baseDir, media) + ".csv"

    article_dist.to_csv(save_path, mode='w', header=False)

In [39]:
def get_media_name(filepath):
    filename = filepath.split(os.sep)[-1]
    return filename.split(".")[0]

In [40]:
del_folder(SUMMARY_PREPROCESSED_PATH)

In [41]:
preprocessor = TextPreprocessor()
preprocessor.loadSwords(SWORDS_PATH)

['아',
 '휴',
 '아이구',
 '아이쿠',
 '아이고',
 '어',
 '나',
 '우리',
 '저희',
 '따라',
 '의해',
 '을',
 '를',
 '에',
 '의',
 '가',
 '으로',
 '로',
 '에게',
 '뿐이다',
 '의거하여',
 '근거하여',
 '입각하여',
 '기준으로',
 '예하면',
 '예를 들면',
 '예를 들자면',
 '저',
 '소인',
 '소생',
 '저희',
 '지말고',
 '하지마',
 '하지마라',
 '다른',
 '물론',
 '또한',
 '그리고',
 '비길수 없다',
 '해서는 안된다',
 '뿐만 아니라',
 '만이 아니다',
 '만은 아니다',
 '막론하고',
 '관계없이',
 '그치지 않다',
 '그러나',
 '그런데',
 '하지만',
 '든간에',
 '논하지 않다',
 '따지지 않다',
 '설사',
 '비록',
 '더라도',
 '아니면',
 '만 못하다',
 '하는 편이 낫다',
 '불문하고',
 '향하여',
 '향해서',
 '향하다',
 '쪽으로',
 '틈타',
 '이용하여',
 '타다',
 '오르다',
 '제외하고',
 '이 외에',
 '이 밖에',
 '하여야',
 '비로소',
 '한다면 몰라도',
 '외에도',
 '이곳',
 '여기',
 '부터',
 '기점으로',
 '따라서',
 '할 생각이다',
 '하려고하다',
 '이리하여',
 '그리하여',
 '그렇게 함으로써',
 '하지만',
 '일때',
 '할때',
 '앞에서',
 '중에서',
 '보는데서',
 '으로써',
 '로써',
 '까지',
 '해야한다',
 '일것이다',
 '반드시',
 '할줄알다',
 '할수있다',
 '할수있어',
 '임에 틀림없다',
 '한다면',
 '등',
 '등등',
 '제',
 '겨우',
 '단지',
 '다만',
 '할뿐',
 '딩동',
 '댕그',
 '대해서',
 '대하여',
 '대하면',
 '훨씬',
 '얼마나',
 '얼마만큼',
 '얼마큼',
 '남짓',
 '여',
 '얼마간',
 '약간',
 '다소',
 '좀',
 '조

In [47]:
del_folder(SUMMARY_PREPROCESSED_PATH)
mkdir_p(SUMMARY_PREPROCESSED_PATH)

if __name__ == '__main__':
    
    for idx, media_path in enumerate(iglob(os.path.join(SUMMARY_PATH, '**.csv'), recursive=False)):

        media_name = get_media_name(media_path)
        preprocessed_path = os.path.join(SUMMARY_PREPROCESSED_PATH, media_name) + ".csv"
        print(media_name, preprocessed_path)
        
        f = open(media_path, 'r', newline="\n", encoding="utf-8")
        rdr = csv.reader(f)
        summary_processed_dist = pd.DataFrame(columns=['Title', 'Contents'])
        
        for [_, _, title, content] in rdr:
            article = Article(title, media_name, content.split("\t"))
            if not article.content : continue # 본문이 없는 경우를 제외함
            clean_conts = preprocessor.cleanLines(article.content)
            
            summary_processed= {'Title' : article.title, 'Contents' : '\t'.join(clean_conts)}
            summary_processed_dist = summary_processed_dist.append(summary_processed, ignore_index=True)

        saveCSVFile(SUMMARY_PREPROCESSED_PATH, media_name, summary_processed_dist)


중앙일보 /data/ksb/TestSampleDir/articles/Summary-Preprocessed-Data/중앙일보.csv
