In [17]:
# _*_ coding: utf-8 _*_
import re
import nltk
import os
import csv
import math
import pandas as pd
import numpy as np
from glob import iglob
from functools import reduce
from konlpy.tag import Komoran
from gensim.summarization.summarizer import summarize


nltk.download('punkt')

import import_ipynb
from CommonModule.Handle_Dir import mkdir_p, del_folder
from CommonModule.ArticleHandler import Article, ArticleReader
from CommonModule.TextPreprocessor import TextPreprocessor

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Root directory that holds every input and output of this notebook.
BASE_DIR = "/data/ksb/TestDir/articles"

# Data directories located directly under BASE_DIR; the names on the left
# are bound, in order, to the directory names listed on the right.
(
    ORIGIN_PATH,
    PREPROCESSED_PATH,
    SUMMARY_PATH,
    SUMMARY_PREPROCESSED_PATH,
) = (
    os.path.join(BASE_DIR, dirname)
    for dirname in (
        "Origin-Data",
        "Preprocessed-Data",
        "Summary-Data",
        "Summary-Preprocessed-Data",
    )
)

# Stop-word list file handed to the text preprocessor.
SWORDS_PATH = os.path.join(BASE_DIR, "StopWordList.txt")

In [19]:
# Reset the preprocessed-article output directory before the run.
# NOTE(review): del_folder / mkdir_p come from CommonModule.Handle_Dir and are
# not visible here; presumably the first removes the directory tree and the
# second recreates it empty -- confirm. Order matters: delete, then create.
del_folder(PREPROCESSED_PATH)
mkdir_p(PREPROCESSED_PATH)

In [20]:
# Reset the raw-summary output directory before the run.
# NOTE(review): del_folder / mkdir_p come from CommonModule.Handle_Dir and are
# not visible here; presumably the first removes the directory tree and the
# second recreates it empty -- confirm. Order matters: delete, then create.
del_folder(SUMMARY_PATH)
mkdir_p(SUMMARY_PATH)

In [21]:
# Reset the preprocessed-summary output directory before the run.
# NOTE(review): del_folder / mkdir_p come from CommonModule.Handle_Dir and are
# not visible here; presumably the first removes the directory tree and the
# second recreates it empty -- confirm. Order matters: delete, then create.
del_folder(SUMMARY_PREPROCESSED_PATH)
mkdir_p(SUMMARY_PREPROCESSED_PATH)

In [22]:
# Shared text preprocessor used by the main loop for cleaning titles,
# article bodies and summaries.
preprocessor = TextPreprocessor()
# Load the stop-word list from SWORDS_PATH. Because this call is the last
# expression of the cell, its return value is what the notebook displays.
# NOTE(review): TextPreprocessor is defined in CommonModule.TextPreprocessor;
# presumably loadSwords also stores the list on the instance -- confirm.
preprocessor.loadSwords(SWORDS_PATH)

['아',
 '휴',
 '아이구',
 '아이쿠',
 '아이고',
 '어',
 '나',
 '우리',
 '저희',
 '따라',
 '의해',
 '을',
 '를',
 '에',
 '의',
 '가',
 '으로',
 '로',
 '에게',
 '뿐이다',
 '의거하여',
 '근거하여',
 '입각하여',
 '기준으로',
 '예하면',
 '예를 들면',
 '예를 들자면',
 '저',
 '소인',
 '소생',
 '저희',
 '지말고',
 '하지마',
 '하지마라',
 '다른',
 '물론',
 '또한',
 '그리고',
 '비길수 없다',
 '해서는 안된다',
 '뿐만 아니라',
 '만이 아니다',
 '만은 아니다',
 '막론하고',
 '관계없이',
 '그치지 않다',
 '그러나',
 '그런데',
 '하지만',
 '든간에',
 '논하지 않다',
 '따지지 않다',
 '설사',
 '비록',
 '더라도',
 '아니면',
 '만 못하다',
 '하는 편이 낫다',
 '불문하고',
 '향하여',
 '향해서',
 '향하다',
 '쪽으로',
 '틈타',
 '이용하여',
 '타다',
 '오르다',
 '제외하고',
 '이 외에',
 '이 밖에',
 '하여야',
 '비로소',
 '한다면 몰라도',
 '외에도',
 '이곳',
 '여기',
 '부터',
 '기점으로',
 '따라서',
 '할 생각이다',
 '하려고하다',
 '이리하여',
 '그리하여',
 '그렇게 함으로써',
 '하지만',
 '일때',
 '할때',
 '앞에서',
 '중에서',
 '보는데서',
 '으로써',
 '로써',
 '까지',
 '해야한다',
 '일것이다',
 '반드시',
 '할줄알다',
 '할수있다',
 '할수있어',
 '임에 틀림없다',
 '한다면',
 '등',
 '등등',
 '제',
 '겨우',
 '단지',
 '다만',
 '할뿐',
 '딩동',
 '댕그',
 '대해서',
 '대하여',
 '대하면',
 '훨씬',
 '얼마나',
 '얼마만큼',
 '얼마큼',
 '남짓',
 '여',
 '얼마간',
 '약간',
 '다소',
 '좀',
 '조

In [23]:
# Upper bound on the cumulative token count used by split_by_max_token.
MAX_COUNT = 250
# Lower bound used by is_small_text: texts with fewer tokens count as small.
MIN_COUNT = 10


def get_token_count(conts):
    """Return the total number of whitespace-separated tokens in ``conts``.

    Args:
        conts: iterable of strings (one entry per line / sentence).

    Returns:
        int: sum of ``len(line.split())`` over all entries; ``0`` when
        ``conts`` is empty (the previous ``reduce``-based version raised
        ``TypeError`` on an empty list because it had no initial value).
    """
    return sum(len(line.split()) for line in conts)


In [None]:
def split_by_max_token(sents, max_count=None):
    """Cut ``sents`` to the longest prefix whose token total fits the limit.

    Tokens are counted per sentence with ``str.split()`` (whitespace
    separated). Sentences are accumulated in order; the first sentence that
    would push the running total above the limit, and everything after it,
    is dropped.

    Args:
        sents: list of sentence strings.
        max_count: maximum cumulative token count. Defaults to the
            module-level ``MAX_COUNT`` when omitted or ``None``.

    Returns:
        tuple: ``(idx, kept)`` where ``idx`` is the number of sentences kept
        and ``kept`` is ``sents[:idx]``. When nothing has to be dropped the
        original ``sents`` object is returned unchanged with ``len(sents)``.
        If the very first sentence already exceeds the limit the result is
        ``(0, [])``.
    """
    if max_count is None:
        max_count = MAX_COUNT

    # ``total`` instead of ``sum`` so the builtin is not shadowed.
    total = 0
    for idx, sent in enumerate(sents):
        length = len(sent.split())
        if total + length > max_count:
            return idx, sents[:idx]
        total += length

    return len(sents), sents


In [24]:
def saveCSVFile(baseDir, media, article_dist):
    """Write ``article_dist`` to ``<baseDir>/<media>.csv``.

    The file is opened in write mode (any existing file is overwritten) and
    no header row is emitted.

    Args:
        baseDir: directory the CSV file is written into.
        media: media name; used as the file name without extension.
        article_dist: object exposing ``to_csv`` (a pandas DataFrame in this
            notebook).
    """
    target = "{}.csv".format(os.path.join(baseDir, media))
    article_dist.to_csv(target, mode='w', header=False)

In [25]:
def get_media_name(filepath):
    """Derive the media name from a CSV file path.

    Takes the last ``os.sep``-separated component of ``filepath`` and
    returns the part before its first ``"."``; a file name containing
    several dots is therefore truncated at the first one.
    """
    basename = filepath.rsplit(os.sep, 1)[-1]
    stem, _, _ = basename.partition(".")
    return stem

In [26]:
def is_small_text(lines):
    """Tell whether ``lines`` is too short to be processed.

    Returns ``True`` when ``lines`` is empty/falsy or when its total token
    count (``get_token_count``) is below ``MIN_COUNT``; ``False`` otherwise.
    """
    # Short-circuit keeps get_token_count from being called on empty input.
    return (not lines) or get_token_count(lines) < MIN_COUNT

In [27]:
if __name__ == '__main__':

    # Column layout shared by the three per-media output tables.
    OUTPUT_COLUMNS = ['Title', 'Contents']

    # One CSV per media outlet lives directly under ORIGIN_PATH.
    for media_path in iglob(os.path.join(ORIGIN_PATH, '**.csv'), recursive=False):

        media_name = get_media_name(media_path)
        preprocessed_path = os.path.join(PREPROCESSED_PATH, media_name) + ".csv"
        print(media_name, preprocessed_path)

        # Rows are collected in plain lists and turned into DataFrames once
        # per media: DataFrame.append copied the whole frame on every call
        # and no longer exists in pandas >= 2.0.
        processed_rows = []
        summary_rows = []
        summary_proc_rows = []

        # ``with`` guarantees the handle is closed even if an unexpected
        # error escapes the loop; newline="" is what the csv module
        # documents for files passed to csv.reader.
        with open(media_path, 'r', newline="", encoding="utf-8") as f:
            for row in csv.reader(f):
                # Each record must be exactly (title, tab-joined contents);
                # skip anything else instead of aborting the whole run.
                if len(row) != 2:
                    print("Skip malformed row : {row}".format(row=row))
                    continue
                title, contents = row

                article = Article(title, media_name, contents.split("\t"))

                try:
                    contents = list(article.readContent())

                    clean_conts = preprocessor.del_personal_info(contents, media_name)
                    clean_conts = preprocessor.cleanLines(clean_conts)
                    clean_title = preprocessor.cleanLine(article.title)

                    # Truncate the cleaned text to the token limit and cut
                    # the raw text at the same sentence index.
                    split_idx, clean_conts = split_by_max_token(clean_conts)
                    split_conts = contents[:split_idx]

                    if is_small_text(clean_conts):
                        continue

                    # Article Summary
                    conts_line = " ".join(split_conts)
                    summary_lines = summarize(conts_line, ratio=0.2, split=True)

                    summary_rows.append(
                        {'Title': article.title, 'Contents': '\t'.join(summary_lines)}
                    )

                    # Article Preprocessed Summary
                    summary_proc_list = preprocessor.del_personal_info(summary_lines, media_name)
                    summary_proc_list = preprocessor.cleanLines(summary_proc_list)

                    summary_proc_rows.append(
                        {'Title': clean_title, 'Contents': '\t'.join(summary_proc_list)}
                    )

                    # Preprocessed Article
                    processed_rows.append(
                        {'Title': clean_title, 'Contents': '\t'.join(clean_conts)}
                    )

                except Exception as err:
                    # Best effort by design: report the failing article and
                    # carry on with the next one.
                    print(err)
                    print("Drop Article : {title}".format(title=article.title))

        processed_dist = pd.DataFrame(processed_rows, columns=OUTPUT_COLUMNS)
        summary_proc_dist = pd.DataFrame(summary_proc_rows, columns=OUTPUT_COLUMNS)
        summary_dist = pd.DataFrame(summary_rows, columns=OUTPUT_COLUMNS)

        saveCSVFile(PREPROCESSED_PATH, media_name, processed_dist)
        saveCSVFile(SUMMARY_PREPROCESSED_PATH, media_name, summary_proc_dist)
        saveCSVFile(SUMMARY_PATH, media_name, summary_dist)

세계일보 /data/ksb/TestDir/articles/Preprocessed-Data/세계일보.csv
KBS /data/ksb/TestDir/articles/Preprocessed-Data/KBS.csv
input must have more than one sentence
Drop Article : 2, [사사건건 플러스] 동급생 간 성폭력 피해자 숨져…사건 전말은?
input must have more than one sentence
Drop Article : 0, [여의도 사사건건] ‘중대재해’ 처벌…정치권 합종연횡?
input must have more than one sentence
Drop Article : 0, [여의도사사건건]‘공무원 피격’ 권성동 “北에 구조 요청했어야” 정청래 “남북관계 회복돼야”
input must have more than one sentence
Drop Article : 0, [여의도 사사건건] 윤석열 정직 2개월…“비위 엄중”·“비상식적”
input must have more than one sentence
Drop Article : 0, [여의도 사사건건] 21대 국회 첫 국정감사…‘공무원 피살’·‘추미애 아들’ 등 쟁점
input must have more than one sentence
Drop Article : 0, [사사건건 플러스] 사망자 유족 폭력 자제 호소 분노 잦아들까
input must have more than one sentence
Drop Article : 1, [사사건건 플러스] 사망자 유족 폭력 자제 호소 분노 잦아들까
input must have more than one sentence
Drop Article : 0, [여의도 사사건건] 국민의힘 권성동 “개천절 집회 자제해야”
input must have more than one sentence
Drop Article : 0, [여의도 사사건건] 18개 상임위원장 모두 민주당이? 그 결과는…
input must have more than 

input must have more than one sentence
Drop Article : 10, [미리보는 이데일리신문] 11년 만에 또 생사기로, 쌍용차 법정관리 신청
input must have more than one sentence
Drop Article : 1, [미리보는 이데일리 신문]극단적 정쟁에 막혀… 청년정치 설 땅이 없다
input must have more than one sentence
Drop Article : 2, [미리보는 이데일리 신문]극단적 정쟁에 막혀… 청년정치 설 땅이 없다
input must have more than one sentence
Drop Article : 3, [미리보는 이데일리 신문]극단적 정쟁에 막혀… 청년정치 설 땅이 없다
input must have more than one sentence
Drop Article : 5, [미리보는 이데일리 신문]극단적 정쟁에 막혀… 청년정치 설 땅이 없다
input must have more than one sentence
Drop Article : 6, [미리보는 이데일리 신문]극단적 정쟁에 막혀… 청년정치 설 땅이 없다
input must have more than one sentence
Drop Article : 1, [오늘의 국감일정]‘독감백신 상온노출’ 신성약품 대표 증인 출석
input must have more than one sentence
Drop Article : 1, 이데일리 "내일의 경제일정"-5월 한은 금융통화위원회
input must have more than one sentence
Drop Article : 1, [미리보는 이데일리신문]4인가족 중심 인구정책 ‘인구쇼크’ 못 막아
input must have more than one sentence
Drop Article : 2, [미리보는 이데일리신문]4인가족 중심 인구정책 ‘인구쇼크’ 못 막아
input must have more than one sentence
Drop Article

디지털타임스 /data/ksb/TestDir/articles/Preprocessed-Data/디지털타임스.csv
마이데일리 /data/ksb/TestDir/articles/Preprocessed-Data/마이데일리.csv
매경이코노미 /data/ksb/TestDir/articles/Preprocessed-Data/매경이코노미.csv
씨네21 /data/ksb/TestDir/articles/Preprocessed-Data/씨네21.csv
아시아경제 /data/ksb/TestDir/articles/Preprocessed-Data/아시아경제.csv
아이뉴스24 /data/ksb/TestDir/articles/Preprocessed-Data/아이뉴스24.csv
앳스타일 /data/ksb/TestDir/articles/Preprocessed-Data/앳스타일.csv
매일경제 /data/ksb/TestDir/articles/Preprocessed-Data/매일경제.csv
input must have more than one sentence
Drop Article : 8, 영구미제로 남은 O. J. 심슨 사건을 아시나요?
매일신문 /data/ksb/TestDir/articles/Preprocessed-Data/매일신문.csv
input must have more than one sentence
Drop Article : 4, 대구경북의 미래, 행정통합에서 길을 찾다
input must have more than one sentence
Drop Article : 2, 대구시 "2주간 유흥시설 내 집합금지 행정명령 발동"(전문)
머니S /data/ksb/TestDir/articles/Preprocessed-Data/머니S.csv
머니투데이 /data/ksb/TestDir/articles/Preprocessed-Data/머니투데이.csv
input must have more than one sentence
Drop Article : 1, 경기도, 20일부터 수원역에 불공정 행위