# 학습 데이터 전처리기

각 크롤러에서 얻은 데이터를 기반으로 

1. 텍스트 클리닝
2. mecab 품사태깅
3. Pororo 개체명 인식

의 순서로 전처리와 태깅 진행

## 패키지 설치 및 Import

전처리 및 데이터 생성기
* KoNLPy: Mecab 패키지
* langid: 언어탐지 패키지
* Pororo: PyTorch 버전과 Split_Sentence 생략을 위해 별도 수정버전 설치

In [None]:
!pip install langid jamo pandas

In [None]:
!pip install git+https://github.com/tsfo1489/pororo.git

In [None]:
import re
import json
import pickle
import langid
import pandas as pd
from jamo import j2h
from pororo import Pororo
from tqdm.notebook import tqdm

## Mecab 설치 및 사용자 사전 적용

In [None]:
!bash ./user-dic/mecab_install.sh
!bash ./user-dic/apply.sh

## 데이터 불러오기

크롤링 데이터를 각 문장별로 분할한 데이터

원본 데이터는 JSON List로 구성되어 있으며, 각 항목은 반드시 'body' 키를 가지고 있어야 함

기존 원본 데이터는 100MB를 넘어 Github에 업로드 불가

```
[
    {
        ...
        'body': '예시 문장 1'
        ...
    },
    {
        ...
        'body': '예시 문장 2'
        ...
    }
]
```

### 뉴스 데이터

In [None]:
# Load the crawled news sentences; data_id, body and keyword are read as str.
crawling_data_df = pd.read_json(
    './RawData/news_keyword.json',
    dtype={'data_id': str, 'body': str, 'keyword': str},
)
crawling_data_df

### 트위터 데이터

In [None]:
# Load the crawled Twitter sentences, then drop every row that has a
# missing value.
crawling_data_df = pd.read_json('./RawData/twitter_cleaning.json')
crawling_data_df = crawling_data_df.dropna()
crawling_data_df

### 유튜브 데이터

In [None]:
# Load the crawled YouTube sentences; data_id, body and keyword are read as str.
crawling_data_df = pd.read_json(
    './RawData/Youtube_Ko_Raw.json',
    dtype={'data_id': str, 'body': str, 'keyword': str},
)
crawling_data_df

## 텍스트 클리닝

#### 텍스트 클리너

In [None]:
class TextCleaner():
    '''
    Rule-driven text cleaner with a Korean-only language filter.

    Cleaning rules are loaded from a JSON file and applied in file order by
    ``cleaning_data``. Language detection is delegated to ``langid``.

    Attributes:
        cfg (List[Dict[str, str]]): Cleaning rule list loaded from the
            rule file
        spacing_model: Initialised to None; nothing in this class assigns
            or reads it afterwards

    Methods:
     * get_lang: get the language code of a sentence
     * cleaning_data: clean one sentence and filter it by language
    '''
    def __init__(
                 self,
                 path="./cleaning/CleaningRule.cfg",
                 verbose=False):
        '''
        Load the cleaning rules.

        Args:
            path (str, default: './cleaning/CleaningRule.cfg'):
                Path of the cleaning rule file, read as UTF-8 JSON.
                The file holds a list of rule objects with the keys
                * name (str): rule name, only used in debug output
                * type (str):
                    * "sub": replace every match of "regex" with "target"
                    * "extract": collect every match of "regex" into
                      ret_dict[target] and remove it from the sentence
                * regex (str): regular expression of the step
                * target (str): replacement text ("sub") or result key
                  ("extract")

                ex) an "extract" rule
                    {
                        "name": "Remove URL",
                        "type": "extract",
                        "regex": "https?://[^ ]+",
                        "target": "url"
                    }
            verbose (bool, default: False): Print every loaded rule to
                stdout
        '''
        with open(path, encoding='utf-8') as jsonfile:
            self.cfg = json.load(jsonfile)
        if verbose:
            for rule in self.cfg:
                print(rule)
        self.spacing_model = None

    def get_lang(self, sentence):
        '''
        Get language id from sentence

        Args:
            sentence (str): Input sentence

        Returns:
            language_id (str): First element of the langid.classify result
                ex)'ko', 'en'
        '''
        lang = langid.classify(sentence)
        return lang[0]

    def cleaning_data(self, sent: str, verbose=False):
        '''
        Clean one sentence and keep it only if both halves are Korean.

        Steps:
            1. Merge consonant + vowel jamo pairs into one syllable
               (ex: ㅇㅖㅅㅣ -> 예시)
            2. Apply every rule of self.cfg in order
            3. Split the cleaned sentence in the middle and detect the
               language of each half

        Args:
            sent (str): target sentence
            verbose (bool, default: False): Enable debug log print to stdout

        Returns:
            A 3-tuple in both cases:
                (True, sentence, ret_dict) when the sentence passes
                    sentence (str): clean sentence
                    ret_dict (Dict[str, List[str]]): extracted strings per
                        rule target, plus 'lang' with both language codes
                (False, reason, {}) when the sentence is filtered
                    reason (str): filter reason including the cleaned text
        '''
        def _merge_jamo(match):
            # Combine one consonant/vowel pair into a single syllable.
            pair = match.group(0)
            merged = j2h(pair[0], pair[1])
            if verbose:
                print(f'Merge Jamo: {pair} -> {merged}')
            return merged

        # Merge splitted ja, mo in a single pass over the sentence
        # ex) ㅇㅖㅅㅣ -> 예시
        sent = re.sub(r'[ㄱ-ㅎ][ㅏ-ㅣ]', _merge_jamo, sent)

        ret_dict = {}
        # Cleaning Step
        for idx, step in enumerate(self.cfg):
            if verbose:
                print(f'Step #{idx + 1}: {step["name"]}')
            if step['type'] == 'sub':
                # mode 'sub' replace regex to target
                sent = re.sub(step['regex'], step['target'], sent)
            elif step['type'] == 'extract':
                # mode 'extract' extract regex to ret_dict[target]
                found = re.findall(step['regex'], sent)
                ret_dict.setdefault(step['target'], []).extend(found)
                if verbose:
                    print(f'Extract: {found}')
                sent = re.sub(step['regex'], '', sent)
            if verbose:
                print('->{}'.format(sent))

        # Languages Detection
        # Split sentence and Detect language for each part
        # Only "ko", "ko" pair can pass
        pivot = len(sent) // 2
        lang1 = self.get_lang(sent[:pivot])
        lang2 = self.get_lang(sent[pivot:])
        if verbose:
            print('Language Detection')
            print(f'Part 1\'s Language: {lang1}')
            print(f'Part 2\'s Language: {lang2}')
        ret_dict['lang'] = [lang1, lang2]
        if lang1 != 'ko' or lang2 != 'ko':
            return False, f'Filter by Language: {lang1} {lang2} | {sent}', {}

        return True, sent, ret_dict

#### 전처리 및 언어 필터링

In [None]:
# Clean every crawled sentence and split the results by outcome.
cleaner = TextCleaner('./cleaning/CleaningRule.cfg')
correct_sent = []   # cleaned sentences that passed the language filter
error_sent = []     # filter reasons of the rejected sentences
spacing_flags = []  # pass/fail flag per input row
descriptions = []   # cleaned sentence or filter reason per input row
for body in tqdm(crawling_data_df['body'], total=len(crawling_data_df), desc='Cleaning'):
    flag, x, _ = cleaner.cleaning_data(body)
    spacing_flags.append(flag)
    descriptions.append(x)
    if flag:
        correct_sent.append(x)
    else:
        error_sent.append(x)
# DataFrame.iterrows() yields copies of the rows, so assigning to a row
# never reaches the frame. The per-row results are therefore collected in
# lists and written back as whole columns.
crawling_data_df['spacing'] = spacing_flags
crawling_data_df['desc'] = descriptions
print(len(correct_sent), len(error_sent))

#### 필터링된 문장을 pd.DataFrame으로 변환

In [None]:
# Replace the working frame with only the sentences that passed cleaning,
# stored in a single 'body' column.
crawling_data_df = pd.DataFrame(
    data=correct_sent,
    columns=['body'],
)
crawling_data_df

## Mecab과 Pororo를 이용해 크롤링 통합 데이터 태깅

In [None]:
class DataMaker:
    '''
    Tag a sentence with three Pororo taggers and merge the results into one
    record per non-space POS token.

    Attributes:
        pos_tagger: Pororo 'pos' tagger for 'ko'
        ner_tagger: Pororo 'ner' tagger for 'ko'
        dp_tagger: Pororo 'dep_parse' tagger for 'ko'
        NER_Transform (Dict[str, str]): maps the NER labels returned by
            ner_tagger to the short tags written to the output; labels
            mapped to 'O' are treated as "no entity"
    '''
    def __init__(self, device='cuda:0'):
        '''
        Create the three taggers.

        Args:
            device (str, default: 'cuda:0'): device string passed to every
                Pororo tagger
        '''
        self.pos_tagger = Pororo('pos', 'ko', device=device)
        self.ner_tagger = Pororo('ner', 'ko', device=device)
        self.dp_tagger = Pororo('dep_parse', 'ko', device=device)
        self.NER_Transform = {
            'PERSON': 'PER',
            'LOCATION': 'LOC',
            'CITY': 'LOC',
            'COUNTRY': 'LOC',
            'ORGANIZATION': 'ORG',
            'ARTIFACT': 'AFW',
            'DATE': 'DAT',
            'TIME': 'TIME',
            'CIVILIZATION': 'O',
            'ANIMAL': 'O',
            'PLANT': 'O',
            'QUANTITY': 'O',
            'STUDY_FIELD': 'FLD',
            'THEORY': 'O',
            'EVENT': 'EVT',
            'MATERIAL': 'O',
            'TERM': 'O',
            'OCCUPATION': 'O',
            'DISEASE': 'O',
            'O': 'O'
        }

    def parse(self, text):
        '''
        Tag one sentence and align POS, NER and dependency results.

        The POS tokens drive the loop. NER spans and dependency entries are
        consumed in step with them: an NER span is advanced once the
        surface text of the POS tokens has used up its whole string, and
        the dependency entry is advanced on every ' ' / SPACE token.

        Args:
            text (str): sentence to tag

        Returns:
            result_str (str): '#RawSentence: <text>' header line followed
                by one tab-separated line per non-space POS token
                (eojeol id, morpheme id, morpheme, POS, NER tag, dp, gr)
            json_data (dict): {'RawSentence': text, 'data': [morph, ...]}
                where every morph dict has the keys
                e_id, m_id, morph, pos, ner, dp, gr

        Raises:
            KeyError: if ner_tagger returns a label missing from
                NER_Transform
            IndexError: if the three tagger outputs cannot be aligned by
                the index arithmetic below
        '''
        result_str = f'#RawSentence: {text}\n'
        row = {'body': text}
        row['pos'] = self.pos_tagger(text, return_surface=True) # POS tagging
        row['ner'] = self.ner_tagger(text)  # NER tagging
        row['dp'] = self.dp_tagger(text)  # dependency parsing
        # print(row['pos'])
        # print(row['ner'])
        # print(row['dp'])
        ner_idx = 0  # index of the NER span currently being consumed
        ner_bio = True  # True while the next tagged token starts its span
        ner_str = row['ner'][0][0]  # not yet consumed text of that span
        e_id = 1  # 1-based eojeol id, also used to index row['dp']
        # NOTE(review): fields [2] and [3] of a dep_parse entry are presumably
        # the head id and the relation label - confirm against Pororo output
        eojeol_str = row['dp'][0][3]
        json_data = {'RawSentence': text, 'data': []}
        for idx in range(len(row['pos'])): # iterate over the smallest units, the POS tokens
            ner_tag = self.NER_Transform[row["ner"][ner_idx][1]]
            if row['pos'][idx][1] != 'SPACE':
                # eojeol ID, morpheme ID
                # idx + 2 - e_id is a 1-based running number over the
                # non-space tokens, given that e_id grows by one per space
                # token; it is not restarted for each eojeol
                result_str += f'{e_id}\t{idx + 2 - e_id}\t'
                morph = {}
                morph['e_id'] = e_id
                morph['m_id'] = idx + 2 - e_id
                # morpheme
                result_str += f'{row["pos"][idx][0]}\t'
                morph['morph'] = row["pos"][idx][0]
                # POS
                result_str += f'{row["pos"][idx][1]}\t'
                morph['pos'] = row["pos"][idx][1]
                # NER
                result_str += f'{ner_tag}'
                morph['ner'] = ner_tag
                if ner_tag != 'O':
                    # suffix style: '<TAG>-B' for the first token of a span,
                    # '<TAG>-I' for the following ones
                    result_str += '-B\t' if ner_bio else '-I\t'
                    morph['ner'] += '-B' if ner_bio else '-I'
                    if ner_bio:
                        ner_bio = False
                else:
                    result_str += '\t'
                # DP
                if idx + 1 < len(row['pos']) and row['pos'][idx + 1][1] == 'SPACE':
                    # token directly followed by a SPACE token: keep the
                    # full label
                    result_str += f'{row["dp"][e_id - 1][2]}\t'
                    morph['dp'] = row["dp"][e_id - 1][2]
                    result_str += f'{eojeol_str}\t'
                    morph['gr'] = eojeol_str
                else:
                    # any other token (including the very last one): cut
                    # the label at its first '_' if it has one
                    result_str += f'{row["dp"][e_id - 1][2]}\t'
                    morph['dp'] = row["dp"][e_id - 1][2]
                    if '_' in eojeol_str:
                        result_str += f'{eojeol_str[:eojeol_str.find("_")]}\t'
                        morph['gr'] = eojeol_str[:eojeol_str.find("_")]
                    else:
                        result_str += f'{eojeol_str}\t'
                        morph['gr'] = eojeol_str
                result_str += '\n'
                json_data['data'].append(morph)

            # Consume this token's surface length from the current NER span;
            # space tokens are consumed as well
            ner_str = ner_str[len(row['pos'][idx][0]):]
            if len(ner_str) == 0 and idx + 1 < len(row['pos']):
                # span used up and more tokens follow: move to the next span
                ner_idx += 1
                ner_str = row['ner'][ner_idx][0]
                ner_bio = True
            if row['pos'][idx][0] == ' ' and row['pos'][idx][1] == "SPACE":
                # a space token ends the eojeol: move to the next dp entry
                e_id += 1
                eojeol_str = row['dp'][e_id - 1][3]
        return result_str, json_data

# Shared tagger instance, created with the default device 'cuda:0'
DM = DataMaker()

#### Pororo 실행시 에러나는 데이터 필터링

In [None]:
import pickle

filtered = []       # (result_str, json_data) of every successfully parsed sentence
filtered_json = []  # json_data part only
error = []          # (exception, sentence) of every sentence that failed
# Parse the sentences one at a time so a failure only drops that sentence
for _, row in tqdm(crawling_data_df.iterrows(), total=len(crawling_data_df)):
    try:
        filtered_data = DM.parse(row['body'])
    except KeyboardInterrupt:
        # Manual stop: keep what has been parsed so far and still dump it
        break
    except (RuntimeError, ValueError, IndexError) as e:
        error.append((e, row['body']))
    else:
        filtered_json.append(filtered_data[1])
        filtered.append(filtered_data)

# For NER data generation
with open('./output/filtered.pkl', 'wb') as pkl_file:
    pickle.dump(filtered, pkl_file)
with open('./output/error.pkl', 'wb') as pkl_file:
    pickle.dump(error, pkl_file)
# For ABSA data generation
# The file must be opened for writing; UTF-8 is set explicitly because
# ensure_ascii=False writes the Korean text unescaped.
with open('./absa_data/aeop/aeop_list.json', 'w', encoding='utf-8') as json_file:
    json.dump(filtered_json, json_file, ensure_ascii=False)