# ABSA 학습 데이터 생성

## 기본 설정

In [None]:
import re
import ast
import json
import random
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Locations shared by every step of the notebook: raw AEOP/pair-count JSON
# input, intermediate CSV files, and the final train/test TSV output.
dir = './absa_data/aeop'
csv_dir = './absa_data/csv'
train_dir = './absa_dataset'

# Aspect group name -> three-letter code that is appended to "ASP-" when
# training labels are produced.
tag_dict = dict([
    ("가창력", "SAB"),
    ("기타", "ETC"),
    ("뷰티", "BTY"),
    ("몸매", "FIG"),
    ("반응", "REA"),
    ("분위기", "VIB"),
    ("사진", "PIC"),
    ("사회성", "SOC"),
    ("안무", "CHR"),
    ("앨범", "ALB"),
    ("얼굴", "FAC"),
    ("연기력", "ACT"),
    ("음악", "MSC"),
    ("이벤트", "EVT"),
    ("작품성", "ART"),
    ("팀워크", "TMW"),
    ("패션", "FSH"),
    ("퍼포먼스", "PER"),
    ("포즈", "POS"),
    ("표정", "EXP"),
])

# Reverse lookup (code -> group name); entries keep the order of tag_dict.
keyword_dict = {code: group_name for group_name, code in tag_dict.items()}

## VP NP쌍 추출

In [None]:
#-*-coding:utf-8-*-
# requirement: 
def extract_aspect():
    """Extract sentiment-bearing VP/NP pairs and group them by noun phrase.

    Reads ``pair_count_vpnp.json`` and ``pair_count_npvp.json`` from ``dir``
    (each a sequence of ``[pair, count]`` entries where ``pair`` is an
    underscore-joined string) and the verb-phrase lexicons ``vp_pos.csv``,
    ``vp_neg.csv`` and ``vp_neu.csv`` from ``csv_dir`` (column ``VP``).

    For each mode (``vpnp``: VP is the first part and NP the last;
    ``npvp``: NP is the first part and VP the last) two files are written
    to ``csv_dir``:

    * ``extract_pair_{mode}.csv`` -- one row per pair whose VP is in a
      lexicon, with columns ``pair``, ``cnt`` and ``polarity``
      (1 positive, -1 negative, 0 neutral).
    * ``extract_aspect_{mode}.csv`` -- one row per NP with the list of
      pairs it occurs in (columns ``NP`` and ``pair``).

    Raises:
        ValueError: if the pair-count list of a mode is empty.  Earlier
            versions called ``exit()`` here, which stopped the whole
            process without saying why.
    """
    # (mode name, index of the VP in the split pair, index of the NP)
    mode_specs = (("vpnp", 0, -1), ("npvp", -1, 0))

    # Load both pair-count files before doing any work so that a missing
    # input is reported before anything is written.
    pair_counts = {}
    for mode, _, _ in mode_specs:
        with open(f'{dir}/pair_count_{mode}.json', 'r', encoding='utf-8') as jsonfile:
            pair_counts[mode] = json.load(jsonfile)

    # One VP -> polarity table.  The lexicons are applied in the order
    # positive, negative, neutral and the first one containing a VP wins.
    polarity_by_vp = {}
    for lexicon_name, polarity in (("vp_pos", 1), ("vp_neg", -1), ("vp_neu", 0)):
        lexicon = pd.read_csv(f'{csv_dir}/{lexicon_name}.csv', encoding='utf-8')
        for vp in lexicon['VP']:
            polarity_by_vp.setdefault(vp, polarity)

    for mode, vp_idx, np_idx in mode_specs:
        ao_list = pair_counts[mode]
        if not ao_list:
            raise ValueError(f"pair_count_{mode}.json contains no pairs")

        ret_list = []      # rows of extract_pair_{mode}.csv
        aspect_dict = {}   # NP -> pairs containing it, in input order

        for entry in tqdm(ao_list, desc=mode):
            pair, cnt = entry[0], entry[1]
            parts = pair.split("_")
            if len(parts) == 1:
                # No underscore: not a VP/NP pair.
                continue
            polarity = polarity_by_vp.get(parts[vp_idx])
            if polarity is None:
                # VP is not in any lexicon.
                continue
            ret_list.append({'pair': pair, 'cnt': cnt, 'polarity': polarity})
            aspect_dict.setdefault(parts[np_idx], []).append(pair)

        # Built once per mode, after all pairs have been collected.
        result_list = [
            {'NP': noun_phrase, 'pair': pairs}
            for noun_phrase, pairs in aspect_dict.items()
        ]
        aspect_dataframe = pd.DataFrame(result_list)
        ret_dataframe = pd.DataFrame(ret_list)
        aspect_dataframe.to_csv(f'{csv_dir}/extract_aspect_{mode}.csv', sep=',')
        ret_dataframe.to_csv(f'{csv_dir}/extract_pair_{mode}.csv', sep=',')

In [None]:
# Step 1: write extract_aspect_{vpnp,npvp}.csv and extract_pair_{vpnp,npvp}.csv into csv_dir.
extract_aspect()

## Aspect Opinion쌍 생성

In [None]:
#-*-coding:utf-8-*-
def pairing_aspect():
    """Aggregate the extracted pairs per noun phrase into ``aspect_pair.csv``.

    Reads ``extract_aspect_{vpnp,npvp}.csv`` (columns ``NP`` and ``pair``,
    the latter a stringified list) and ``extract_pair_{vpnp,npvp}.csv``
    (columns ``pair``, ``cnt``, ``polarity``) from ``csv_dir``.

    For every NP row one record is produced with the summed count of its
    pairs, the list of their polarities, the list of pairs and the order
    (``'vpnp'`` or ``'npvp'``).  Records whose total count is not positive
    are dropped.  The records are sorted by count, largest first, and
    written to ``{csv_dir}/aspect_pair.csv``.

    Rows whose ``pair`` cell cannot be parsed are reported with ``print``
    and skipped.  Pairs that have no count entry are skipped individually
    in both orders (the npvp pass used to abandon the rest of the row at
    the first such pair).
    """
    def load(name):
        return pd.read_csv(f'{csv_dir}/{name}.csv', encoding='utf-8')

    aspect_vpnp = load('extract_aspect_vpnp')
    aspect_npvp = load('extract_aspect_npvp')
    pair_vpnp = load('extract_pair_vpnp')
    pair_npvp = load('extract_pair_npvp')

    # pair -> {"cnt", "polarity"}.  npvp is read first and the first
    # occurrence of a pair wins.  Iterating the columns yields plain
    # Python scalars, so the polarity lists serialize as e.g. "[1, -1]".
    pair_dict = {}
    for dep_pair in (pair_npvp, pair_vpnp):
        for pair, cnt, polarity in zip(dep_pair['pair'], dep_pair['cnt'], dep_pair['polarity']):
            if pair not in pair_dict:
                pair_dict[pair] = {"cnt": cnt, "polarity": polarity}

    def collect(aspects, order, desc, skip_leading_eops):
        """Build the records for one aspect table."""
        records = []
        rows = zip(aspects['NP'], aspects['pair'])
        for noun_phrase, raw_pairs in tqdm(rows, total=len(aspects), desc=desc):
            try:
                pair_list = ast.literal_eval(raw_pairs)
            except (ValueError, TypeError, SyntaxError) as e:
                # Unparseable cell (e.g. an empty value): report and skip.
                print(e)
                continue
            if not isinstance(pair_list, (list, tuple)):
                print(f"unexpected pair value for {noun_phrase!r}: {pair_list!r}")
                continue

            record = {
                'NP': noun_phrase,
                'count': 0,
                'polarity': [],
                'pair': [],
                'order': order,
            }
            for pair in pair_list:
                if not isinstance(pair, str):
                    continue
                # In vpnp order the pair starts with the VP; pairs led by
                # '없' (as in "없는") are excluded.
                if skip_leading_eops and pair.startswith('없'):
                    continue
                info = pair_dict.get(pair)
                if info is None:
                    continue
                record['count'] += info['cnt']
                record['polarity'].append(info['polarity'])
                record['pair'].append(pair)
            if record['count'] > 0:
                records.append(record)
        return records

    result_list = collect(aspect_vpnp, 'vpnp', 'vp_np pairing', True)
    result_list += collect(aspect_npvp, 'npvp', 'np_vp pairing', False)

    result_sorted_list = sorted(result_list, key=lambda record: record['count'], reverse=True)
    result_dataframe = pd.DataFrame(result_sorted_list)
    result_dataframe.to_csv(f'{csv_dir}/aspect_pair.csv', sep=',')
    

In [None]:
# Step 2: merge the step-1 CSV files into csv_dir/aspect_pair.csv.
pairing_aspect()

## Aspect 및 Opinion 태깅

In [None]:
def match_tag_aspect():
    """Attach tag/group information to the aspect pairs and split them.

    Reads ``aspect_pair.csv`` and ``aspect_group.csv`` from ``csv_dir``.
    Rows of ``aspect_pair`` whose NP appears in ``aspect_group`` with tag
    ``'o'`` are written to ``match_tag_aspect.csv`` together with that tag
    and group; all other rows go to ``except_tag_aspect.csv`` (kept for
    inspection).
    """
    aspect_pair = pd.read_csv(f'{csv_dir}/aspect_pair.csv', encoding = 'utf-8')
    pair_tag = pd.read_csv(f'{csv_dir}/aspect_group.csv', encoding = 'utf-8')

    # NP -> [NP, pair, tag, group], only for rows tagged 'o'.
    tagged_by_np = {}
    for row in range(len(pair_tag['NP'])):
        if pair_tag['tag'][row] != 'o':
            continue
        noun = pair_tag['NP'][row]
        tagged_by_np[noun] = [
            noun,
            pair_tag['pair'][row],
            pair_tag['tag'][row],
            pair_tag['group'][row],
        ]

    matched_rows = []
    unmatched_rows = []
    for row in tqdm(range(len(aspect_pair['NP'])), desc='match tag aspect'):
        noun = aspect_pair['NP'][row]
        if noun in tagged_by_np:
            # Rows that feed the training data: aspect, group, polarity, pairs.
            tag_info = tagged_by_np[noun]
            matched_rows.append({
                'NP': noun,
                'tag': tag_info[2],
                'group': tag_info[3],
                'count': aspect_pair['count'][row],
                'polarity': aspect_pair['polarity'][row],
                'pair': aspect_pair['pair'][row],
            })
        else:
            # Kept only so the untagged NPs can be reviewed.
            unmatched_rows.append({
                'NP': noun,
                'count': aspect_pair['count'][row],
                'polarity': aspect_pair['polarity'][row],
                'pair': aspect_pair['pair'][row],
            })

    pd.DataFrame(matched_rows).to_csv(f'{csv_dir}/match_tag_aspect.csv',sep=',')
    pd.DataFrame(unmatched_rows).to_csv(f'{csv_dir}/except_tag_aspect.csv',sep=',')

In [None]:
# Step 3: write match_tag_aspect.csv and except_tag_aspect.csv into csv_dir.
match_tag_aspect()

## ABSA 학습 데이터 생성

In [None]:
# RawSentence랑 aeop 태깅된 데이터 불러오기
class bert_train():
    """Build the ABSA train/test TSV files from the AEOP-tagged sentences.

    Instantiating the class runs the whole step: it loads
    ``{dir}/aeop_list.json``, tags every sentence, filters and shuffles the
    resulting lines and writes ``train.tsv`` / ``test.tsv`` into
    ``train_dir``.
    """

    def __init__(self):
        print('Load AEOP List')
        self.dp_aeop = pd.read_json(f'{dir}/aeop_list.json', encoding = 'utf-8')
        print('Load AEOP List')
        train_list = self.make_train()
        aeop_tag_train_list, other_train_list = self.arrange_train_list(train_list)
        # per=0: no untagged ("other") lines are mixed into the dataset.
        train_list = self.merge_train_data(aeop_tag_train_list, other_train_list, 0)
        total_size = len(train_list)
        print("Total Dataset Size:", total_size)
        # First 90% of the shuffled lines are the training split.
        train_size = total_size * 9 // 10
        print("Train Dataset Size:", train_size)
        print("Test Dataset Size:", total_size - train_size)
        self.write_tsv(train_list[:train_size], "train")
        self.write_tsv(train_list[train_size:], "test")

    def make_group_dict(self):
        """Return ``{NP: {"tag": code, "pair": [pairs...]}}``.

        Built from ``{csv_dir}/match_tag_aspect.csv``.  Only rows whose
        ``group`` list has exactly one entry are used; that entry is
        translated to its code through ``tag_dict``.  When an NP occurs in
        several rows the pair lists are concatenated and the tag of the
        first row is kept.
        """
        match_tag = pd.read_csv(f'{csv_dir}/match_tag_aspect.csv', encoding = 'utf-8')
        group_dict = dict()
        rows = zip(match_tag['NP'], match_tag['group'], match_tag['pair'])
        for noun, raw_group, raw_pair in tqdm(rows, total=len(match_tag), desc='make group dict'):
            group_list = ast.literal_eval(raw_group)
            pair_list = ast.literal_eval(raw_pair)
            if len(group_list) != 1:
                continue
            if noun in group_dict:
                group_dict[noun]['pair'] += pair_list
            else:
                group_dict[noun] = {"tag": tag_dict[group_list[0]],
                                    "pair": pair_list}
        return group_dict

    def write_tsv(self, line_list, filename):
        """Write the lines unchanged to ``{train_dir}/{filename}.tsv``."""
        with open(f'{train_dir}/{filename}.tsv', 'w', encoding='utf-8') as file:
            file.writelines(line_list)

    def compare_init_target(self, target, morph):
        """Return True when both strings have the same number of
        space-separated words and all words except the last are equal."""
        target_split = target.split(" ")
        morph_split = morph.split(" ")
        if len(morph_split) != len(target_split):
            return False
        return morph_split[:-1] == target_split[:-1]

    def arrange_train_list(self, train_list):
        """Filter the generated lines and split them by tag content.

        Lines are processed in case-insensitive sorted order.  Dropped are:
        lines whose first word starts with an ASCII letter or digit
        (counted as non-Korean), lines identical to one already kept, and
        lines that have no tab-separated tag field or whose word and tag
        counts differ.

        Returns:
            ``(tagged, other)`` -- lines with at least one non-``'O'`` tag
            and lines whose tags are all ``'O'``.
        """
        d_count = 0   # duplicates
        nk_count = 0  # non-Korean lines
        t_count = 0   # malformed tagging
        nTrain = len(train_list)
        english = re.compile('[a-zA-Z0-9]+')
        # Sorting by lower-case does not put identical lines next to each
        # other, so kept lines are remembered in a set instead of being
        # compared with the previous line only.
        kept_lines = set()
        new_train_list = list()
        new_other_list = list()
        for line in tqdm(sorted(train_list, key=str.lower), desc='arrange train list'):
            if english.match(line.split(" ")[0]):
                nk_count += 1
                continue
            if line in kept_lines:
                d_count += 1
                continue

            fields = line.split('\n')[0].split('\t')
            if len(fields) < 2:
                t_count += 1
                continue
            words_list = fields[0].split(" ")
            tags_list = fields[1].split(" ")
            if len(words_list) != len(tags_list):
                t_count += 1
                continue

            if any(tag != 'O' for tag in tags_list):
                new_train_list.append(line)
            else:
                new_other_list.append(line)
            kept_lines.add(line)
        nTrainResult = len(new_train_list)
        nOtherResult = len(new_other_list)
        print(f"한국어 아닌 데이터 {nk_count}개, 중복 데이터 {d_count}개, 태그 개수 안맞는 {t_count}개 제거")
        print(f"{nTrain}개 학습데이터 중 {nTrainResult}개 AEOP 데이터, {nOtherResult}개 other 데이터 확보")

        return new_train_list, new_other_list

    def merge_train_data(self, tag_list, other_list, per = 20):
        """Return a shuffled list of all tagged lines plus the first
        ``len(tag_list) * per // 100`` lines of ``other_list``."""
        other_size = len(tag_list) * per // 100
        train_list = tag_list + other_list[:other_size]
        random.shuffle(train_list)
        return train_list

    @staticmethod
    def _resolve_group_key(morph, group_dict):
        """Return the ``group_dict`` key for an aspect morph, or None.

        The morph itself is tried first.  If it is unknown and its last
        word is longer than two characters, the morph without its final
        character is tried (presumably to drop an attached particle).
        """
        if morph in group_dict:
            return morph
        if len(morph.split(" ")[-1]) > 2 and morph[:-1] in group_dict:
            return morph[:-1]
        return None

    @staticmethod
    def _opinion_matches(pair_list, aspect_key, opinion):
        """Return True if ``opinion`` matches the VP of any pair of the aspect.

        A pair is ``"<NP>_<VP>"`` or ``"<VP>_<NP>"``.  The opinion matches
        when it starts with the VP and has the same number of
        space-separated words.  Pairs that do not have ``aspect_key`` as
        their first or second part are skipped; previously they raised
        ``UnboundLocalError`` or reused the side chosen for the previous
        pair.
        """
        # NOTE(review): only the first two parts are inspected, so a pair
        # with more than one underscore is never matched on its last part.
        opinion_words = len(opinion.split(" "))
        for pair in pair_list:
            parts = pair.split("_")
            if len(parts) < 2:
                continue
            if parts[0] == aspect_key:
                vp = parts[1]   # npvp order
            elif parts[1] == aspect_key:
                vp = parts[0]   # vpnp order
            else:
                continue
            if opinion.startswith(vp) and len(vp.split(" ")) == opinion_words:
                return True
        return False

    def _tag_sentence(self, morph_list, group_dict):
        """Return one training line for a sentence.

        ``morph_list`` is a list of dicts with the keys ``'morph'`` and
        ``'aeop'``.  An item whose ``aeop`` starts with ``'A'`` is an
        aspect candidate and one starting with ``'O'`` an opinion
        candidate.  When an opinion matches a pair of an aspect, the
        opinion is tagged ``OPN`` and the aspect ``ASP-<code>``; everything
        else is ``O``.  The line is ``"<words>\\t<tags>\\n"`` with one tag
        per word: ``O``, or the item tag suffixed with ``-B`` for the first
        word of the item and ``-I`` for the following ones.
        """
        tags = ['O'] * len(morph_list)
        for j, aspect in enumerate(morph_list):
            morph = aspect['morph']
            aeop = aspect['aeop']
            if aeop == "" or aeop[0] != 'A':
                continue
            group_key = self._resolve_group_key(morph, group_dict)
            if group_key is None:
                continue
            group = group_dict[group_key]
            for k, candidate in enumerate(morph_list):
                candidate_aeop = candidate['aeop']
                if candidate_aeop == "" or candidate_aeop[0] != 'O':
                    continue
                if self._opinion_matches(group['pair'], group_key, candidate['morph']):
                    tags[k] = 'OPN'
                    tags[j] = 'ASP-' + group['tag']

        words_list = list()
        tags_list = list()
        for item, tag in zip(morph_list, tags):
            for word_cnt, word in enumerate(item['morph'].split(' ')):
                if tag == 'O':
                    tags_list.append(tag)
                elif word_cnt == 0:
                    tags_list.append(tag + '-B')
                else:
                    tags_list.append(tag + '-I')
                words_list.append(word)
        return " ".join(words_list) + '\t' + " ".join(tags_list) + '\n'

    def make_train(self):
        """Return one tagged training line per sentence of ``self.dp_aeop``."""
        nSentence = len(self.dp_aeop['RawSentence'])
        group_dict = self.make_group_dict()
        train_list = list()
        for i in tqdm(range(nSentence), desc='make_train'):
            train_list.append(self._tag_sentence(self.dp_aeop['data'][i], group_dict))
        return train_list

In [None]:
# Step 4: constructing the class runs the whole step and writes train.tsv / test.tsv into train_dir.
bert_train()

# KoBERT-ABSA 학습

## Train & Eval

Parameters
|Param|Desc|
|:----:|----|
|model_dir|저장된 모델 디렉토리(absa_model)|
|data_dir|데이터셋 디렉토리(absa_dataset)|
|train_batch_size|(Default: 32)|
|eval_batch_size|(Default: 64)|
|num_train_epochs|학습 Epoch(Default: 20)|
|do_train|training 모드|
|do_eval|eval 모드, training 모드와 함께 사용시 1,000 step 마다 Eval|

In [None]:
# Train and evaluate in one run; --train_batch_size=16 overrides the default of 32 listed in the parameter table.
!python main.py --data_dir=absa_dataset --model_dir=absa_model --do_train --do_eval --train_batch_size=16 --num_train_epochs=20

## Predict

Parameters
|Param|Desc|
|:----:|----|
|model_dir|저장된 모델 디렉토리(absa_model)|
|input_file|Input 파일명|
|output_file|Output 파일명|
|batch_size|(Default: 32)|

In [None]:
# NOTE(review): the dataset step in this notebook writes train.tsv/test.tsv into absa_dataset;
# confirm that test.txt exists and is the intended input. No --output_file is passed here.
!python predict.py --model_dir=absa_model --input_file ./absa_dataset/test.txt