# ABSA 학습 데이터 생성

## 기본 설정

In [None]:
import re
import ast
import json
import random
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Directory and dataset path settings.
# NOTE: `dir` shadows the Python builtin of the same name; it is kept because
# other cells of this notebook build their paths from it.
dir = './absa_data/aeop'
csv_dir = './absa_data/csv'
train_dir = './absa_dataset'

# Aspect category name (Korean) -> three-letter tag code.
tag_dict = {
    "가창력": "SAB",
    "기타": "ETC",
    "뷰티": "BTY",
    "몸매": "FIG",
    "반응": "REA",
    "분위기": "VIB",
    "사진": "PIC",
    "사회성": "SOC",
    "안무": "CHR",
    "앨범": "ALB",
    "얼굴": "FAC",
    "연기력": "ACT",
    "음악": "MSC",
    "이벤트": "EVT",
    "작품성": "ART",
    "팀워크": "TMW",
    "패션": "FSH",
    "퍼포먼스": "PER",
    "포즈": "POS",
    "표정": "EXP",
}

# Reverse lookup: three-letter tag code -> aspect category name.
# Derived from tag_dict so the two tables cannot drift apart; insertion order
# follows tag_dict.
keyword_dict = {code: name for name, code in tag_dict.items()}

## 어절 분할된 데이터셋에서 VP NP 묶음

In [None]:
from dp_sub_function import count_dict, make_dp_dict_num, join_morph, isAspect
def merge_morphs(dp_pd):
    """Regroup per-morpheme parse records into merged chunk records.

    Args:
        dp_pd: object indexable by ``'RawSentence'`` and ``'data'`` (the
            notebook passes a pandas DataFrame). ``dp_pd['data']`` yields one
            list per sentence; each element is a dict with the keys
            ``e_id``, ``m_id``, ``morph``, ``pos``, ``ner``, ``dp`` and ``gr``.
            The code uses ``list_dp[m_id]`` as "the record after the current
            one", so ``m_id`` is presumably the 1-based position of the
            record inside its sentence -- TODO confirm against the producer
            of ``dataset.json``.

    Returns:
        list of ``{'RawSentence': ..., 'data': [...]}`` dicts, one per
        sentence, where ``data`` holds the records built by
        ``make_dp_dict_num``.

    Side effects:
        Prints the raw sentence and a progress line whenever the running
        sentence index satisfies ``cnt % 20000 == 1``.

    Fixes relative to the previous revision:
        * The FLD branch compared against ``"FLD_B"`` although every other
          tag in this function uses the hyphenated form (``"FLD-B"``,
          ``"FLD-I"``), so the branch could never run. It now tests
          ``"FLD-B"``.
        * That branch updates ``NP_count_dict``, which was never defined and
          would have raised ``UnboundLocalError``. It is now initialised next
          to ``SBJ_count_dict``.
    """
    dp_raw = dp_pd['RawSentence']
    dp_data = dp_pd['data']
    new_dp_data = list()
    cnt=0
    for list_dp in dp_data:
        # list_dp is one sentence: the list of its per-morpheme records.
        morph_size = len(list_dp)
        # segment_count: how many upcoming records were already absorbed into
        # an emitted chunk; skip_count: how many of those have been skipped.
        segment_count = 0
        skip_count = 0
        new_list_dp = list()
        for dict_dp in list_dp:
            if segment_count > skip_count:
                # This record was already merged into a previous chunk.
                skip_count += 1
                continue
            # NOTE(review): both count dicts are re-created for every record
            # and never returned, so the counts below are effectively unused.
            SBJ_count_dict = dict()
            NP_count_dict = dict()
            new_dict_dp = dict()
            e_id = dict_dp['e_id']
            m_id = dict_dp['m_id']
            morph = dict_dp['morph']
            pos = dict_dp['pos']
            ner = dict_dp['ner']
            dp = dict_dp['dp']
            gr = dict_dp['gr']
            over_mid = False
            if m_id >= morph_size:
                # No following record exists: emit this one on its own.
                new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, [morph], [pos], [ner], dp, [gr])
                new_list_dp.append(new_dict_dp)
                continue
            if ner in ["PER-B","ORG-B","AFW-B"]:
                # Entity start: glue the following "<same prefix>I" records
                # onto it, forming one NNP word.
                ner_pre = ner[:-1]
                complex_word = morph
                # segment_count == skip_count here, so step starts at 0 and
                # list_dp[m_id+step] is the record right after this one.
                step = segment_count - skip_count
                last_e_id = e_id
                while list_dp[m_id+step]['ner'] == ner_pre+"I":
                    last_e_id = list_dp[m_id+step]['e_id']
                    complex_word = join_morph(list_dp[m_id+step-1]['e_id'], list_dp[m_id+step]['e_id'] ,complex_word, list_dp[m_id+step]['morph'])
                    segment_count += 1
                    step = segment_count - skip_count
                    if m_id+step >= morph_size:
                        # Ran off the end of the sentence while merging.
                        new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, [complex_word], ['NNP'], [ner], list_dp[m_id+step-1]['dp'], ["NP"])
                        over_mid = True
                        break
                complex_mid = [m_id]
                complex_morph = [complex_word]
                complex_pos = ['NNP']
                complex_ner = [ner]
                complex_gr = ["NP"]
                offset = segment_count -skip_count
                # Append the remaining records that share the e_id of the
                # last merged entity record, stopping at a new entity start.
                while not over_mid and list_dp[m_id+offset]['e_id'] == last_e_id:
                    if list_dp[m_id+offset]['ner'] in ["PER-B","ORG-B","AFW-B","FLD-B"]:
                        break
                    complex_ner.append(list_dp[m_id+offset]['ner'])
                    complex_mid.append(list_dp[m_id+offset]['m_id'])
                    complex_morph.append(list_dp[m_id+offset]['morph'])
                    complex_pos.append(list_dp[m_id+offset]['pos'])
                    complex_gr.append(list_dp[m_id+offset]['gr'])
                    offset += 1
                    segment_count += 1
                    if m_id+offset >= morph_size:
                        new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, complex_morph, complex_pos, complex_ner, list_dp[m_id+offset-1]['dp'], complex_gr)
                        over_mid = True
                        break
                if not over_mid:
                    new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, complex_morph, complex_pos, complex_ner, list_dp[m_id+offset-1]['dp'], complex_gr)
                new_list_dp.append(new_dict_dp)
                SBJ_count_dict = count_dict(SBJ_count_dict, complex_word)
                continue
            elif ner == "FLD-B":
                # FLD entity start: glue the following "FLD-I" records onto
                # it. Unlike the branch above, trailing records with the same
                # e_id are left for the next loop iteration.
                complex_word = morph
                step = segment_count - skip_count
                while list_dp[m_id+step]['ner'] == "FLD-I":
                    complex_word = join_morph(list_dp[m_id+step-1]['e_id'], list_dp[m_id+step]['e_id'] ,complex_word, list_dp[m_id+step]['morph'])
                    segment_count += 1
                    step = segment_count - skip_count
                    if m_id+step >= morph_size:
                        new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, [complex_word], ['NNP'], [ner], list_dp[m_id+step-1]['dp'], ["NP"])
                        over_mid = True
                        break
                if not over_mid:
                    new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, [complex_word], ['NNP'], [ner], dp, ["NP"])
                new_list_dp.append(new_dict_dp)
                if isAspect(complex_word, pos, ner):
                    NP_count_dict = count_dict(NP_count_dict, complex_word)
                continue
            # Default case: collect this record plus the following records
            # that carry the same e_id into one chunk.
            complex_mid = [m_id]
            complex_morph = [morph]
            complex_pos = [pos]
            complex_ner = [ner]
            complex_gr = [gr]
            offset = 0
            while list_dp[m_id+offset]['e_id'] == e_id:
                # A new entity start must open its own chunk.
                if list_dp[m_id+offset]['ner'] in ["PER-B","ORG-B","AFW-B","FLD-B"]:
                    break
                complex_ner.append(list_dp[m_id+offset]['ner'])
                complex_mid.append(list_dp[m_id+offset]['m_id'])
                complex_morph.append(list_dp[m_id+offset]['morph'])
                complex_pos.append(list_dp[m_id+offset]['pos'])
                complex_gr.append(list_dp[m_id+offset]['gr'])
                offset += 1
                segment_count += 1
                if m_id+offset >= morph_size:
                    new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, complex_morph, complex_pos, complex_ner, list_dp[m_id+offset-1]['dp'], complex_gr)
                    over_mid = True
                    break
            if not over_mid:
                # The chunk takes the dp value of its last collected record.
                new_dict_dp = make_dp_dict_num(e_id, m_id-skip_count, complex_morph, complex_pos, complex_ner, list_dp[m_id+offset-1]['dp'], complex_gr)

            new_list_dp.append(new_dict_dp)

        if cnt % 20000 == 1:
            # Periodic progress report.
            print(dp_raw[cnt])
            print(cnt,"개 문장 분석...",round(cnt*100/len(dp_data),1),"%")
        new_dp_data.append({'RawSentence':dp_raw[cnt], 'data':new_list_dp})
        cnt += 1
    return new_dp_data

# Load the per-morpheme parse results and regroup them with merge_morphs.
dp_all = pd.read_json('./output/dataset.json', encoding='utf-8')

new_dp = merge_morphs(dp_all)

# Full dataset (200K): persist the merged records as UTF-8 JSON, keeping
# non-ASCII text verbatim.
with open('./absa_data/aeop/dp_merge.json', 'w', encoding='utf-8') as merged_file:
    merged_json = json.dumps(new_dp, ensure_ascii=False)
    merged_file.write(merged_json)

## Aspect, Opinion 후보군 NP, VP 추출

In [None]:
#NP, NP_MOD, VP, VP_MOD 대상으로 어근 중심 빈도 추출
from dp_sub_function import count_dict, make_dp_dict_num, make_dp_dict, join_morph, isAspect, isOpinion, check_ner, get_NP_word, get_VP_word

def pair_count_extend(dp_data, key):
    """Merge adjacent chunks into NP/VP phrases and count candidate words.

    Args:
        dp_data: iterable of sentences; each sentence is a list of dicts with
            the keys ``e_id``, ``m_id``, ``dp``, ``morph``, ``pos``, ``ner``
            and ``gr``, where ``morph``/``pos``/``ner``/``gr`` are lists.
            ``list_dp[m_id]`` is used as "the item after the current one", so
            ``m_id`` is presumably a 1-based position -- TODO confirm.
        key: suffix used in the names of the returned count tables
            (``"SBJ_"+key``, ``"NP_"+key``, ``"VP_"+key``).

    Returns:
        ``(ret_list, new_dp_data)`` where

        * ``ret_list`` is a list of five single-key dicts, in this order:
          ``"SBJ_"+key``, ``"NP_"+key``, ``"VP_"+key``, ``"single"``,
          ``"mutual"``. Each value is a list of ``(word, count)`` pairs sorted
          by descending count. The SBJ table is always empty because nothing
          in this function adds to it.
        * ``new_dp_data`` holds, per input sentence, the list of records
          built by ``make_dp_dict``.

    Side effects:
        Prints the per-rule opinion counters and aspect counters, one line
        each.

    Fix relative to the previous revision: when three items are merged, the
    NER list was built as ``ner_list + second + ner_list + third`` (the first
    item's tags twice), unlike the matching POS and GR lists. The duplicate
    is removed.
    """
    SBJ_count_dict = dict()
    NP_count_dict = dict() # store uni morph
    VP_count_dict = dict()
    single_word_count_dict = dict()
    mutual_word_count_dict = dict()
    new_dp_data = list()
    # One counter per rule that registers an opinion / aspect candidate.
    cnt_opinion = [0,0,0,0,0,0,0]
    cnt_aspect = [0,0,0,0,0,0,0,0]
    for list_dp in dp_data:
        new_list_dp = list()
        seg_size = len(list_dp)
        # complex_count: upcoming items already merged into an emitted
        # record; skip_count: how many of those have been skipped so far.
        complex_count = 0
        skip_count = 0
        dict_cnt = 1
        for dict_dp in list_dp:
            ner_skip = False
            short_skip = False
            if complex_count > skip_count:
                skip_count += 1
                continue

            new_dict_dp = dict()
            e_id = dict_dp['e_id']
            m_id = dict_dp['m_id']
            dp = dict_dp['dp']

            morph_list = dict_dp['morph']
            pos_list = dict_dp['pos']
            ner_list = dict_dp['ner']

            ner_skip = check_ner(ner_list)
            gr_list = dict_dp['gr']
            merge_morph = "".join(morph_list)
            merge_pos = "+".join(pos_list)
            if len(morph_list) == 1:
                single_word_count_dict = count_dict(single_word_count_dict, merge_morph)
            if 'NP' in gr_list:
                single_np, temp = get_NP_word(merge_morph, gr_list, pos_list)
                if single_np != merge_morph:
                    single_word_count_dict = count_dict(single_word_count_dict, single_np)
            elif 'VP' in gr_list:
                single_vp, temp = get_VP_word(merge_morph,pos_list, gr_list)
                if single_vp != merge_morph:
                    single_word_count_dict = count_dict(single_word_count_dict, single_vp)

            if m_id >= seg_size or ner_skip:
                # Last item of the sentence, or check_ner flagged it: emit
                # unchanged and do not attempt any merge.
                new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, pos_list[0], ner_list[0], dp, gr_list[0])
                dict_cnt += 1
                new_list_dp.append(new_dict_dp)
                continue
            # (E) item whose last gr is NP_AJT, followed by a VP it depends on
            if (gr_list[-1] == 'NP_AJT') and (pos_list[0][0] != "J") and ("+EC" not in pos_list[-1]):
                complex_word = merge_morph
                complex_pos = merge_pos
                ner_skip = check_ner(list_dp[m_id]['ner'])
                first_seg, isSplit = get_NP_word(morph_list, gr_list, pos_list, exclude_gr=["NP_SBJ", "NP_OBJ", "NP_AJT", "NP_CNJ", "NP_MOD"])
                if len(first_seg)<2:
                    short_skip = True
                if isAspect(first_seg, pos_list[0],ner_list[0]):
                    NP_count_dict = count_dict(NP_count_dict, first_seg)
                    cnt_aspect[0] += 1
                first_seg, isSplit = get_NP_word(morph_list, gr_list)
                if ner_skip or short_skip :
                    new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                    dict_cnt += 1
                    new_list_dp.append(new_dict_dp)
                    continue
                elif dp == list_dp[m_id]['e_id'] and 'VP' in list_dp[m_id]['gr']:
                    if not isSplit :
                        second_seg, isSplit = get_NP_word(list_dp[m_id]['morph'], list_dp[m_id]['gr'], list_dp[m_id]['pos'], exclude_gr=["NP_SBJ", "NP_OBJ", "NP_AJT", "NP_CNJ", "NP_MOD"])
                        complex_word = join_morph(e_id, list_dp[m_id]['e_id'], first_seg, second_seg)
                        complex_count += 1
                        if isOpinion(first_seg, pos_list[0]) and isOpinion(second_seg, pos_list[0]):
                            VP_count_dict = count_dict(VP_count_dict, complex_word)
                            cnt_opinion[0] += 1
                        complex_word = join_morph(e_id, list_dp[m_id]['e_id'], merge_morph, "".join(list_dp[m_id]['morph']))
                        new_dict_dp = make_dp_dict([e_id, list_dp[m_id]['e_id']], dict_cnt, complex_word, complex_pos, ner_list[0], list_dp[m_id]['dp'], "VP")
                        dict_cnt += 1
                        new_list_dp.append(new_dict_dp)
                    else:
                        new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, ner_list[0], dp, "VP")
                        dict_cnt += 1
                        new_list_dp.append(new_dict_dp)
                else:
                    new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, ner_list[0], dp, "VP")
                    dict_cnt += 1
                    new_list_dp.append(new_dict_dp)
                mutual_word_count_dict = count_dict(mutual_word_count_dict, merge_morph+"+"+"".join(list_dp[m_id]['morph']))

            # (B) consecutive noun phrases linked as dependent -> head
            elif gr_list[0] in ['NP', 'NP_MOD'] and "J" not in pos_list[0]  and ("+EC" not in pos_list[-1]):
                # If the current item depends on the next one and the next
                # item's gr contains NP, combinations up to (1)+(2)+(3) are
                # possible.
                complex_word = ""
                additional_word = ""
                first_seg = ""
                second_seg = ""
                third_seg = ""
                ## first segment
                first_seg, isSplit = get_NP_word(morph_list, gr_list, pos_list, exclude_gr=["NP_SBJ", "NP_OBJ", "NP_AJT", "NP_CNJ", "NP_MOD"])
                if len(first_seg) < 2:
                    short_skip = True
                if isAspect(first_seg, pos_list[0], ner_list[0]):
                    NP_count_dict = count_dict(NP_count_dict, first_seg) #(1)
                    cnt_aspect[1] += 1
                first_seg, istempSplit = get_NP_word(morph_list, gr_list)
                # When isSplit is set the segment cannot be joined with the
                # next one.
                if not isSplit :
                    ## second segment
                    ner_skip = check_ner(list_dp[m_id]['ner'])
                    if ner_skip or short_skip:
                        new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                        dict_cnt +=  1
                        new_list_dp.append(new_dict_dp)
                        continue
                    elif (dp == list_dp[m_id]['e_id']) and ('NP' in list_dp[m_id]['gr']):
                        second_seg, isSplit = get_NP_word(list_dp[m_id]['morph'], list_dp[m_id]['gr'], list_dp[m_id]['pos'], exclude_gr=["NP_SBJ", "NP_OBJ", "NP_AJT", "NP_CNJ", "NP_MOD"])
                        if len(second_seg) < 2:
                            short_skip = True
                        if isAspect(second_seg, list_dp[m_id]['pos'][0], list_dp[m_id]['ner'][0]):
                            NP_count_dict = count_dict(NP_count_dict, second_seg) #(2)
                            cnt_aspect[2] += 1
                        additional_word = join_morph(e_id, list_dp[m_id]['e_id'], first_seg, second_seg)
                        if second_seg != "":
                            mutual_word_count_dict = count_dict(mutual_word_count_dict, first_seg +"+"+second_seg)
                        if isAspect(additional_word, list_dp[m_id]['pos'][0], list_dp[m_id]['ner'][0]) and not short_skip:
                            NP_count_dict = count_dict(NP_count_dict, additional_word) #(1)+(2)
                            cnt_aspect[3] += 1
                        complex_count += 1
                        if not isSplit and m_id + 1 < seg_size:
                            ## third item
                            ner_skip = check_ner(list_dp[m_id+1]['ner'])
                            if ner_skip or short_skip:
                                concat_morph = join_morph(e_id, list_dp[m_id]["e_id"], merge_morph, "".join(list_dp[m_id]["morph"]))
                                concat_pos = pos_list + list_dp[m_id]["pos"]
                                concat_ner = ner_list + list_dp[m_id]["ner"]
                                concat_gr = gr_list + list_dp[m_id]["gr"]
                                new_dict_dp = make_dp_dict([e_id, list_dp[m_id]["e_id"]], dict_cnt, concat_morph, "+".join(concat_pos), "+".join(concat_ner), list_dp[m_id]['dp'], "+".join(concat_gr))
                                dict_cnt += 1
                                new_list_dp.append(new_dict_dp)
                                continue
                            elif ('NP' in list_dp[m_id+1]['gr']
                            and list_dp[m_id]['dp'] == list_dp[m_id+1]['e_id'] ):
                                third_seg, isSplit = get_NP_word(list_dp[m_id+1]['morph'], list_dp[m_id+1]['gr'], list_dp[m_id+1]['pos'], exclude_gr=["NP_SBJ","NP_OBJ", "NP_AJT", "NP_CNJ","NP_MOD"])
                                if len(third_seg) < 2:
                                    short_skip = True
                                if not isSplit :
                                    if isAspect(third_seg, list_dp[m_id+1]['pos'][0], list_dp[m_id+1]['ner'][0]):
                                        NP_count_dict = count_dict(NP_count_dict, third_seg) #(3)
                                        cnt_aspect[4] += 1
                                additional_word = join_morph(list_dp[m_id]['e_id'], list_dp[m_id+1]['e_id'], second_seg, third_seg)

                                complex_word = join_morph(e_id, list_dp[m_id]['e_id'], first_seg, additional_word)
                                if second_seg != "" and third_seg != "" :
                                    mutual_word_count_dict = count_dict(mutual_word_count_dict, second_seg +"+"+third_seg)
                                    mutual_word_count_dict = count_dict(mutual_word_count_dict, first_seg +"+"+second_seg +"+"+third_seg)
                                complex_count += 1
                                if isAspect(additional_word, list_dp[m_id]['pos'][0], list_dp[m_id]['ner'][0]) and not short_skip:
                                    NP_count_dict = count_dict(NP_count_dict, additional_word) #(2)+(3)
                                    cnt_aspect[5] += 1
                                if isAspect(complex_word, pos_list[0], ner_list[0]) and not short_skip :
                                    NP_count_dict = count_dict(NP_count_dict, complex_word) #(1)+(2)+(3)
                                    cnt_aspect[6] += 1
                                # Merge all three items into one record.
                                concat_morph = join_morph(e_id, list_dp[m_id]["e_id"], merge_morph, "".join(list_dp[m_id]["morph"]))
                                concat_morph = join_morph(list_dp[m_id]["e_id"], list_dp[m_id+1]["e_id"], concat_morph, "".join(list_dp[m_id+1]["morph"]))
                                concat_pos = pos_list + list_dp[m_id]["pos"] + list_dp[m_id+1]["pos"]
                                # Fixed: ner_list used to be concatenated a
                                # second time before the third item's tags.
                                concat_ner = ner_list + list_dp[m_id]["ner"] + list_dp[m_id+1]["ner"]
                                concat_gr = gr_list + list_dp[m_id]["gr"] + list_dp[m_id+1]["gr"]

                                new_dict_dp = make_dp_dict([e_id, list_dp[m_id]["e_id"], list_dp[m_id+1]["e_id"]], dict_cnt, concat_morph, "+".join(concat_pos), "+".join(concat_ner), list_dp[m_id+1]['dp'], "+".join(concat_gr))
                                dict_cnt += 1
                                new_list_dp.append(new_dict_dp)
                            else:
                                # Merge only two items. When the second item
                                # ends in NP_SBJ/NP_OBJ/NP_MOD its last
                                # morpheme is left out of the merged surface
                                # form.
                                if list_dp[m_id]["gr"][-1] in ["NP_SBJ","NP_OBJ", "NP_MOD"]:
                                    concat_morph = join_morph(e_id, list_dp[m_id]["e_id"], merge_morph, "".join(list_dp[m_id]["morph"][:-1]))
                                else:
                                    concat_morph = join_morph(e_id, list_dp[m_id]["e_id"], merge_morph, "".join(list_dp[m_id]["morph"]))
                                concat_pos = pos_list + list_dp[m_id]["pos"]
                                concat_ner = ner_list + list_dp[m_id]["ner"]
                                concat_gr = gr_list + list_dp[m_id]["gr"]
                                new_dict_dp = make_dp_dict([e_id, list_dp[m_id]["e_id"]], dict_cnt, concat_morph, "+".join(concat_pos), "+".join(concat_ner), list_dp[m_id]['dp'], "+".join(concat_gr))
                                dict_cnt += 1
                                new_list_dp.append(new_dict_dp)
                        else:
                            concat_morph = join_morph(e_id, list_dp[m_id]["e_id"], merge_morph, "".join(list_dp[m_id]["morph"]))
                            concat_pos = pos_list + list_dp[m_id]["pos"]
                            concat_ner = ner_list + list_dp[m_id]["ner"]
                            concat_gr = gr_list + list_dp[m_id]["gr"]
                            new_dict_dp = make_dp_dict([e_id, list_dp[m_id]["e_id"]], dict_cnt, concat_morph, "+".join(concat_pos), "+".join(concat_ner), dp, "+".join(concat_gr))
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                    else:
                        new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                        dict_cnt += 1
                        new_list_dp.append(new_dict_dp)
                else:
                    new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                    dict_cnt += 1
                    new_list_dp.append(new_dict_dp)
            # (C) compound verb phrase and (D) adverbial phrase + predicate phrase
            elif 'VP' in gr_list and "J" != pos_list[0][0]:
                first_seg, isSplit = get_VP_word(morph_list, pos_list, gr_list)
                if isOpinion(first_seg, pos_list[0]):
                    VP_count_dict = count_dict(VP_count_dict, first_seg)
                    cnt_opinion[1] += 1
                complex_word = merge_morph
                complex_pos = merge_pos
                if dp == list_dp[m_id]['e_id'] and 'VP' in list_dp[m_id]['gr']:
                    if not isSplit:
                        ner_skip = check_ner(list_dp[m_id]['ner'])
                        if ner_skip :
                            new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                            continue
                        elif dp == list_dp[m_id]['e_id'] and 'VP' in list_dp[m_id]['gr']:
                            complex_count += 1
                            second_seg, isSplit = get_VP_word(list_dp[m_id]['morph'], list_dp[m_id]['pos'], list_dp[m_id]['gr'])
                            if isOpinion(second_seg, list_dp[m_id]['pos'][0]):
                                VP_count_dict = count_dict(VP_count_dict, second_seg)
                                cnt_opinion[2] += 1
                            complex_word = join_morph(e_id, list_dp[m_id]['e_id'], complex_word, second_seg)
                            complex_pos = merge_pos +"+"+"+".join(list_dp[m_id]['pos'])

                            if isOpinion(first_seg, pos_list[0]) and isOpinion(second_seg, pos_list[0]):
                                VP_count_dict = count_dict(VP_count_dict, complex_word)
                                cnt_opinion[3] += 1
                            complex_word = join_morph(e_id, list_dp[m_id]['e_id'], merge_morph, "".join(list_dp[m_id]['morph']))
                            new_dict_dp = make_dp_dict([e_id, list_dp[m_id]['e_id']], dict_cnt, complex_word, complex_pos, ner_list[0], list_dp[m_id]['dp'], "VP")
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                            if second_seg != "" :
                                mutual_word_count_dict = count_dict(mutual_word_count_dict, merge_morph+"+"+second_seg)
                        else:
                            new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                    else:
                        new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                        dict_cnt += 1
                        new_list_dp.append(new_dict_dp)
                else:
                    new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                    dict_cnt += 1
                    new_list_dp.append(new_dict_dp)

            # (D) predicate-modifying phrase (AP)
            elif gr_list[0] == 'AP':
                first_seg, isSplit = get_VP_word(morph_list, pos_list, gr_list)
                # The AP on its own is not counted as a VP candidate.
                complex_word = merge_morph
                complex_pos = merge_pos

                if dp == list_dp[m_id]['e_id'] and 'VP' in list_dp[m_id]['gr']:
                    if not isSplit:
                        ner_skip = check_ner(list_dp[m_id]['ner'])
                        if ner_skip :
                            new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, merge_pos, "+".join(ner_list), dp, "+".join(gr_list))
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                            continue
                        elif dp == list_dp[m_id]['e_id'] and 'VP' in list_dp[m_id]['gr']:
                            complex_count += 1
                            second_seg, isSplit = get_VP_word(list_dp[m_id]['morph'], list_dp[m_id]['pos'], list_dp[m_id]['gr'])
                            if isOpinion(second_seg, list_dp[m_id]['pos'][0]):
                                VP_count_dict = count_dict(VP_count_dict, second_seg)
                                cnt_opinion[4] += 1
                            complex_word = join_morph(e_id, list_dp[m_id]['e_id'], first_seg, second_seg)
                            complex_pos = merge_pos + "+" + "+".join(list_dp[m_id]['pos'])
                            complex_word = join_morph(e_id, list_dp[m_id]['e_id'], merge_morph, "".join(list_dp[m_id]['morph']))
                            new_dict_dp = make_dp_dict([e_id, list_dp[m_id]['e_id']], dict_cnt, complex_word, complex_pos, ner_list[0], list_dp[m_id]['dp'], "VP")
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                            if isOpinion(merge_morph, pos_list[0]) and isOpinion(second_seg, pos_list[0]):
                                VP_count_dict = count_dict(VP_count_dict, complex_word)
                                cnt_opinion[5] += 1
                            if second_seg != "" :
                                mutual_word_count_dict = count_dict(mutual_word_count_dict, first_seg+"+"+second_seg)
                        else:
                            new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                            dict_cnt += 1
                            new_list_dp.append(new_dict_dp)
                    else:
                        new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                        dict_cnt += 1
                        new_list_dp.append(new_dict_dp)
                else:
                    new_dict_dp = make_dp_dict([e_id], dict_cnt, complex_word, complex_pos, ner_list[0], dp, "VP")
                    dict_cnt += 1
                    new_list_dp.append(new_dict_dp)
            else:
                # No merge rule applies: count the item on its own and emit it.
                if 'NP' in gr_list:
                    first_seg, isSplit = get_NP_word(morph_list, gr_list)
                    if isAspect(first_seg, pos_list[0], ner_list[0]):
                        NP_count_dict = count_dict(NP_count_dict, first_seg)
                        cnt_aspect[7] += 1

                if 'VP' in gr_list:
                    first_seg, isSplit = get_VP_word(morph_list, pos_list, gr_list)
                    if isOpinion(first_seg, pos_list[0]):
                        VP_count_dict = count_dict(VP_count_dict, first_seg)
                        cnt_opinion[6] += 1

                new_dict_dp = make_dp_dict([e_id], dict_cnt, merge_morph, pos_list[0], ner_list[0], dp, "+".join(gr_list))
                dict_cnt += 1
                new_list_dp.append(new_dict_dp)
        new_dp_data.append(new_list_dp)

    # Turn every count table into a list of (word, count) pairs, most
    # frequent first.
    SBJ_count_sorted = sorted(SBJ_count_dict.items(),reverse=True, key=lambda item: item[1])
    NP_count_sorted = sorted(NP_count_dict.items(),reverse=True, key=lambda item: item[1])
    VP_count_sorted = sorted(VP_count_dict.items(),reverse=True, key=lambda item: item[1])
    single_word_count_dict = sorted(single_word_count_dict.items(),reverse=True, key=lambda item: item[1])
    mutual_word_count_dict = sorted(mutual_word_count_dict.items(),reverse=True, key=lambda item: item[1])
    ret_list = list()
    ret_list.append({"SBJ_"+key: SBJ_count_sorted})
    ret_list.append({"NP_"+key: NP_count_sorted})
    ret_list.append({"VP_"+key: VP_count_sorted})
    ret_list.append({"single": single_word_count_dict})
    ret_list.append({"mutual": mutual_word_count_dict})
    for cnt_o in cnt_opinion:
        print(cnt_o, end=" ")
    print()
    for cnt_a in cnt_aspect:
        print(cnt_a, end=" ")
    print()
    return ret_list, new_dp_data

import json


def _write_json(path, payload):
    """Write payload to path as UTF-8 JSON, keeping non-ASCII text verbatim."""
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(payload, ensure_ascii=False))


print("start extract_np_vp")

# Reload the merged records written by the previous cell.
dp_merge = pd.read_json(f'{dir}/dp_merge.json', encoding='utf-8')

print(len(dp_merge["data"]))

morph_list, new_df = pair_count_extend(dp_merge["data"], "morph")

# Report the size of each count table: index 1 = NP, 2 = VP, 3 = single,
# 4 = mutual.
print("NP_morph: ", len(morph_list[1]['NP_morph']))
print("VP_morph: ", len(morph_list[2]['VP_morph']))
print("single: ", len(morph_list[3]['single']))
print("mutual: ", len(morph_list[4]['mutual']))

_write_json(f'{dir}/NP_morph_count.json', morph_list[1]['NP_morph'])
_write_json(f'{dir}/VP_morph_count.json', morph_list[2]['VP_morph'])
print("pair_count_extend ... DONE")

# Save the dp data: pair every regrouped sentence with its raw sentence.
line_len = len(new_df)
dp_list = [
    {'RawSentence': dp_merge['RawSentence'][row], 'data': new_df[row]}
    for row in range(line_len)
]
print(len(dp_list))

_write_json(f'{dir}/dp_data.json', dp_list)

print("write dp_data ... DONE")

# NP candidates counted at least 3 times.
NP_list = [entry[0] for entry in morph_list[1]['NP_morph'] if entry[1] >= 3]
ret_list = [{'NP': NP_list}]
print(len(NP_list))
_write_json(f'{dir}/NP_morph_list_over3.json', ret_list)

# VP candidates counted at least 5 times.
VP_list = [entry[0] for entry in morph_list[2]['VP_morph'] if entry[1] >= 5]
ret_list = [{'VP': VP_list}]
print(len(VP_list))
_write_json(f'{dir}/VP_morph_list_over5.json', ret_list)
print("save np vp ... DONE")

## 후보군에서 Aspect, Opinion Pair의 후보 생성

In [None]:
import pandas as pd
import json
from aeop import tag_aeop
import time

# Opinion candidates: VPs counted at least 5 times.
VP_morph_list = pd.read_json(f'{dir}/VP_morph_list_over5.json', encoding='utf-8')

# Aspect candidates: NPs counted at least 3 times.
NP_morph_list = pd.read_json(f'{dir}/NP_morph_list_over3.json', encoding='utf-8')
print(len(VP_morph_list['VP'][0]), len(NP_morph_list['NP'][0]))
# dp data
dp_all = pd.read_json(f'{dir}/dp_data.json', encoding='utf-8')
dp_a = dp_all.iloc[:500]
# Longest candidates first (sorted in place).
vp_list = VP_morph_list['VP'][0]
vp_list.sort(key=len, reverse=True)
np_list = NP_morph_list['NP'][0]
np_list.sort(key=len, reverse=True)
print(vp_list[:4])
print(np_list[:4])

# Run the tagging step and report wall-clock time around it.
print(time.strftime('%c', time.localtime(time.time())))
start = time.time()
aeop_list, VP_NP_list1, VP_NP_list2 = tag_aeop(dp_all, vp_list, np_list)
end = time.time()
print(len(VP_NP_list1), len(VP_NP_list2))
print(f"{end - start:.5f} sec")
print(time.strftime('%c', time.localtime(time.time())))

# Persist the three results returned by tag_aeop.
for out_name, payload in (
    ('aeop_list', aeop_list),
    ('VP_NP_list1', VP_NP_list1),
    ('VP_NP_list2', VP_NP_list2),
):
    with open(f'{dir}/{out_name}.json', 'w', encoding='utf-8') as jsonfile:
        jsonfile.write(json.dumps(payload, ensure_ascii=False))

# Count how often each pair's first element occurs: per list and overall.
NP_ret_dict1 = dict()
NP_ret_dict2 = dict()
NP_ret_dict_total = dict()
for pair_source, per_list_counts in (
    (VP_NP_list1, NP_ret_dict1),
    (VP_NP_list2, NP_ret_dict2),
):
    for pair in pair_source:
        head = pair[0]
        per_list_counts[head] = per_list_counts.get(head, 0) + 1
        NP_ret_dict_total[head] = NP_ret_dict_total.get(head, 0) + 1

# Most frequent first.
NP_sorted_total = sorted(
    NP_ret_dict_total.items(),
    key=lambda item: item[1],
    reverse=True,
)

with open(f'{dir}/NP_total_count.json', 'w', encoding='utf-8') as jsonfile:
    jsonfile.write(json.dumps(NP_sorted_total, ensure_ascii=False))

## Pair 등장 횟수순으로 Aspect, Opinion 생성

In [None]:
import pandas as pd
import json
import csv

# Aspect candidates: the first element of every [word, count] entry saved in
# NP_total_count.json.
with open(f'{dir}/NP_total_count.json', 'r', encoding='utf-8') as f:
    NP_sorted_total = json.load(f)
np_list_total = [word[0] for word in NP_sorted_total]
print(np_list_total[:5])
# Set copy for O(1) membership tests in the filtering loop below.
np_total_set = set(np_list_total)

# Load the [morph, count] tables and rebuild them as plain Python lists.
# NOTE(review): the names `np` / `vp` are kept as-is because other cells may
# rely on them; `np` here is a DataFrame, not numpy.
np= pd.read_json(f'{dir}/NP_morph_count.json', encoding = 'utf-8')
vp= pd.read_json(f'{dir}/VP_morph_count.json', encoding = 'utf-8')

np_morph_list = [[morph, int(count)] for morph, count in zip(np[0], np[1])]
vp_morph_list = [[morph, int(count)] for morph, count in zip(vp[0], vp[1])]

with open(f'{csv_dir}/VP_opinion.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerows(vp_morph_list)

print(len(np_morph_list))
print(np_morph_list[:5])
print(len(vp_morph_list))
print(vp_morph_list[:5])

# Keep only the NP morphs that also occur as an aspect candidate.
# result_np_list holds the bare morphs, result_np_count_list the [morph, count] entries.
size = len(np_morph_list)
result_np_list = list()
result_np_count_list = list()
for i, np_entry in enumerate(np_morph_list):
    if np_entry[0] in np_total_set:
        result_np_list.append(np_entry[0])
        result_np_count_list.append(np_entry)
    if i % 10000 == 1:
        print(i,"/",size,"...",round(i*100/size,2),"%")
print(result_np_list[:5])
print(result_np_count_list[:5])


with open(f'{csv_dir}/NP_aspect.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerows(result_np_count_list)

print(len(result_np_count_list))


with open(f'{dir}/NP_aspect.json', 'w', encoding='utf-8') as jsonfile:
    jsonfile.write(json.dumps(result_np_count_list, ensure_ascii=False))

# Candidate entries (with their pair counts) restricted to the retained NPs.
result_np_set = set(result_np_list)
ao_list = [entry for entry in NP_sorted_total if entry[0] in result_np_set]
print(len(ao_list))

with open(f'{dir}/VP_NP_list1.json', 'r', encoding='utf-8') as jsonfile:
    VP_NP_list1 = json.load(jsonfile)
with open(f'{dir}/VP_NP_list2.json', 'r', encoding='utf-8') as jsonfile:
    VP_NP_list2 = json.load(jsonfile)

def count_pair(np_list, vpnp_list, npvp_list):
    """Count how often each pair occurs, keeping only pairs whose NP is allowed.

    Args:
        np_list: NP strings to keep.
        vpnp_list: two-element string pairs whose second element is the NP.
        npvp_list: two-element string pairs whose first element is the NP.

    Returns:
        A tuple ``(pair_sorted, pair_sorted_vpnp, pair_sorted_npvp)``. Each item
        is a list of ``("first_second", count)`` tuples sorted by descending
        count; ties keep first-seen order. ``pair_sorted`` covers the retained
        npvp pairs followed by the retained vpnp pairs.
    """
    from collections import Counter

    # Set lookup instead of scanning np_list once per pair.
    allowed_np = set(np_list)
    ao_list_npvp = [pair for pair in npvp_list if pair[0] in allowed_np]
    ao_list_vpnp = [pair for pair in vpnp_list if pair[1] in allowed_np]
    ao_list = ao_list_npvp + ao_list_vpnp
    # Each label now prints the sample of its own list (they were swapped).
    print("ao_list_npvp", len(ao_list_npvp), ao_list_npvp[:5])
    print("ao_list_vpnp", len(ao_list_vpnp), ao_list_vpnp[:5])
    print("ao_list", len(ao_list), ao_list[:5])

    def _ranked(pairs):
        # Join the two sides into one key, tally, then sort by count (stable).
        tally = Counter(pair[0] + "_" + pair[1] for pair in pairs)
        return sorted(tally.items(), key=lambda item: item[1], reverse=True)

    return _ranked(ao_list), _ranked(ao_list_vpnp), _ranked(ao_list_npvp)

# Count pair occurrences, restricted to the retained NP aspects.
pair_sorted, pair_sorted_vpnp, pair_sorted_npvp = count_pair(
    result_np_list, VP_NP_list1, VP_NP_list2
)

# JSON copies of the per-order rankings.
with open(f'{dir}/pair_count_vpnp.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(pair_sorted_vpnp, jsonfile, ensure_ascii=False)
with open(f'{dir}/pair_count_npvp.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(pair_sorted_npvp, jsonfile, ensure_ascii=False)

# CSV copies of the same rankings: one "pair,count" row per entry.
with open(f'{csv_dir}/pair_count_vpnp.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    for ranked_row in pair_sorted_vpnp:
        write.writerow(ranked_row)
with open(f'{csv_dir}/pair_count_npvp.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    for ranked_row in pair_sorted_npvp:
        write.writerow(ranked_row)

## 사전 데이터와 일치하는 Aspect 추출

In [None]:
#-*-coding:utf-8-*-
# requirement: 
def extract_aspect():
    """Keep the counted pairs whose VP appears in a polarity lexicon.

    Reads ``pair_count_vpnp.json`` / ``pair_count_npvp.json`` from ``dir`` and
    the ``vp_pos.csv`` / ``vp_neg.csv`` / ``vp_neu.csv`` lexicons (column
    ``VP``) from ``csv_dir``. For each order ("vpnp", "npvp") it writes:

    * ``extract_pair_{mode}.csv``   - one row per kept pair: pair, cnt, polarity
      (1 positive, -1 negative, 0 neutral);
    * ``extract_aspect_{mode}.csv`` - one row per NP with the list of its kept
      pairs.

    Raises:
        SystemExit: if the pair list for a mode is empty (the original code
            called ``exit()`` here; this version adds a message, so a plain
            Python process now ends with a non-zero status).
    """
    with open(f'{dir}/pair_count_vpnp.json', 'r', encoding='utf-8') as jsonfile:
        VP_NP_list = json.load(jsonfile)
    with open(f'{dir}/pair_count_npvp.json', 'r', encoding='utf-8') as jsonfile:
        NP_VP_list = json.load(jsonfile)
    pos_pd = pd.read_csv(f'{csv_dir}/vp_pos.csv', encoding = 'utf-8')
    neg_pd = pd.read_csv(f'{csv_dir}/vp_neg.csv', encoding = 'utf-8')
    neu_pd = pd.read_csv(f'{csv_dir}/vp_neu.csv', encoding = 'utf-8')

    # VP -> polarity, built once for both modes. Later updates win, so filling
    # in the order neutral, negative, positive reproduces the original lookup
    # priority (positive first, then negative, then neutral).
    polarity_by_vp = dict()
    for lexicon, polarity in ((neu_pd, 0), (neg_pd, -1), (pos_pd, 1)):
        for vp_word in lexicon['VP']:
            polarity_by_vp[vp_word] = polarity

    # (mode, counted pairs, index of the VP part, index of the NP part)
    mode_settings = (
        ("vpnp", VP_NP_list, 0, -1),
        ("npvp", NP_VP_list, -1, 0),
    )

    for mode, ao_list, vp_idx, np_idx in mode_settings:
        if not ao_list:
            raise SystemExit(f"extract_aspect: no {mode} pairs to process")

        ret_list = list()
        aspect_dict = dict()
        for entry in tqdm(ao_list, desc=mode):
            pair_key = entry[0]
            pair_list = pair_key.split("_")
            if len(pair_list) == 1:
                # No "_" separator, so there is no VP/NP split to inspect.
                continue
            polarity = polarity_by_vp.get(pair_list[vp_idx])
            if polarity is None:
                continue
            ret_list.append({'pair': pair_key, 'cnt': entry[1], 'polarity': polarity})
            aspect_dict.setdefault(pair_list[np_idx], []).append(pair_key)

        # Built once after the scan (it used to be rebuilt on every iteration).
        result_list = [{'NP': key, 'pair': pairs} for key, pairs in aspect_dict.items()]
        aspect_dataframe = pd.DataFrame(result_list)
        ret_dataframe = pd.DataFrame(ret_list)
        aspect_dataframe.to_csv(f'{csv_dir}/extract_aspect_{mode}.csv',sep=',')
        ret_dataframe.to_csv(f'{csv_dir}/extract_pair_{mode}.csv',sep=',')

In [None]:
# Run the lexicon filter; writes extract_aspect_{vpnp,npvp}.csv and
# extract_pair_{vpnp,npvp}.csv under csv_dir.
extract_aspect()

## Aspect Opinion쌍 생성

In [None]:
#-*-coding:utf-8-*-
def pairing_aspect():
    """Aggregate the kept pairs per aspect (NP) and write ``aspect_pair.csv``.

    Reads the ``extract_aspect_{vpnp,npvp}.csv`` and
    ``extract_pair_{vpnp,npvp}.csv`` files from ``csv_dir``. For every NP row it
    sums the counts of its pairs and collects their polarities. Rows with a
    total count of 0 are dropped; the rest are written sorted by descending
    count with the columns NP, count, polarity, pair, order.

    Pairs that are missing from the count table are skipped individually. The
    original npvp loop raised ``KeyError`` on the first missing pair and the
    broad ``except`` then silently discarded the rest of that aspect's pairs.
    """
    #,NP,pair
    aspect_file_vpnp = 'extract_aspect_vpnp'
    aspect_file_npvp = 'extract_aspect_npvp'
    pair_file_vpnp = 'extract_pair_vpnp'
    pair_file_npvp = 'extract_pair_npvp'
    aspect_vpnp = pd.read_csv(f'{csv_dir}/{aspect_file_vpnp}.csv', encoding = 'utf-8')
    aspect_npvp = pd.read_csv(f'{csv_dir}/{aspect_file_npvp}.csv', encoding = 'utf-8')
    pair_vpnp = pd.read_csv(f'{csv_dir}/{pair_file_vpnp}.csv', encoding = 'utf-8')
    pair_npvp = pd.read_csv(f'{csv_dir}/{pair_file_npvp}.csv', encoding = 'utf-8')

    # pair -> {"cnt", "polarity"}; the first occurrence wins (npvp before vpnp).
    pair_dict = dict()
    for dep_pair in [pair_npvp, pair_vpnp]:
        for pair, cnt, polarity in zip(dep_pair['pair'], dep_pair['cnt'], dep_pair['polarity']):
            pair_dict.setdefault(pair, {"cnt": cnt, "polarity": polarity})

    def _collect(aspect_df, order, desc, banned_first_char=None):
        """Build the result rows for one pair order."""
        rows = list()
        for i in tqdm(range(len(aspect_df['NP'])), desc=desc):
            ret_dict = {
                'NP': aspect_df['NP'][i],
                'count': 0,
                'polarity': [],
                'pair': [],
                'order': order,
            }
            try:
                pair_list = ast.literal_eval(aspect_df['pair'][i])
            except (ValueError, SyntaxError) as e:
                # Unparseable cell: report it and skip the row (best effort).
                print(e)
                continue
            if not isinstance(pair_list, (list, tuple)):
                print(f"unexpected pair value for {ret_dict['NP']!r}: {pair_list!r}")
                continue
            for pair in pair_list:
                if not isinstance(pair, str):
                    continue
                if banned_first_char is not None and pair[:1] == banned_first_char:
                    continue
                pair_info = pair_dict.get(pair)
                if pair_info is None:
                    continue
                ret_dict['count'] += pair_info['cnt']
                ret_dict['polarity'].append(pair_info['polarity'])
                ret_dict['pair'].append(pair)
            if ret_dict['count'] > 0:
                rows.append(ret_dict)
        return rows

    # In vpnp order, pairs whose text starts with '없' are banned.
    result_list = _collect(aspect_vpnp, 'vpnp', 'vp_np pairing', banned_first_char='없')
    result_list += _collect(aspect_npvp, 'npvp', 'np_vp pairing')

    result_sorted_list = sorted(result_list, key=lambda row: row['count'], reverse=True)
    result_dataframe = pd.DataFrame(result_sorted_list)
    result_dataframe.to_csv(f'{csv_dir}/aspect_pair.csv',sep=',')
    

In [None]:
# Aggregate the kept pairs per aspect; writes aspect_pair.csv under csv_dir.
pairing_aspect()

## Aspect 그룹 태깅

In [None]:
def match_tag_aspect():
    """Attach the group annotation to each aspect row.

    Reads ``aspect_pair.csv`` and ``aspect_group.csv`` from ``csv_dir``. Aspects
    whose ``aspect_group.csv`` row has tag ``'o'`` are written, together with
    their tag and group, to ``match_tag_aspect.csv``; every other aspect goes to
    ``except_tag_aspect.csv`` for inspection.
    """
    # Load the aspect/pair table and the group annotations.
    aspect_pair_file = 'aspect_pair'
    pair_tag_file = 'aspect_group'
    aspect_pair = pd.read_csv(f'{csv_dir}/{aspect_pair_file}.csv', encoding = 'utf-8')
    pair_tag = pd.read_csv(f'{csv_dir}/{pair_tag_file}.csv', encoding = 'utf-8')

    # Index the rows tagged 'o' by their NP: NP -> [NP, pair, tag, group].
    annotated = dict()
    for row in range(len(pair_tag['NP'])):
        if pair_tag['tag'][row] != 'o':
            continue
        aspect = pair_tag['NP'][row]
        annotated[aspect] = [
            aspect,
            pair_tag['pair'][row],
            pair_tag['tag'][row],
            pair_tag['group'][row],
        ]

    # aspect_pair columns used: NP, count, polarity, pair
    aspect_pair_tag_list = list()
    except_pair_tag_list = list()
    for row in tqdm(range(len(aspect_pair['NP'])), desc='match tag aspect'):
        aspect = aspect_pair['NP'][row]
        if aspect not in annotated:
            # Kept separately so unmatched aspects can be reviewed.
            except_pair_tag_list.append({
                'NP': aspect,
                'count': aspect_pair['count'][row],
                'polarity': aspect_pair['polarity'][row],
                'pair': aspect_pair['pair'][row],
            })
            continue
        # aspect-group-polarity-pair record used to build the training data.
        annotation = annotated[aspect]
        aspect_pair_tag_list.append({
            'NP': aspect,
            'tag': annotation[2],
            'group': annotation[3],
            'count': aspect_pair['count'][row],
            'polarity': aspect_pair['polarity'][row],
            'pair': aspect_pair['pair'][row],
        })

    tag_dataframe = pd.DataFrame(aspect_pair_tag_list)
    except_dataframe = pd.DataFrame(except_pair_tag_list)
    tag_dataframe.to_csv(f'{csv_dir}/match_tag_aspect.csv',sep=',')
    except_dataframe.to_csv(f'{csv_dir}/except_tag_aspect.csv',sep=',')

In [None]:
# Join aspects with their group annotation; writes match_tag_aspect.csv and
# except_tag_aspect.csv under csv_dir.
match_tag_aspect()

## ABSA 학습 데이터 생성

In [None]:
# RawSentence랑 aeop 태깅된 데이터 불러오기
class bert_train():
    """Build the tagged ABSA train/test TSV files from the AEOP-tagged sentences.

    Instantiating the class runs the whole pipeline: it loads
    ``aeop_list.json`` from ``dir``, tags every sentence, filters and
    de-duplicates the lines, shuffles them and writes ``train.tsv`` (first 90%)
    and ``test.tsv`` (the rest) into ``train_dir``.

    Each output line is ``"w1 w2 ...\\tt1 t2 ...\\n"`` where a tag is ``O``,
    ``OPN-B`` / ``OPN-I`` or ``ASP-<group>-B`` / ``ASP-<group>-I``.
    """

    def __init__(self):
        self.dp_aeop = pd.read_json(f'{dir}/aeop_list.json', encoding = 'utf-8')
        train_list = self.make_train()
        aeop_tag_train_list, other_train_list = self.arrange_train_list(train_list)
        # per=0: no untagged ("other") sentences are mixed into the dataset.
        train_list = self.merge_train_data(aeop_tag_train_list, other_train_list, 0)
        total_size = len(train_list)
        train_size = total_size * 9 // 10
        print("Total Dataset Size:", total_size)
        # The two labels below used to be swapped.
        print("Train Dataset Size:", train_size)
        print("Test Dataset Size:", total_size - train_size)
        self.write_tsv(train_list[:train_size], "train")
        self.write_tsv(train_list[train_size:], "test")

    def make_group_dict(self):
        """Load ``match_tag_aspect.csv`` and index it by aspect.

        Only rows whose ``group`` cell evaluates to a single-element list are
        used. Rows for the same aspect have their pair lists concatenated.

        Returns:
            dict mapping aspect (NP) -> ``{"tag": <code from tag_dict>,
            "pair": [pair strings]}``.
        """
        match_tag = pd.read_csv(f'{csv_dir}/match_tag_aspect.csv', encoding = 'utf-8')
        group_dict = dict()
        for i in tqdm(range(len(match_tag)), desc='make group dict'):
            group_list = ast.literal_eval(match_tag['group'][i])
            pair_list = ast.literal_eval(match_tag['pair'][i])
            if len(group_list) != 1:
                continue
            aspect = match_tag['NP'][i]
            if aspect in group_dict:
                group_dict[aspect]['pair'] += pair_list
            else:
                group_dict[aspect] = {"tag": tag_dict[group_list[0]],
                                      "pair": pair_list}
        return group_dict

    def write_tsv(self, line_list, filename):
        """Write the already newline-terminated lines to ``{train_dir}/{filename}.tsv``."""
        with open(f'{train_dir}/{filename}.tsv', 'w', encoding='utf-8') as file:
            file.writelines(line_list)

    def compare_init_target(self, target, morph):
        """Return True when both strings have the same number of space-separated
        words and every word except the last one is identical."""
        target_split = target.split(" ")
        morph_split = morph.split(" ")
        if len(morph_split) != len(target_split):
            return False
        return morph_split[:-1] == target_split[:-1]

    def arrange_train_list(self, train_list):
        """Filter the raw lines and split them into tagged and untagged lists.

        Lines are sorted case-insensitively, then dropped when the first token
        starts with an ASCII letter or digit, when the line equals the
        previously kept line, or when the word and tag counts differ.

        Returns:
            ``(lines with at least one non-'O' tag, lines with only 'O' tags)``.
        """
        d_count = 0   # duplicates
        nk_count = 0  # lines that do not start with Korean text
        t_count = 0   # word/tag count mismatches
        nTrain = len(train_list)
        train_list_sorted = sorted(train_list, key=str.lower)
        new_train_list = list()
        new_other_list = list()
        before_line = ""
        # Compiled once instead of once per line.
        english = re.compile('[a-zA-Z0-9]+')
        for line in tqdm(train_list_sorted, desc='arrange train list'):
            first_token = line.split(" ")[0]
            if english.match(first_token):
                nk_count += 1
                continue
            if line == before_line:
                d_count += 1
                continue
            split_line = line.split('\n')[0].split('\t')
            words_list = split_line[0].split(" ")
            tags_list = split_line[1].split(" ")
            if len(words_list) != len(tags_list):
                t_count += 1
                continue
            if any(tag != 'O' for tag in tags_list):
                new_train_list.append(line)
            else:
                new_other_list.append(line)
            before_line = line
        nTrainResult = len(new_train_list)
        nOtherResult = len(new_other_list)
        print(f"한국어 아닌 데이터 {nk_count}개, 중복 데이터 {d_count}개, 태그 개수 안맞는 {t_count}개 제거")
        print(f"{nTrain}개 학습데이터 중 {nTrainResult}개 AEOP 데이터, {nOtherResult}개 other 데이터 확보")

        return new_train_list, new_other_list

    def merge_train_data(self, tag_list, other_list, per = 20):
        """Return a shuffled list of all tagged lines plus the first
        ``len(tag_list) * per // 100`` untagged lines."""
        tag_size = len(tag_list)
        other_size = tag_size * per // 100
        train_list = tag_list + other_list[:other_size]
        random.shuffle(train_list)
        return train_list

    def make_train(self):
        """Turn every sentence of ``self.dp_aeop`` into one tagged TSV line."""
        train_list = list()
        nSentence = len(self.dp_aeop['RawSentence'])
        group_dict = self.make_group_dict()
        for i in tqdm(range(nSentence), desc='make_train'):
            train_list.append(self._tag_sentence(self.dp_aeop['data'][i], group_dict))
        return train_list

    def _resolve_aspect(self, morph, group_dict):
        """Return the group_dict key for an aspect morph, or None.

        The morph itself is tried first. Otherwise, if its last word is longer
        than two characters, the morph without its final character is tried.
        """
        if morph in group_dict:
            return morph
        if len(morph.split(" ")[-1]) > 2 and morph[:-1] in group_dict:
            return morph[:-1]
        return None

    def _opinion_matches(self, aspect_key, pair_list, opinion):
        """Return True if any stored pair of the aspect matches the opinion text.

        A pair matches when its non-aspect side has the same number of words as
        the opinion and the opinion starts with it.
        """
        opinion_words = len(opinion.split(" "))
        for pair in pair_list:
            pair_split = pair.split("_")
            if len(pair_split) < 2:
                continue
            if pair_split[0] == aspect_key:
                vp = pair_split[1]   # npvp order
            elif pair_split[1] == aspect_key:
                vp = pair_split[0]   # vpnp order
            else:
                # The aspect is on neither side. The original code fell through
                # here with an unbound or stale vp_idx.
                continue
            if len(vp.split(" ")) == opinion_words and opinion.startswith(vp):
                return True
        return False

    def _tag_sentence(self, morph_list, group_dict):
        """Build the TSV line for one sentence.

        Args:
            morph_list: list of dicts with at least ``'morph'`` (text, possibly
                several space-separated words) and ``'aeop'`` ("" or a value
                whose first element is 'A' for aspect / 'O' for opinion).
            group_dict: result of :meth:`make_group_dict`.

        Returns:
            ``"words\\ttags\\n"`` with one tag per space-separated word.
        """
        tags = ['O'] * len(morph_list)
        for j, entry in enumerate(morph_list):
            aeop = entry['aeop']
            if aeop == "" or aeop[0] != 'A':
                continue
            aspect_key = self._resolve_aspect(entry['morph'], group_dict)
            if aspect_key is None:
                continue
            aspect_info = group_dict[aspect_key]
            for k, candidate in enumerate(morph_list):
                candidate_aeop = candidate['aeop']
                if candidate_aeop == "" or candidate_aeop[0] != 'O':
                    continue
                # The aspect is tagged only if it pairs with some opinion.
                if self._opinion_matches(aspect_key, aspect_info['pair'], candidate['morph']):
                    tags[k] = 'OPN'
                    tags[j] = 'ASP-' + aspect_info['tag']

        words_list = list()
        tags_list = list()
        for entry, tag in zip(morph_list, tags):
            for word_cnt, word in enumerate(entry['morph'].split(' ')):
                if tag == 'O':
                    tags_list.append(tag)
                elif word_cnt == 0:
                    tags_list.append(tag + '-B')
                else:
                    tags_list.append(tag + '-I')
                words_list.append(word)
        return " ".join(words_list) + '\t' + " ".join(tags_list) + '\n'

In [None]:
# Instantiating the class runs the whole pipeline and writes train.tsv and
# test.tsv under train_dir.
bert_train()

# KoBERT-ABSA 학습

## Train & Eval

Parameters
|Param|Desc|
|:----:|----|
|model_dir|저장된 모델 디렉토리(absa_model)|
|data_dir|데이터셋 디렉토리(absa_dataset)|
|train_batch_size|(Default: 32)|
|eval_batch_size|(Default: 64)|
|num_train_epochs|학습 Epoch(Default: 20)|
|do_train|training 모드|
|do_eval|eval 모드, training 모드와 함께 사용시 1,000 step 마다 Eval|

In [None]:
!python main.py --data_dir=absa_dataset --model_dir=absa_model --do_train --do_eval --train_batch_size=16 --num_train_epochs=20

## Predict

Parameters
|Param|Desc|
|:----:|----|
|model_dir|저장된 모델 디렉토리(model)|
|input_file|Input 파일명|
|output_file|Output 파일명|
|batch_size|(Default: 32)|

In [None]:
!python predict.py --model_dir=absa_model --input_file ./absa_dataset/test.txt