#1. 필요한 라이브러리 임포트

In [None]:
# KoNLPy와 Kkma 설치
!pip install konlpy

# Java 설치 (KoNLPy에서 Kkma를 사용하기 위해 필요)
!apt-get install -y openjdk-11-jdk

# JPype1 설치 (KoNLPy가 Java와 상호작용하기 위해 필요)
!pip install jpype1

# KoNLPy에서 필요한 추가 파일 설치
!apt-get install -y curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

#2. 점수 계산 함수 구현

In [1]:
from konlpy.tag import Kkma
from collections import Counter
import math

# Kkma 형태소 분석기 생성
kkma = Kkma()

# n-gram 생성 함수
def get_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

# ROUGE 계산 함수
def calculate_rouge(reference, candidate, n=1):
    # n-gram 생성
    ref_ngrams = get_ngrams(reference, n)
    cand_ngrams = get_ngrams(candidate, n)

    # n-gram 카운트
    ref_counter = Counter(ref_ngrams)
    cand_counter = Counter(cand_ngrams)

    # 교집합 개수
    overlap = sum((ref_counter & cand_counter).values())

    # Precision, Recall, F1 계산
    if len(cand_ngrams) == 0:
        precision = 0.0
    else:
        precision = overlap / len(cand_ngrams)

    if len(ref_ngrams) == 0:
        recall = 0.0
    else:
        recall = overlap / len(ref_ngrams)

    if precision + recall == 0:
        f1_score = 0.0
    else:
        f1_score = 2 * precision * recall / (precision + recall)

    return precision, recall, f1_score

# BLEU 계산 함수
def calculate_bleu(reference, candidate, max_n=4):
    precisions = []
    for n in range(1, max_n+1):
        ref_ngrams = get_ngrams(reference, n)
        cand_ngrams = get_ngrams(candidate, n)

        ref_counter = Counter(ref_ngrams)
        cand_counter = Counter(cand_ngrams)

        overlap = sum((ref_counter & cand_counter).values())
        precision = overlap / len(cand_ngrams) if len(cand_ngrams) > 0 else 0
        precisions.append(precision)

    # Geometric mean of the precisions
    if all(p == 0 for p in precisions):
        bleu_score = 0
    else:
        bleu_score = math.exp(sum([math.log(p) if p > 0 else -999999 for p in precisions]) / max_n)

    # Brevity Penalty
    ref_len = len(reference)
    cand_len = len(candidate)
    brevity_penalty = math.exp(1 - ref_len / cand_len) if cand_len < ref_len else 1

    bleu_score *= brevity_penalty

    return bleu_score
# ROUGE-L 계산 (Longest Common Subsequence 기반)
def lcs(X, Y):
    m = len(X)
    n = len(Y)
    dp = [[0] * (n + 1) for i in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if X[i - 1] == Y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    return dp[m][n]

#3. 점수 계산해보기

In [2]:
# 형태소 분석 결과 생성
reference = "고양이가 나무 위로 올라갔다."
candidate = "나무 위로 고양이가 올라갔다."

reference_tokens = kkma.morphs(reference)
candidate_tokens = kkma.morphs(candidate)
print("문장 1 형태소 분석 결과: ", reference_tokens)
print("문장 2 형태소 분석 결과: ", candidate_tokens)
# ROUGE-1 계산
precision_rouge1, recall_rouge1, f1_rouge1 = calculate_rouge(reference_tokens, candidate_tokens, n=1)
print(f"ROUGE-1 -> Precision: {precision_rouge1}, Recall: {recall_rouge1}, F1: {f1_rouge1}")

# ROUGE-2 계산
precision_rouge2, recall_rouge2, f1_rouge2 = calculate_rouge(reference_tokens, candidate_tokens, n=2)
print(f"ROUGE-2 -> Precision: {precision_rouge2}, Recall: {recall_rouge2}, F1: {f1_rouge2}")


lcs_length = lcs(reference_tokens, candidate_tokens)
precision_rougeL = lcs_length / len(candidate_tokens) if len(candidate_tokens) > 0 else 0
recall_rougeL = lcs_length / len(reference_tokens) if len(reference_tokens) > 0 else 0
f1_rougeL = 2 * precision_rougeL * recall_rougeL / (precision_rougeL + recall_rougeL) if precision_rougeL + recall_rougeL > 0 else 0

print(f"ROUGE-L -> Precision: {precision_rougeL}, Recall: {recall_rougeL}, F1: {f1_rougeL}")

# BLEU 계산
bleu_score = calculate_bleu(reference_tokens, candidate_tokens, max_n=4)
print(f"BLEU -> Score: {bleu_score}")

문장 1 형태소 분석 결과:  ['고양이', '가', '나무', '위', '로', '올라가', '었', '다', '.']
문장 2 형태소 분석 결과:  ['나무', '위', '로', '고양이', '가', '올라가', '었', '다', '.']
ROUGE-1 -> Precision: 1.0, Recall: 1.0, F1: 1.0
ROUGE-2 -> Precision: 0.75, Recall: 0.75, F1: 0.75
ROUGE-L -> Precision: 0.7777777777777778, Recall: 0.7777777777777778, F1: 0.7777777777777778
BLEU -> Score: 0.48109772909788073


In [3]:

def score(reference, candidate):

    reference_tokens = kkma.morphs(reference)
    candidate_tokens = kkma.morphs(candidate)
#     print("문장 1 형태소 분석 결과: ", reference_tokens)
#     print("문장 2 형태소 분석 결과: ", candidate_tokens)
    # ROUGE-1 계산
    precision_rouge1, recall_rouge1, f1_rouge1 = calculate_rouge(reference_tokens, candidate_tokens, n=1)
#     print(f"ROUGE-1 -> Precision: {precision_rouge1}, Recall: {recall_rouge1}, F1: {f1_rouge1}")

    # ROUGE-2 계산
    precision_rouge2, recall_rouge2, f1_rouge2 = calculate_rouge(reference_tokens, candidate_tokens, n=2)
#     print(f"ROUGE-2 -> Precision: {precision_rouge2}, Recall: {recall_rouge2}, F1: {f1_rouge2}")


    lcs_length = lcs(reference_tokens, candidate_tokens)
    precision_rougeL = lcs_length / len(candidate_tokens) if len(candidate_tokens) > 0 else 0
    recall_rougeL = lcs_length / len(reference_tokens) if len(reference_tokens) > 0 else 0
    f1_rougeL = 2 * precision_rougeL * recall_rougeL / (precision_rougeL + recall_rougeL) if precision_rougeL + recall_rougeL > 0 else 0

#     print(f"ROUGE-L -> Precision: {precision_rougeL}, Recall: {recall_rougeL}, F1: {f1_rougeL}")

    # BLEU 계산
    bleu_score = calculate_bleu(reference_tokens, candidate_tokens, max_n=4)
#     print(f"BLEU -> Score: {bleu_score}")
    return f1_rouge1, f1_rougeL, bleu_score

# format 별 인식

In [14]:
import json

# file_path = ['trans_html.jsonl_inference2.jsonl','trans_xml.jsonl_inference2.jsonl','trans_mkdw1.jsonl_inference2.jsonl']#,'trans_mkdw2.jsonl_inference.jsonl']
file_path = ['trans_xml.jsonl_inference_finetuned.jsonl']
all_json = []

for path in file_path:
    with open(path, 'r', encoding='utf-8') as file:  # Use 'path' instead of 'file_path'
        jsonl_data = [json.loads(line) for line in file]
        all_json.append(jsonl_data)

In [27]:
text = all_json[0][0]
print(all_json[0][2])

[|system|]You are EXAONE model from LG AI Research, a helpful assistant.[|endofturn|]
[|user|]다음 document를 활용하여 표의 highlighted 부분을 설명해주세요. document: {'table_title': '2019년도 국가지방협력 특별교부세 지역사랑상품권 발행지원 현황', 'highlighted_cells': [[0, 1], [1, 1], [3, 1]], 'table_xml': '<table><row><value>구 분</value><is_header>True</is_header><col>0</col><colspan>1</colspan><row>0</row><rowspan>1</rowspan></row><row><value>교부 지자체</value><is_header>True</is_header><col>1</col><colspan>1</colspan><row>0</row><rowspan>1</rowspan></row><row><value>지원비율</value><is_header>True</is_header><col>2</col><colspan>1</colspan><row>0</row><rowspan>1</rowspan></row><row><value>지원액</value><is_header>True</is_header><col>3</col><colspan>1</colspan><row>0</row><rowspan>1</rowspan></row><row><value>보통교부세 교부단체</value><is_header>False</is_header><col>0</col><colspan>1</colspan><row>1</row><rowspan>1</rowspan></row><row><value>인천 본청 등 132개</value><is_header>False</is_header><col>1</col><colspan>1</colspan><row>1</row><rowspan>1</

In [19]:
import re
# 정규식을 사용하여 '[|assistant|]'와 '[|endofturn|]' 사이의 텍스트 추출
pattern = r"\[\|assistant\|\](.*?)\[\|endofturn\|\]"
matches = re.search(pattern, text, re.DOTALL)

# 추출된 텍스트 출력
extracted_text = matches.group(1).strip() if matches else None
print(extracted_text.split('\n')[-1])

이 표는 영국의 우회이익세(DPT) 과세 실적을 연도별로 보여주고 있으며, 각 연도의 신고 건수, 당초 예상 세수, 실제 징수된 총 세수를 비교하고 있습니다.


In [24]:
import json

file_path = ['trans_html.jsonl','trans_xml.jsonl','trans_mkdw1.jsonl','trans_mkdw2.jsonl']
all_target = []

for path in file_path:
    with open(path, 'r', encoding='utf-8') as file:  # Use 'path' instead of 'file_path'
        jsonl_data = [json.loads(line) for line in file]
        all_target.append(jsonl_data)

In [25]:
all_target[0][0]['output']

'영국의 우회이익세(DPT) 과세실적을 살펴보면 2015/2016년 31만 파운드에서 2017/2018년에는 388만 파운드로 10배 이상 증가한 것으로 나타났다.'

In [26]:
score(all_target[0][0]['output'], extracted_text)

(0.0963455149501661, 0.05647840531561462, 0.02158291185419082)

# 전체

In [11]:
scores = []
for pre_json, tar_json in zip(all_json, all_target):
    temp = []
    for pre_row, tar_row in zip(pre_json, tar_json):
        pattern = "[|assistant|]"
        extracted_prediction = pre_row.split(pattern)[-1].split('\n')[-1]
        extracted_target = tar_row['output']
        try:
            rouge1_f1, rougeN_f1, bleu = score(extracted_target,extracted_prediction)
            temp.append((rouge1_f1, rougeN_f1, bleu))
        except:
            print(f"{extracted_target}, {extracted_prediction} : error")
    scores.append(temp)

비수도권의 전반적 평가가 6.78점으로 높고, 농어촌 생활권에서 6.82점으로 높은 개선 수준을 나타냈다., 14\�>\n>\n>\n>\n>\n>\n>\n\n>\n>\n>\n>\n>\n>\n>\>\>\>\>>\>\>\>>\�>>\>\>>>\>>>>>>>>>>>>>>\비즈>\�>>>>>>>>>>>>>>>>>>>>\�>>>>>>>>, 10>>>>, 10>>>>>>>>>>>>>>, 12>, 1, 1>>, 22>>, 1>>, 1>>, 1, 20>>>>>>>>>>>>\�>>>>>>>>>>>>>, 1>>>, 2\>>>(1>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>{\산이\�\�>>>>>>>\1>\�산규규규규>>\�>\�>>>\�>\�\1, 1, 1, 1, 1, 1\\\규규규규\�\�\�\\1, 1, 1, 1, 이는, 규제규규규규규규표규규규규규규, 1, 3, 규제가 1, 이는,, 과학입니다. 1, 규규, 1, 1, 1, 규규규, 1>, 1, 1, 1, 1, 1규규규규 1 1규규규규 1, 규   규규규규  규규   규규규 1 1, 규규 , 규규규규규규규규규규규규규규규규규규규규규규규규규규규규 규규규규규규를규규규규규규규1표규규규규규규세포규규규규,1세포세포1,1>>>>>>\규규규규규규규규규규규규규규규규규규규규규규규규규세포>, 1, 1 1>>>>>>>11>>>>>>>1>>>>>>>>>>>>>>>>>>>\\1>>>>>>>>>>1111규1>>11규규규1>,1입력규1>규규10\표규1000 1>>10   1 11 1>1   (1 1규규1, 0 00 1규규1000 1규규규성1,1,1규111 1 00(100ع규10표규1010>10001표100011100110000110표표1010표표10,101000\00>0>0>00010>>0000규규100규규규규(0성규규규성규성0010000000000101000001010101010001001001101000001 101 00010 0 0 000000000000000010 0 00100000000000010000000101010100

In [12]:
mean_score = []
for score_ in scores:
    col1, col2, col3 = zip(*score_)
    mean_score.append([sum(col1)/len(col1), sum(col2)/len(col2), sum(col3)/len(col3)])

In [13]:
mean_score

[[0.3070855247264256, 0.23308296678703952, 0.05906915013695125],
 [0.2554960368523674, 0.19943523499089083, 0.04781624063796962],
 [0.35091652821807645, 0.27148195871828035, 0.07515620335621685]]

In [None]:
['trans_html.jsonl','trans_xml.jsonl','trans_mkdw1.jsonl','trans_mkdw2.jsonl']

### 출력 원본 score
- html: [0.12738789711537324, 0.09238321435613023, 0.015037952380729634],
- xml: [0.10157205956595884, 0.07564866534694506, 0.010214911541048937],
- mkdw1: [0.1340286287798465, 0.09848943221124995, 0.016972311369042585],
- mkdw2: [0.1375376548128706, 0.09993007429514748, 0.016385246340009194]

### 약간 포매팅 후 score
- html: [0.24166040595835606, 0.1777022497369232, 0.027341897428892665],
- xml: [0.18187266009261158, 0.13595813184081018, 0.017700276732859184],
- mkdw1: [0.24081715201135784, 0.17784569718924068, 0.02471643600928995],
- mkdw2: [0.23446849290218383, 0.17165369107269104, 0.022243427365346933]