### Chatbot의 응답 평가하기
**[생성도 평가]**
1. EM
2. F1 score
3. BLEU
**[생성 유사도]**
4. word2vec

In [1]:
import math
import re
import random
import urllib.request

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel

import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Chatbot

In [3]:
class Inference():
    def __init__(self):
        # special token
        self.BOS = "</s>"
        self.EOS = "</s>"
        self.UNK = "<unk>"
        self.PAD = "<pad>"
        self.MASK = "<unused0>"
        self.ENTER = "<ENTER>"

        self.Q_TKN = "<usr>"
        self.A_TKN = "<sys>"
        self.SENT = "<unused1>"

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                                 bos_token=self.BOS,
                                                                 eos_token=self.EOS,
                                                                 unk_token=self.UNK,
                                                                 pad_token=self.PAD,
                                                                 mask_token=self.MASK)
        self.model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

    def model_load(self, path):

        state_dict = torch.load(path, map_location=self.device)

        new_state_dict = {}
        for k, v in state_dict.items():
            name = k[7:] if k.startswith('module.') else k  # remove `module.`
            new_state_dict[name] = v

        self.model.load_state_dict(new_state_dict)
        self.model.to(self.device)
        print("[+] Model load complete")

    def inference(self, msg):
        answer = ""
        while True:
            input_ids = torch.LongTensor(self.tokenizer.encode(self.Q_TKN + str(msg) + self.SENT + self.A_TKN + answer)).unsqueeze(dim=0).to(self.device)
            predict = self.model(input_ids)
            predict = predict.logits
            predict = self.tokenizer.convert_ids_to_tokens(torch.argmax(predict, dim=-1).squeeze().cpu().detach().numpy().tolist())[-1]

            if (predict == self.EOS) or (predict == self.PAD):
                break
            answer += predict.replace("▁", " ")

        return answer.strip()

In [4]:
# finetuned pt
model = Inference()
model.model_load("/content/drive/MyDrive/전남대/수업/24년도/자연어처리 Project/fintuning_4.pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[+] Model load complete


In [4]:
# backbone pt
model = Inference()
model.model_load("/content/drive/MyDrive/전남대/수업/24년도/자연어처리 Project/backbone.pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[+] Model load complete


In [5]:
data = pd.read_csv("/content/drive/MyDrive/전남대/수업/24년도/자연어처리 Project/finetuning_[포차코].csv", index_col=0)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Q       731 non-null    object
 1   A       731 non-null    object
dtypes: object(2)
memory usage: 17.1+ KB


In [9]:
model_a_list = []

for idx, q in tqdm.tqdm(enumerate(data["Q"])):
    try:
        model_a = model.inference(q)
        model_a_list.append(model_a)
    except RuntimeError as e:
        if "The size of tensor a" in str(e):
            model_a_list.append("NaN")
        else:
            raise e

In [None]:
data['model_a'] = model_a_list
data.to_csv("evaluation_data.csv", index = False)

In [38]:
backbone_data = pd.DataFrame({'Q':backbone_q_list,
                             'A':a_list,
                             'backbone_A':backbone_a_list})

In [41]:
backbone_data.to_csv("evaluation_backbone.csv", index = False)

## Evaluation

In [62]:
data = pd.read_csv("/content/drive/MyDrive/전남대/수업/24년도/자연어처리 Project/evaluation_data.csv")

In [43]:
backbone_data = pd.read_csv("/content/drive/MyDrive/전남대/수업/24년도/자연어처리 Project/evaluation_backbone.csv")

### Evaluation 1. EM

In [48]:
def exact_match(a, model_a):
    return a == model_a

def evaluate_exact_match(data):

    data['exact_match'] = data.apply(lambda x: exact_match(x['A'], x['backbone_A']), axis=1)

    total_samples = len(data)
    correct_predictions = data['exact_match'].sum()
    accuracy = correct_predictions / total_samples

    print(f"Exact Match Accuracy: {accuracy:.4f}")

In [34]:
# finetuned EM
evaluate_exact_match(data)

Exact Match Accuracy: 0.0044


In [49]:
# backbone EM
evaluate_exact_match(backbone_data)

Exact Match Accuracy: 0.0000


### Evaluation 2. F1 Score

In [51]:
from sklearn.metrics import f1_score
from collections import Counter

def f1_score_from_predictions(pred, label):
    # 토큰화
    pred_tokens = pred.split()
    label_tokens = label.split()

    # 정답과 예측 답변의 교집합 계산
    common = Counter(pred_tokens) & Counter(label_tokens)
    num_common = sum(common.values())

    # Precision, Recall 계산
    if num_common == 0:
        return 0.0, 0.0, 0.0
    precision = num_common / len(pred_tokens)
    recall = num_common / len(label_tokens)

    # F1-score 계산
    if precision + recall == 0:
        f1 = 0.0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1

In [19]:
# finetuned F1-score
scores = data.apply(lambda x: f1_score_from_predictions(x['A'], x['model_a']), axis=1)

# 결과 계산
precision = scores.apply(lambda x: x[0]).mean()
recall = scores.apply(lambda x: x[1]).mean()
f1 = scores.apply(lambda x: x[2]).mean()

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Precision: 0.01049519586104952, Recall: 0.013562453806356244, F1-score: 0.011091753774680604


In [52]:
# backbone F1-score
scores = backbone_data.apply(lambda x: f1_score_from_predictions(x['A'], x['backbone_A']), axis=1)

# 결과 계산
precision = scores.apply(lambda x: x[0]).mean()
recall = scores.apply(lambda x: x[1]).mean()
f1 = scores.apply(lambda x: x[2]).mean()

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

Precision: 0.0011904761904761904, Recall: 0.0012152777777777776, F1-score: 0.0010416666666666667


### Evaluation 3. BLEU

In [50]:
import datasets

In [14]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.2


In [38]:
# finetuned BLEU
bleu_metric = datasets.load_metric("sacrebleu")

bleu_metric.add(
    prediction = data['model_a'], reference = [data['A']]
)

results = bleu_metric.compute(smooth_method = 'floor', smooth_value=0)
results['precisions'] =[np.round(p,2) for p in results["precisions"]]

pd.DataFrame.from_dict(results, orient = "index", columns=['Value'])

Unnamed: 0,Value
score,26.177519
counts,"[2946, 1819, 1208, 665]"
totals,"[4160, 4159, 4158, 4157]"
precisions,"[70.82, 43.74, 29.05, 16.0]"
bp,0.755747
sys_len,4160
ref_len,5325


In [53]:
# backbone BLEU
bleu_metric = datasets.load_metric("sacrebleu")

bleu_metric.add(
    prediction = backbone_data['backbone_A'], reference = [backbone_data['A']]
)

results = bleu_metric.compute(smooth_method = 'floor', smooth_value=0)
results['precisions'] =[np.round(p,2) for p in results["precisions"]]

pd.DataFrame.from_dict(results, orient = "index", columns=['Value'])

  bleu_metric = datasets.load_metric("sacrebleu")


Unnamed: 0,Value
score,12.932429
counts,"[2974, 1689, 894, 218]"
totals,"[7693, 7692, 7691, 7690]"
precisions,"[38.66, 21.96, 11.62, 2.83]"
bp,1.0
sys_len,7693
ref_len,5414


### Evaluation 4. SBERT embedding consine similarity

In [41]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/227.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (1

In [54]:
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import urllib.request
from sentence_transformers import SentenceTransformer

In [55]:
model = SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')



In [63]:
data['embedding_A'] = data.apply(lambda row: model.encode(row.A), axis = 1)

In [57]:
backbone_data['embedding_A'] = backbone_data.apply(lambda row: model.encode(row.A), axis = 1)

In [49]:
data['embedding_A']

0      [-0.00094759156, 0.5453078, 1.327793, -0.04977...
1      [-0.23273303, 0.021193853, 0.60252374, -0.1041...
2      [0.0996149, 0.004175628, 1.2044848, 0.05386993...
3      [-0.39839798, 0.3497682, 1.0311565, -0.0601119...
4      [-0.22646335, -0.116500124, 0.67873293, -0.065...
                             ...                        
446    [0.07032968, 0.07452257, 1.2256978, -0.0596912...
447    [0.013269145, 0.16201328, 1.2183927, -0.058380...
448    [-0.08044103, 0.25678623, 1.1480744, -0.141689...
449    [0.059935257, 0.0025421227, 1.3127531, -0.0310...
450    [0.06477788, 0.050976604, 0.90936667, 0.008041...
Name: embedding_A, Length: 451, dtype: object

In [50]:
data['embedding_model_a'] = data.apply(lambda row: model.encode(row.model_a), axis = 1)

In [58]:
backbone_data['embedding_model_a'] = backbone_data.apply(lambda row: model.encode(row.backbone_A), axis = 1)

In [51]:
data['embedding_model_a']

0      [-0.0043686843, 0.05743939, 1.9542196, 0.01734...
1      [-0.20516677, 0.0075466847, 0.70979804, -0.066...
2      [0.15914571, 0.07258917, 1.327937, -0.03283118...
3      [0.15696357, 0.183639, 1.9083247, -0.17198654,...
4      [-0.35512334, 0.20431882, 0.75746125, -0.01965...
                             ...                        
446    [0.07109293, -0.068285756, 1.0275587, 0.030345...
447    [0.093325965, 0.1265273, 1.1427157, -0.0297250...
448    [0.14900412, 0.06431364, 1.2363118, -0.0139227...
449    [0.108730756, 0.05379166, 1.5214031, -0.001502...
450    [0.08866086, 0.113103665, 1.0566113, -0.042482...
Name: embedding_model_a, Length: 451, dtype: object

In [59]:
def cos_sim(A, B):
  return dot(A, B)/(norm(A)*norm(B))

In [54]:
cos_sim = cos_sim(data['embedding_model_a'], data['embedding_A'])

In [55]:
# finetuned SBERT
np.mean(cos_sim)

0.5179476

In [61]:
# backbone SBERT
cos_sim = cos_sim(backbone_data['embedding_model_a'], backbone_data['embedding_A'])
np.mean(cos_sim)

0.4549444