In [1]:
from tqdm import tqdm
import pandas as pd
import sys
import time
import yaml
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

from utils.prompter import Prompter

def inference(MODEL, final_prompt, pipe=False):
    # 데이터 불러온 뒤 멀티 프롬프트 적용
    df = pd.read_csv('../notebooks/data/test_multi.csv')
    new_df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

    prompter = Prompter('multi')
    lst = []
    for i in new_df['instruction']:
        txt = prompter.generate_prompt(i)
        lst.append(txt)

    df = pd.DataFrame(lst, columns = ['input'])
    messages = df['input'].to_list()

    # 모델 불러온 후 프롬프트화 된 질문에 대한 응답 생성
    model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    ).to(device=f"cuda", non_blocking=True)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model.eval()

    def gen(x):
        inputs = tokenizer(
            x, 
            return_tensors='pt',
            return_token_type_ids=False
        )
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()} 
        gened = model.generate(
            **inputs, 
            max_new_tokens=512,
            early_stopping=True,
            do_sample=True,
            eos_token_id=2,
        )
        gened_list = gened[0].tolist()
        try:
            eos_token_position = gened_list.index(2)
            gened_list = gened_list[:eos_token_position]
        except ValueError:
            pass
        
        return tokenizer.decode(gened_list)

    lst = []
    if pipe:
        generator = pipeline("text-generation", model=model, tokenizer = tokenizer, device=0) 
        for message in tqdm(messages):
            # generate response
            res = generator(message, max_length=512, do_sample=True, eos_token_id=2)
            output = res[0]['generated_text']
            lst.append(output)
            print('##################################################################')
    else:
        for message in tqdm(messages):
            lst.append(gen(message))
            print('##################################################################')
    print('응답 생성 끝!!')

    # 생성된 응답과 질문을 함께 GPT-4에 복사-붙여넣기 할 프롬프트에 넣고 저장
    final_lst = []
    for res in lst:
        instruction = '\n'.join(res.split('\n')[2:12])
        response = res.split('### 응답:')[-1].strip()

        a = final_prompt.format(instruction=instruction, response=response)
        final_lst.append(a)
    
    df = pd.DataFrame(final_lst, columns = ['GPT4_prompt'])

    return df

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
final_prompt = """두 사람 간의 대화가 주어집니다. 다음의 지시문(Instruction)을 받게 될 것입니다. 지시문은 이전 대화내용을 포함하며 현재 대화에 대한 응답(Response)이 제시됩니다.
당신의 작업은 응답을 평가 단계에 따라 응답을 평가하는 것입니다. 
이 평가 기준을 꼼꼼히 읽고 이해하는 것이 중요합니다. 평가하는 동안 이 문서를 계속 열어두고 필요할 때 참조해 주세요. 

평가 기준:
- 이해 가능성 (0 - 1): Instruction에 기반하여 Response를 이해 할 수 있나요?
- 자연스러움 (1 - 5): 사람이 자연스럽게 말할 법한 Instruction 인가요?
- 맥락 유지 (1 - 5): Instruction을 고려했을 때 Response가 맥락을 유지하나요?
- 흥미롭기 (1 - 5): Response가 지루한가요, 아니면 흥미로운가요?
- Instruction 사용 (0 - 1): Instruction에 기반하여 Response를 생성 했나요?
- 공감 능력 (0 - 1): Response에 Instruction의 내용에 기반한 공감의 내용이 있나요?
- 대화 유도 (0 - 1): Response에 질문을 포함하여 사용자의 대답을 자연스럽게 유도하고 있나요?
- 전반적인 품질 (1 - 10): 위의 답변을 바탕으로 이 발언의 전반적인 품질에 대한 인상은 어떤가요?

평가 단계:
1. Instruction, 그리고 Response을 주의깊게 읽습니다.
2. 위의 평가 기준에 따라 Response을 엄격하게 평가합니다.

Instruction:
{instruction}

Response:
{response}


Result
- 이해 가능성 (0 - 1):
- 자연스러움 (1 - 5):
- 맥락 유지 (1 - 5):
- 흥미롭기 (1 - 5):
- Instruction 사용 (0 - 1):
- 공감 능력 (0 - 1)
- 대화 유도 (0 - 1): 
- 전반적인 품질 (1 - 10):"""

### 1. 멀티 KULLM

In [36]:
result = inference('nlpai-lab/kullm-polyglot-12.8b-v2', final_prompt)

Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.67s/it]
  0%|          | 0/50 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▏         | 1/50 [00:03<02:55,  3.59s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


  4%|▍         | 2/50 [00:12<05:22,  6.72s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


  6%|▌         | 3/50 [00:18<05:01,  6.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


  8%|▊         | 4/50 [00:25<05:00,  6.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 10%|█         | 5/50 [00:28<03:58,  5.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 12%|█▏        | 6/50 [00:31<03:21,  4.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 14%|█▍        | 7/50 [00:36<03:24,  4.75s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 16%|█▌        | 8/50 [00:41<03:16,  4.69s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 18%|█▊        | 9/50 [00:46<03:24,  4.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 20%|██        | 10/50 [00:54<03:50,  5.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 22%|██▏       | 11/50 [00:59<03:39,  5.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 24%|██▍       | 12/50 [01:01<02:54,  4.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 26%|██▌       | 13/50 [01:03<02:19,  3.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 28%|██▊       | 14/50 [01:07<02:16,  3.80s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 30%|███       | 15/50 [01:10<02:05,  3.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 32%|███▏      | 16/50 [01:17<02:36,  4.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 34%|███▍      | 17/50 [01:20<02:12,  4.03s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 36%|███▌      | 18/50 [01:27<02:36,  4.88s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 38%|███▊      | 19/50 [01:33<02:43,  5.29s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 40%|████      | 20/50 [01:37<02:25,  4.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 42%|████▏     | 21/50 [01:44<02:41,  5.58s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 44%|████▍     | 22/50 [01:53<03:03,  6.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 46%|████▌     | 23/50 [02:00<03:04,  6.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 48%|████▊     | 24/50 [02:06<02:45,  6.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 50%|█████     | 25/50 [02:11<02:31,  6.08s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 52%|█████▏    | 26/50 [02:14<02:00,  5.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 54%|█████▍    | 27/50 [02:24<02:30,  6.55s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 56%|█████▌    | 28/50 [02:30<02:25,  6.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 58%|█████▊    | 29/50 [02:32<01:47,  5.11s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 60%|██████    | 30/50 [02:37<01:43,  5.16s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 62%|██████▏   | 31/50 [02:42<01:33,  4.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 64%|██████▍   | 32/50 [02:48<01:38,  5.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 66%|██████▌   | 33/50 [02:55<01:37,  5.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 68%|██████▊   | 34/50 [03:00<01:27,  5.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 70%|███████   | 35/50 [03:08<01:36,  6.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 72%|███████▏  | 36/50 [03:13<01:22,  5.90s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 74%|███████▍  | 37/50 [03:20<01:19,  6.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 76%|███████▌  | 38/50 [03:25<01:11,  6.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 78%|███████▊  | 39/50 [03:29<00:59,  5.40s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 80%|████████  | 40/50 [03:37<01:00,  6.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 82%|████████▏ | 41/50 [03:41<00:49,  5.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 84%|████████▍ | 42/50 [03:47<00:44,  5.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 86%|████████▌ | 43/50 [04:18<01:33, 13.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 88%|████████▊ | 44/50 [04:22<01:03, 10.53s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 90%|█████████ | 45/50 [04:23<00:38,  7.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 92%|█████████▏| 46/50 [04:30<00:29,  7.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 94%|█████████▍| 47/50 [04:33<00:18,  6.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 96%|█████████▌| 48/50 [04:43<00:14,  7.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


 98%|█████████▊| 49/50 [04:47<00:06,  6.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


##################################################################


100%|██████████| 50/50 [04:53<00:00,  5.88s/it]

##################################################################
응답 생성 끝!!





In [40]:
result.to_csv('../notebooks/evaluate_datas/KULLM_eval.csv')

In [2]:
import pandas as pd

koalpaca_after = pd.read_csv('../notebooks/evaluate_datas/Koalpaca_after.csv')

In [17]:
float(koalpaca_after['output'][0].split('\n')[5].split(':')[1])

1.0

### 2. 멀티 Koalpaca

In [None]:
result = inference('beomi/KoAlpaca-Polyglot-12.8B', final_prompt)

In [5]:
result.to_csv('../notebooks/evaluate_data/Koalpaca_eval.csv')

In [22]:
df = pd.read_csv('../notebooks/evaluate_datas/Koalpaca_after.csv')

In [None]:
import re

def get_scores(eval_result: str) -> list:
    score_list = re.findall(r': (\d+\.?\d*)', eval_result)
    return list(map(float, score_list))
column_names = [
    'understandable',
    'natural',
    'maintains_context',
    'interesting',
    'uses_knowledge',
    'empathy',
    'conversational',
    'overall_quality'
]
df[column_names] = pd.DataFrame(df['output'].apply(get_scores).to_list(), index=df.index)
df

In [27]:
avg_scores = df[column_names].mean()
avg_scores

understandable       0.788
natural              3.810
maintains_context    3.800
interesting          3.020
uses_knowledge       0.750
empathy              0.720
conversational       0.500
overall_quality      6.200
dtype: float64

In [28]:
df.to_csv('../notebooks/evaluate_datas/Koalpaca.csv', index=False)

### 3. KoVicuna


In [1]:
from tqdm import tqdm
import pandas as pd
import sys
import time
import yaml
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

from utils.prompter import Prompter

def inference_kovicuna(final_prompt):
    # 데이터 불러온 뒤 멀티 프롬프트 적용
    multi_df = pd.read_csv('../notebooks/evaluate_datas/test_multi.csv')
    single_df = pd.read_csv('../notebooks/evaluate_datas/test_single.csv')

    # 멀티-턴 데이터 입력을 위한 프롬프트 붙이기(정보 명시적으로 추가해주기)
    prompter = Prompter('multi')
    lst = []
    for i in multi_df['instruction']:
        txt = prompter.generate_prompt(i)
        lst.append(txt)

    multi_inputs = pd.DataFrame(lst, columns = ['input'])['input'].to_list()
    single_inputs = single_df['instruction'].to_list()

    print('문장 준비 완료!')

    # 모델 불러온 후 질문에 대한 응답 생성
    print('모델 불러오는 중....')
    model = AutoModelForCausalLM.from_pretrained("junelee/ko_vicuna_7b")
    tokenizer = AutoTokenizer.from_pretrained("junelee/ko_vicuna_7b", unk_token="<unk>",bos_token="<s>", eos_token="</s>")
    model.eval()
    generator = pipeline("text-generation", model=model, tokenizer = tokenizer, device=0, max_new_tokens = 512)
    print('모델 준비 완료!!')

    
    print('멀티-턴 생성 시작')
    lst = []
    for message in tqdm(multi_inputs):
        res = generator(message, do_sample=True, eos_token_id=2)
        output = res[0]['generated_text']
        lst.append(output)
        print('##################################################################')
    print('멀티-턴 응답 생성 끝!!')

    # 생성된 응답과 질문을 함께 GPT-4에 복사-붙여넣기 할 프롬프트에 넣고 저장
    final_lst = []
    for i, res in enumerate(lst):
        instruction = multi_df.loc[i, 'instruction']
        response = res.split('### 응답:')[-1].strip()

        a = final_prompt.format(instruction=instruction, response=response)
        final_lst.append(a)
    
    multi = pd.DataFrame(final_lst, columns = ['GPT4_prompt'])


    print('싱글-턴 생성 시작')
    lst = []
    for message in tqdm(single_inputs):
        res = generator(message, do_sample=True, eos_token_id=2)
        output = res[0]['generated_text']
        lst.append(output)
        print('##################################################################')
    print('싱글-턴 응답 생성 끝!!')

    # 생성된 응답과 질문을 함께 GPT-4에 복사-붙여넣기 할 프롬프트에 넣고 저장
    final_lst = []
    for i, res in enumerate(lst):
        instruction = single_df.loc[i, 'instruction']
        response = res.split('### 응답:')[-1].strip()

        a = final_prompt.format(instruction=instruction, response=response)
        final_lst.append(a)
    
    single = pd.DataFrame(final_lst, columns = ['GPT4_prompt'])

    return multi, single

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
multi, single = inference_kovicuna(final_prompt)

In [17]:
multi.to_csv('../notebooks/evaluate_datas/Kovicuna_multi_eval.csv')
single.to_csv('../notebooks/evaluate_datas/Kovicuna_single_eval.csv')

### 최종 모델 확인

In [20]:
import pandas as pd

final_multi = pd.read_csv('../notebooks/evaluate_datas/eval_0727_multi.csv', encoding='cp949')
final_single = pd.read_csv('../notebooks/evaluate_datas/eval_0727_single.csv', encoding='cp949')

In [22]:
# 멀티
import re

def get_scores(eval_result: str) -> list:
    score_list = re.findall(r': (\d+\.?\d*)', eval_result)
    return list(map(float, score_list))

column_names = [
    'understandable',
    'natural',
    'maintains_context',
    'interesting',
    'uses_knowledge',
    'empathy',
    'conversational',
    'overall_quality'
]
final_single[column_names] = pd.DataFrame(final_single['output'].apply(get_scores).to_list(), index=final_single.index)

avg_scores = final_single[column_names].mean()
avg_scores

understandable       0.97
natural              4.34
maintains_context    4.82
interesting          2.76
uses_knowledge       0.97
empathy              0.91
conversational       0.82
overall_quality      7.30
dtype: float64

In [None]:
final_multi

In [23]:
final_multi.to_csv('../notebooks/evaluate_datas/final_multi.csv', index=False)
final_single.to_csv('../notebooks/evaluate_datas/final_single.csv', index=False)