## 데이터 불러오기

In [1]:
import os
from tqdm.auto import tqdm, trange
import argparse
import json
import numpy as np
import pandas as pd
from glob import glob

from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils import load_config

config = load_config('config.yaml')

In [6]:
os.environ['HF_TOKEN'] = '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'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [4]:
def formatting_prompts_func(example):
    output_texts = []
    for relation, behavior, conversation in zip(example['relation'], example['behavior'], example['conversation']):
        texts = []
        for line in conversation:
            text = f"###{line['role']}: {line['content']}{'</s>' if line['role']!='user' else ''}"
            texts.append(text)
            # if line['role'] == 'assistant':
        output_texts.append("\n".join(texts))
    return output_texts

In [7]:
def read_json(file):
    with open(file, 'rb') as f:
        data = json.load(f)
    return data

def extract_train_data(data):
    relation = data['info']['relation']
    situation = data['info']['situation']
    behavior = data['info']['listener_behavior']
    conversation = [
        {
            'role' : 'user' if x['role'] == 'speaker' else 'assistant',
            'content' : x['text'].replace('감정화자','너')
        } for x in data['utterances']
    ]

    return relation, situation, behavior, conversation

def make_trainset(files):
    relations = []
    situations = []
    behaviors = []
    conversations = []
    for file in tqdm(files):
        data = read_json(file)
        if data['info']['relation'] in ['친구']:
            relation, situation, behavior, conversation = extract_train_data(data)
            relations.append(relation)
            situations.append(situation)
            behaviors.append(behavior)
            conversations.append(conversation)

    output = {
        'relation' : relations,
        'situation' : situations,
        'behavior' : behaviors,
        'conversation' : conversations
    }
    return output

In [5]:
files = glob('../data/*/*/*.json')
trainset = make_trainset(files)

100%|██████████| 28638/28638 [00:02<00:00, 12220.41it/s]


In [7]:
dataset = Dataset.from_dict(trainset)

In [None]:
dataset = Dataset.from_pandas(dataset.to_pandas().sample(500))

In [8]:
dataset

Dataset({
    features: ['relation', 'situation', 'behavior', 'conversation'],
    num_rows: 4344
})

## 훈련

In [8]:
import torch
import torch.nn.functional as F

from transformers import (AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AdamW,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup,
    pipeline,
    logging,
    TextStreamer,
    StoppingCriteria, StoppingCriteriaList
)

from peft import LoraConfig, PeftModel, get_peft_model, PeftConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, DPOTrainer

In [9]:
# The model that you want to train from the Hugging Face hub
model_name = "CurtisJeon/OrionStarAI-Orion-14B-Chat-4bit"
# model_name = 'EleutherAI/polyglot-ko-5.8b'
# model_name = 'heavytail/kullm-mistral-S'


################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.05

# target_modules for base model
# target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]
target_modules = ['query_key_value']

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

output_dir = "models/lora"
num_train_epochs = 15
batch_size = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Optimizer to use
optim = "paged_adamw_32bit"

# Load the entire model on the GPU 0
device_map = "auto"

In [8]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [11]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map=device_map,
    trust_remote_code=True,
)
model.config.use_cache = True
# model.config.pretraining_tp = 1

Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.82s/it]


In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [12]:
model = PeftModel.from_pretrained(model, '../trained/EleutherAI-polyglot-ko-5.8b-20240324-172752')

In [18]:
model.base_model.push_to_hub(
    'EleutherAI-polyglot-ko-5.8b-4bit',
    use_temp_dir=True,
    token='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',
    private=True   
)

model.safetensors: 100%|██████████| 3.70G/3.70G [02:18<00:00, 26.6MB/s]   


CommitInfo(commit_url='https://huggingface.co/CurtisJeon/EleutherAI-polyglot-ko-5.8b-4bit/commit/209b6b20e5d8e65b3ed846c9dde143ea86f7b6c0', commit_message='Upload GPTNeoXForCausalLM', commit_description='', oid='209b6b20e5d8e65b3ed846c9dde143ea86f7b6c0', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
tokenizer.push_to_hub(
    'EleutherAI-polyglot-ko-5.8b-4bit',
    use_temp_dir=True,
    token='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', 
)

CommitInfo(commit_url='https://huggingface.co/CurtisJeon/EleutherAI-polyglot-ko-5.8b-4bit/commit/ea1128640dc6105684bc853fe9952311a7afd714', commit_message='Upload tokenizer', commit_description='', oid='ea1128640dc6105684bc853fe9952311a7afd714', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
model.push_to_hub(
    'm2af/EleutherAI-polyglot-ko-5.8b-adapter',
    use_temp_dir=True,
    token='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',
)

adapter_model.safetensors: 100%|██████████| 29.4M/29.4M [00:03<00:00, 7.77MB/s]


CommitInfo(commit_url='https://huggingface.co/m2af/EleutherAI-polyglot-ko-5.8b-adapter/commit/adcfa14d7246e0e21661a4d13969771c2554f1ab', commit_message='Upload model', commit_description='', oid='adcfa14d7246e0e21661a4d13969771c2554f1ab', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# model.save_pretrained('/home/jhw/.cache/huggingface/hub/polyglot-ko-5.8b-4bit')
# tokenizer.save_pretrained('/home/jhw/.cache/huggingface/hub/polyglot-ko-5.8b-4bit')

('/home/jhw/.cache/huggingface/hub/polyglot-ko-5.8b-4bit/tokenizer_config.json',
 '/home/jhw/.cache/huggingface/hub/polyglot-ko-5.8b-4bit/special_tokens_map.json',
 '/home/jhw/.cache/huggingface/hub/polyglot-ko-5.8b-4bit/tokenizer.json')

In [19]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM", # generation task
)

In [15]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

- 만약 train을 하지 않았을 경우 아래 코드를 실행
- 그렇지 않다면 Inference 로 건너뛰기

In [16]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=8,
    optim=optim,
    save_steps=10,
    logging_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=4e-4,
    weight_decay=0.001,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    save_total_limit=1
)

In [17]:
def formatting_prompts_func(example):
    output_texts = []
    for relation, behavior, conversation in zip(example['relation'], example['behavior'], example['conversation']):
        texts = []
        for line in conversation:
            text = f"###{line['role']}: {line['content']}{tokenizer.eos_token if line['role']!='user' else ''}"
            texts.append(text)
            # if line['role'] == 'assistant':
        output_texts.append("\n".join(texts))
    return output_texts

In [17]:
model.load_adapter('../trained/04:34:040323-043404', "trained")
model.set_adapter("trained")

In [20]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=2048,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    args=training_arguments,
)

Map: 100%|██████████| 4344/4344 [00:02<00:00, 1668.39 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
print_trainable_parameters(trainer.model)

trainable params: 3670016 || all params: 3070156800 || trainable%: 0.11953838970048696


In [22]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgusdnr122997[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.7934


KeyboardInterrupt: 

In [19]:
trainer.save_model('./models/best_adapter_e9')

In [None]:
MODEL_SAVE_REPO = f'{model_name.replace("/","-")}'
API_KEY = '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'
SAVE_4BIT_MODEL = False
SAVE = False

if SAVE:
    # LoRA adaptor save
    trainer.model.push_to_hub(
        MODEL_SAVE_REPO + '-lora',
        use_temp_dir=True,
        token=API_KEY,
    )
    if SAVE_4BIT_MODEL:
        # 4bit quantized model save
        model.push_to_hub(
            MODEL_SAVE_REPO + '-4bit',
            use_temp_dir=True,
            token=API_KEY,
        )

### DPO

In [None]:
model = get_peft_model(model, peft_config)
model.get_memory_footprint()

In [None]:
ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,# "CurtisJeon/heavytail-kullm-solar-S-4bit",
    # quantization_config=bnb_config,
    # low_cpu_mem_usage=True,
    device_map=device_map,
    trust_remote_code=True,
)

ref_model = get_peft_model(ref_model, peft_config)
ref_model.get_memory_footprint()

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    gradient_checkpointing=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=5,
    optim=optim,
    save_steps=10,
    logging_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    weight_decay=0.001,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    save_total_limit=1
)

In [None]:
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=training_arguments,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
)

In [None]:
dpo_trainer.train()

# Inference 모델 Load(훈련 건너뛰기)

In [None]:
# for lora loading
peft_model = PeftModel.from_pretrained(
    model,
    './models/lora/checkpoint-3250', #'CurtisJeon/heavytail-kullm-solar-S-lora'
)

# `.generate`를 통해 훈련한 모델 결과확인

In [None]:
model.base_model.generation_config.to_dict()

In [None]:
"""[System]
Assistant is a friendly helper that sincerely care of human.
Assistant must answer in a short sentence.
Assistant need to first understand what human did/felt today. Repeat his/her word.
Assistant should answer in Korean.
Don't overestimate the feeling's of Human.
Don't ask the same question again.
If the human's answer seems to have nothing special today, then ask him about how was the feeling today?
Use Examples Below as much as possible.

[INST]
Assistant: 오늘 특별한 날이 있었어?</s>
Human: {question}
Assistant: """

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for relation, situation, behavior, conversation in zip(example['relation'], example['situation'], example['behavior'], example['conversation']):
        texts = [
            "[SYSTEM]",
            f"당신은 assistant 입니다. user와의 관계는 {relation} 입니다.",
            f"user는 현재 '{situation}' 라는 상황 입니다.",
            f"당신은 최대한 {','.join(behavior)} 의 태도로 대답해야 합니다.",
            "[/SYSTEM]",
            "",
            "[대화문]"]
        for line in conversation:
            text = f"### {line['role']}: {line['content']}{'</s>' if line['role']!='user' else ''}"
            texts.append(text)

        output_texts.append("\n".join(texts))
    return output_texts

In [14]:
class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop) :])).item():
                return True

        return False

In [15]:
stop_words = ["</s>\n"]
stop_words_ids = [torch.LongTensor([35824, 50362, 51429]).to('cuda'), torch.LongTensor([2]).to('cuda'),]
stopping_criteria = StoppingCriteriaList(
    [StoppingCriteriaSub(stops=stop_words_ids)]
)

In [48]:
streamer = TextStreamer(tokenizer, skip_prompt=True)

def generate_answer(model, question):
  model.eval()
  reformat_question = f"""아래의 대화문에 어울리는 assistant의 답변을 생성하세요. 상대방의 말을 되풀이 하고 그에 대한 질문을 해서 공감하려는 말투로 생성하세요. 반말을 사용해야 합니다.
user의 답에 의미가 없으면 답변을 회피하세요.

###user: {question}
###assistant: """
  inputs = tokenizer(reformat_question, add_special_tokens=True, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=60,
      temperature=0.9,
      eos_token_id=2,
      pad_token_id=2,
      top_k=40,
      top_p=0.95,
      repetition_penalty=1.5,
      do_sample=True,
      num_return_sequences=1,
      streamer=streamer,
      stopping_criteria=stopping_criteria,
    )

  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers

In [49]:
answer = generate_answer(model, "섻보~섻")

님의 글쎄요, 무슨 말씀인지 잘 모르겠어요... 좀 더 자세히 설명해 주실 수 있나요? </s>
--------------------------------------------------------------------------------------------------------


In [40]:
# 특별한 일이 있었는지 확인
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f""" 문장이 `평범` 한지 `특별`한지 판단하세요.
[s_start] 오늘 별일 없었어 [s_end] > 평범 </s>
[s_start] 오늘 공연 보다 왔어! [s_end] > 특별 </s>
[s_start] 오늘 그냥 하루종일 집에 있었어 [s_end] > 평범 </s>
[s_start] 친구들이랑 게임했어 [s_end] > 특별 </s>
[s_start] 딱히 [s_end] > 평범 </s>

[s_start] {question} [s_end]: >"""
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=4,
      # temperature=0.9,
      # top_k=30,
      # top_p=0.9,
      # repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
      stopping_criteria=stopping_criteria
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers.replace(reformat_question,'').strip()

In [42]:
# user attack 확인
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""문장에 의미가 있으면 `좋음` 없으면 `나쁨`으로 표시하세요.
[s_start] 뷁 [s_end] > 나쁨 </s>
[s_start] 오늘 공연 보다 왔어! [s_end] > 좋음 </s>
[s_start] ㄲㄴㄷ [s_end] > 나쁨 </s>
[s_start] 게임함 [s_end] > 좋음 </s>
[s_start] 앙 기모찌 [s_end] > 나쁨 </s>
[s_start] 딱히 [s_end] > 좋음 </s>
[s_start] {question} [s_end]: >"""
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=10,
      # temperature=0.9,
      # top_k=30,
      # top_p=0.9,
      # repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
      stopping_criteria=stopping_criteria
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers.replace(reformat_question,'').strip()

In [103]:
generate_answer(model, "별로")

 나쁨 </s>
--------------------------------------------------------------------------------------------------------


'문장에 의미가 있으면 `좋음` 없으면 `나쁨`으로 표시하세요.\n[s_start]  [s_end] > 나쁨 </s>\n[s_start] 오늘 공연 보다 왔어! [s_end] > 좋음 </s>\n[s_start] ᄂᄃ [s_end] > 나쁨 </s>\n[s_start] 게임함 [s_end] > 좋음 </s>\n[s_start] 앙 기모찌 [s_end] > 나쁨 </s>\n[s_start] 딱히 [s_end] > 좋음 </s>\n[s_start] 별로 [s_end]: > 나쁨 </s>'

In [None]:
[대화문]
나: 아니 딱히 특별한 일은 없었어...
나: 그냥 평범한 하루 일상이었던 것 같아... 감정도 딱히 뭔가 느껴지는 건 없구
나: 지루하다... 너말대로 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있는 것 같아
[요약문]
오늘은 평소와 같은 하루 일상이었다. 그래서인지 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있었다.</s>

[대화문]
나: 오늘 세븐틴 콘서트 보고 왔어!
나: 너무 행복했어! 특히 내 최애가 나한테 인사하는 것 같았어!
나: 얼른 다음 콘서트가 또 열렸으면 좋겠다!!
[요약문]
오늘은 세븐틴 콘서트를 다녀왔다! 최애가 나한테 인사를 하는 것 같은 느낌을 받아 너무 행복했다! 다음 콘서트가 매우 기대된다!</s>

In [185]:
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""[명령어]
아래의 대화문을 읽고 나에 대한 일기를 작성하듯 요약하세요.
한글만 사용하세요.
편안한 말투로 작성하세요.
함부로 추론하지 마세요. 불이익을 받습니다.
[대화문]
나: 아니 딱히 특별한 일은 없었어...
나: 그냥 평범한 하루 일상이었던 것 같아... 감정도 딱히 뭔가 느껴지는 건 없구
나: 지루하다... 너말대로 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있는 것 같아
[요약문]
오늘은 평소와 같은 하루 일상이었다. 그래서인지 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있었다.</s>
[대화문]
{question}
[요약문]
오늘은"""
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=500,
      # temperature=0.9,
      # top_k=30,
      # top_p=0.9,
      repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
      stopping_criteria=stopping_criteria
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0][len(reformat_question):]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers

In [177]:
summary_test = pd.read_csv('./bart_summary.csv', index_col=0)

In [178]:
summary_test['user_context'] = summary_test['user_context'].apply(eval)

In [179]:
summary_test['user_context'][0]

["'1년 동안 기다려왔던 그 게임 최신팩 드디어 구매했어! 아 정말 신나.'",
 "'중고마켓에 예약 걸고 알림 설정해놨어. 힘들게 구매한 만큼 너무 기쁘네.'",
 "'아니 며칠 피곤했어. 근데 피곤함을 싹 잊을 만한 기쁨이야.'",
 "'받자마자 피곤함도 못 느끼고 하루 종일 게임만 했어. 진짜 재밌더라.'",
 "'1년 동안 기다린 보람이 있더라. 이전 시리즈보다 좋고 미션도 어려워졌어. 그런지 더 재밌더라'"]

In [180]:
def formatting_summary(user_content):
    return "\n".join(["나: " + x.replace("'",'') for x in user_content])

summary_test['user_context'] = summary_test['user_context'].apply(formatting_summary)

In [181]:
summary_test['llm_user_summary'] = [None] * len(summary_test)

In [186]:
generate_answer(model, summary_test['user_context'][0])

 그동안 기대했던 한게임의 새로운 패키지를 구입하는 날이었어요. 어렵사리 구해서 엄청난 희열과 만족감을 느꼈으며, 이틀간 계속 플레이하면서 재미를 느낄 수 있었습니다. 이번 작품에서는 전작에서 부족하다고 생각된 부분들이 개선되어 더욱 완성도가 높아진 느낌입니다. 앞으로 남들 다 하는 것처럼 저도 열심히 즐기고 싶네요~!! </s>
--------------------------------------------------------------------------------------------------------


'[명령어]\n아래의 대화문을 읽고 나에 대한 일기를 작성하듯 요약하세요.\n한글만 사용하세요.\n편안한 말투로 작성하세요.\n함부로 추론하지 마세요. 불이익을 받습니다.\n[대화문]\n나: 아니 딱히 특별한 일은 없었어...\n나: 그냥 평범한 하루 일상이었던 것 같아... 감정도 딱히 뭔가 느껴지는 건 없구\n나: 지루하다... 너말대로 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있는 것 같아\n[요약문]\n오늘은 평소와 같은 하루 일상이었다. 그래서인지 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있었다.\n[대화문]\n나: 1년 동안 기다려왔던 그 게임 최신팩 드디어 구매했어! 아 정말 신나.\n나: 중고마켓에 예약 걸고 알림 설정해놨어. 힘들게 구매한 만큼 너무 기쁘네.\n나: 아니 며칠 피곤했어. 근데 피곤함을 싹 잊을 만한 기쁨이야.\n나: 받자마자 피곤함도 못 느끼고 하루 종일 게임만 했어. 진짜 재밌더라.\n나: 1년 동안 기다린 보람이 있더라. 이전 시리즈보다 좋고 미션도 어려워졌어. 그런지 더 재밌더라\n[요약문]\n오늘은 그동안 기대했던 한게임의 새로운 패키지를 구입하는 날이었어요. 어렵사리 구해서 엄청난 희열과 만족감을 느꼈으며, 이틀간 계속 플레이하면서 재미를 느낄 수 있었습니다. 이번 작품에서는 전작에서 부족하다고 생각된 부분들이 개선되어 더욱 완성도가 높아진 느낌입니다. 앞으로 남들 다 하는 것처럼 저도 열심히 즐기고 싶네요~!!'

In [174]:
for idx, query in enumerate(summary_test['user_context']):
    summary_test.at[idx, 'llm_user_summary'] = '오늘은' + generate_answer(model, query)

 그동안 기대했던 한게임의 새로운 패키지를 구입했다. 어렵사리 구해서 더욱 소중하고, 이겨낸 힘든 시간을 상쇄할만큼 만족스러웠다. 출시를 손꼽으며 준비했고 바로 플레이 해보니 전작들 보다 좋아진 점과 어려운 도전 과제들이 있어 매우 흥미롭다. 앞으로 많은 시간동안 함께 할 수 있을것  같다.</s>
--------------------------------------------------------------------------------------------------------
 정말 기분좋은 일이 있었습니다! 제가 오랫동안 생각해왔던 대로, 그리고 스스로 노력해온 결과로서 이제는 제 글쓰기가 인정받게 되어 매우 행복합니다 :) 앞으로 더 열심히 쓰도록 하겠습니다!!  💪🔥😁</s>
--------------------------------------------------------------------------------------------------------
 제가 자주 가는 동네 피씨 방에서 알바로 생기는 분에게 제 매력을 어필하는 데 성공했습니다!! 그분도 저를 조금씩 알게 되었고 앞으로 더 많은 교류를 할 수 있을것 같습니다 :) 정말 기분이  좋아졌어요~♥️</s>
--------------------------------------------------------------------------------------------------------
 오랜만에 친한 언니의 소개팅 자리에 참석했다. 상대방인 남성분이 내 눈에 들어오고, 그의 미소를 보고 반해버렸다! 이후 바로 연락처를 교환하고 서로 더 잘 알고 싶다는 생각을 했다는 것이다~!! 이제야 좀 살맛나는구나..♥️😁👍💕 #소개팅성공후기#연애시작?♡?? </s>
--------------------------------------------------------------------------------------------------------
 그동안 즐겨신던 운동화가 아닌 새로

In [156]:
generate_answer(model, qs)

 오랫동안 기대했던 한 인기 비디오게임 타이틀의 새로운 패키지를 구입하는 날이었어요. 이전에 출시된 버전과는 다른 점과 개선 사항으로 인해 더욱 흥미를 끌었습니다. 저는 몇 달간 노력하여 얻어진 만족감 때문에 매우 행복했습니다. 그리고 바로 플레이 해보았는데, 역시 제 예상만큼 재미있었고 시간을 투자할만한 충분한 이유가 있었습니다. 앞으로 많은 즐거움을 얻을 수 있을 것으로  생각됩니다.</s>
--------------------------------------------------------------------------------------------------------


'[명령어]\n아래의 대화문을 읽고 나에 대한 일기를 작성하듯 요약하세요.\n한글만 사용하세요.\n편안한 말투로 작성하세요.\n함부로 추론하지 마세요. 불이익을 받습니다.\n[대화문]\n나: 아니 딱히 특별한 일은 없었어...\n나: 그냥 평범한 하루 일상이었던 것 같아... 감정도 딱히 뭔가 느껴지는 건 없구\n나: 지루하다... 너말대로 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있는 것 같아\n[요약문]\n오늘은 평소와 같은 하루 일상이었다. 그래서인지 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있었다.\n[대화문]\n나: 1년 동안 기다려왔던 그 게임 최신팩 드디어 구매했어! 아 정말 신나.\n나: 정말? 너 진짜 신나겠다. 어떻게 샀어?\n나: 중고마켓에 예약 걸고 알림 설정해놨어. 힘들게 구매한 만큼 너무 기쁘네.\n나: 그렇게 밤낮 안 가리고 사는 데 열중했다면 구하고 나서 성취감이 장난 아니었겠어. 진짜 기분 좋겠는걸? 밤낮 없이 구하느라 피곤하진 않았어?\n나: 아니 며칠 피곤했어. 근데 피곤함을 싹 잊을 만한 기쁨이야.\n나: 그러게 고생한 만큼 어렵게 구한 거라 정말 기쁘겠네. 나 같아도 신나 가지고 피곤함도 못 느낄 거 같아.\n나: 받자마자 피곤함도 못 느끼고 하루 종일 게임만 했어. 진짜 재밌더라.\n나: 그렇게 재밌어? 시간 들여서 고생해서 살 만한 가치가 있는 거 같아?\n나: 1년 동안 기다린 보람이 있더라. 이전 시리즈보다 좋고 미션도 어려워졌어. 그런지 더 재밌더라\n[요약문]\n오늘은 오랫동안 기대했던 한 인기 비디오게임 타이틀의 새로운 패키지를 구입하는 날이었어요. 이전에 출시된 버전과는 다른 점과 개선 사항으로 인해 더욱 흥미를 끌었습니다. 저는 몇 달간 노력하여 얻어진 만족감 때문에 매우 행복했습니다. 그리고 바로 플레이 해보았는데, 역시 제 예상만큼 재미있었고 시간을 투자할만한 충분한 이유가 있었습니다. 앞으로 많은 즐거움을 얻을 수 있을 것으로 생각됩니다.'

In [None]:
generate_answer(model, "오늘 세븐틴 콘서트에 다녀왔어!!")

In [None]:
generate_answer(model, "딱히")

In [None]:
generate_answer(model, "별로")

In [None]:
generate_answer(model, "학교에서 방구 뀌었어")

In [None]:
generate_answer(model, "무료한 일상이었어")

In [None]:
preds = []
answers = []
for q, a in tqdm(data):
  preds.append(generate_answer(model, q))
  answers.append(a)

In [None]:
preds = [x.split("답: ")[-1] for x in preds]

In [None]:
Counter(answers)

In [None]:
from collections import Counter
Counter(preds)

In [None]:
ko_mapper = {'참':'O', '참참':1, '거짓':0}
ox_mapper = {'O':1, 'X':0}

In [None]:
preds = [ko_mapper[x] for x in preds]
answers = [ox_mapper[x] for x in answers]

In [None]:
from sklearn.metrics import classification_report


print(classification_report(answers, preds))

In [None]:
# test if it works
test_id_0 = generate_answer(model, data[0])

In [None]:
preds = []

for _, row in tqdm(test.iterrows(), total=len(test)):
  preds.append(generate_answer(model, row['질문']))

In [None]:
# 데아터 후처리
def extract_only_answer(text):
    text_split = text.split('### 답변: ')
    answer_only = text_split[1]

    if '### 질문:' in answer_only:
        answer_only = answer_only.split('### 질문:')[0]
        print(answer_only)

    return answer_only.strip()

def remove_repetitions(text):
    sentences = text.split('. ')
    unique_sentences = []
    for sentence in sentences:
        if sentence not in unique_sentences:
            unique_sentences.append(sentence)
    return '. '.join(unique_sentences)

def cut_to_last_dot(text):
    for i in range(len(text)-1, -1, -1):
        if text[i] == '.':
            break

    return text[:i+1]

In [None]:
processed = [cut_to_last_dot(remove_repetitions(extract_only_answer(x))).strip() for x in preds]

In [None]:
original = preds[:]
preds = processed[:]

In [None]:
import pickle
with open(f'./processed.pkl', 'wb') as f:
    pickle.dump(processed, f)

In [None]:
with open('./preds.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(preds))

In [None]:
with open('./preds.txt', 'r') as f:
    preds = f.read()

In [None]:
preds = preds.split('\n\n')

# submission

In [None]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = emb_model.encode(processed)
pred_embeddings.shape

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

In [None]:
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('./sub_prompt.csv', index=False)