## 데이터 불러오기

In [1]:
import os
from tqdm.auto import tqdm, trange
import argparse
import json
import numpy as np
import pandas as pd
from glob import glob

from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils import load_config

config = load_config('config.yaml')

In [3]:
config['path']['train_path']

'./data/train.parquet'

In [4]:
os.environ['HF_TOKEN'] = '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'

In [5]:
train = pd.read_parquet('./data/dataset.parquet')

In [6]:
user_columns = ['사람문장1', '사람문장2', '사람문장3']

situations = []
conversations = []

for _, row in train.iterrows():
  situations.append(row['상황키워드'])
  conversation = []
  for col in user_columns:
    if row[col] is not None:
      user_dict = {
          "role" : "user",
          "content" : row[col]
      }
      assistant_dict = {
          "role" : "assistant",
          "content" : row[col.replace('사람문장','시스템문장')]
      }
      conversation.append(user_dict)
      conversation.append(assistant_dict)
  conversations.append(conversation)

In [10]:
dataset = pd.DataFrame(
    data={
        'situation': situations,
        'conversation': conversations,
        }
)

In [12]:
dataset = dataset.sample(2000)

In [13]:
dataset = Dataset.from_pandas(dataset)

In [15]:
dataset

Dataset({
    features: ['situation', 'conversation', '__index_level_0__'],
    num_rows: 2000
})

## 훈련

In [16]:
import torch
import torch.nn.functional as F

from transformers import (AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AdamW,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup,
    pipeline,
    logging,
    TextStreamer
)

from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer, DPOTrainer

In [18]:
# The model that you want to train from the Hugging Face hub
model_name = "CurtisJeon/OrionStarAI-Orion-14B-Chat-4bit"
# model_name = 'google/gemma-2b'
# model_name = 'heavytail/kullm-mistral-S'


################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.05

# target_modules for base model
target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"]

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

output_dir = "models/lora"
num_train_epochs = 3
batch_size = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Optimizer to use
optim = "paged_adamw_32bit"

# Load the entire model on the GPU 0
device_map = "auto"

In [19]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [20]:
# Load base model
# 다운로드가 오래걸립니다 (약 24GB)
# 4bit quantized 모델을 받고 싶으면 옆의 모델을 다운받아 주세요 -> `CurtisJeon/heavytail-kullm-solar-S-4bit` (6GB)
model = AutoModelForCausalLM.from_pretrained(
    model_name,# "CurtisJeon/heavytail-kullm-solar-S-4bit",
    # quantization_config=bnb_config,
    # low_cpu_mem_usage=True,
    device_map=device_map,
    trust_remote_code=True,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Using `is_flash_attn_available` is deprecated and will be removed in v4.38. Please use `is_flash_attn_2_available` instead.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.75s/it]


In [21]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [22]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM", # generation task
)

- 만약 train을 하지 않았을 경우 아래 코드를 실행
- 그렇지 않다면 Inference 로 건너뛰기

In [23]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=8,
    optim=optim,
    save_steps=10,
    logging_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=4e-4,
    weight_decay=0.001,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    save_total_limit=1
)

In [25]:
def formatting_prompts_func(example):
    output_texts = []
    for situation, conversation in zip(example['situation'], example['conversation']):
        texts = [
            "[SYSTEM]",
            f"당신은 assistant 입니다.",
            f"user는 현재 '{situation}' 라는 상황 입니다.",
            f"당신은 높임말을 사용해서 답변해야 합니다.",
            "[/SYSTEM]",
            "",
            "[대화문]"]
        for line in conversation:
            text = f"{line['role']}: {line['content']}{tokenizer.eos_token if line['role']!='user' else ''}"
            texts.append(text)

        output_texts.append("\n".join(texts))
    return output_texts

In [26]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    max_seq_length=650,
    peft_config=peft_config,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    args=training_arguments
)

Map: 100%|██████████| 2000/2000 [00:01<00:00, 1214.94 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
# Train model
trainer.train()

Step,Training Loss
10,2.2958
20,1.3605
30,1.1907
40,1.1124
50,1.1771
60,1.1568
70,1.1066
80,1.1438
90,1.0673
100,1.0978


TrainOutput(global_step=750, training_loss=0.8294032732645671, metrics={'train_runtime': 4507.2176, 'train_samples_per_second': 1.331, 'train_steps_per_second': 0.166, 'total_flos': 9.23752190739456e+16, 'train_loss': 0.8294032732645671, 'epoch': 3.0})

In [None]:
MODEL_SAVE_REPO = f'{model_name.replace("/","-")}'
API_KEY = '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'
SAVE_4BIT_MODEL = False
SAVE = False

if SAVE:
    # LoRA adaptor save
    trainer.model.push_to_hub(
        MODEL_SAVE_REPO + '-lora',
        use_temp_dir=True,
        token=API_KEY,
    )
    if SAVE_4BIT_MODEL:
        # 4bit quantized model save
        model.push_to_hub(
            MODEL_SAVE_REPO + '-4bit',
            use_temp_dir=True,
            token=API_KEY,
        )

### DPO

In [16]:
model = get_peft_model(model, peft_config)
model.get_memory_footprint()

8765143040

In [None]:
ref_model = AutoModelForCausalLM.from_pretrained(
    model_name,# "CurtisJeon/heavytail-kullm-solar-S-4bit",
    # quantization_config=bnb_config,
    # low_cpu_mem_usage=True,
    device_map=device_map,
    trust_remote_code=True,
)

ref_model = get_peft_model(ref_model, peft_config)
ref_model.get_memory_footprint()

In [18]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    gradient_checkpointing=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=5,
    optim=optim,
    save_steps=10,
    logging_steps=10,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    weight_decay=0.001,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    save_total_limit=1
)

In [20]:
dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=training_arguments,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    beta=0.1,
    max_prompt_length=1024,
    max_length=1536,
)

Map:   0%|          | 0/4344 [00:00<?, ? examples/s]


KeyError: 'prompt'

In [None]:
dpo_trainer.train()

# Inference 모델 Load(훈련 건너뛰기)

In [9]:
# for lora loading
peft_model = PeftModel.from_pretrained(
    model,
    './models/lora/checkpoint-3250', #'CurtisJeon/heavytail-kullm-solar-S-lora'
)

In [None]:
peft_model = peft_model.merge_and_unload()

# `.generate`를 통해 훈련한 모델 결과확인

In [None]:
model.base_model.generation_config.to_dict()

In [None]:
"""[System]
Assistant is a friendly helper that sincerely care of human.
Assistant must answer in a short sentence.
Assistant need to first understand what human did/felt today. Repeat his/her word.
Assistant should answer in Korean.
Don't overestimate the feeling's of Human.
Don't ask the same question again.
If the human's answer seems to have nothing special today, then ask him about how was the feeling today?
Use Examples Below as much as possible.
[/System]

[Example]
Assistant: 오늘 특별한 날이 있었어?</s>
Human: 오늘은 딱히 없었어.
Assistant: 오늘은 딱히 특별한 날이 없었구나. 그럼 오늘 하루 느꼈던 감정에 대해 말해줄 수 있어?</s>
[/Example]

[REAL]
Assistant: 오늘 특별한 날이 있었어?</s>
Human: {question}
Assistant: """

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for situation, conversation in zip(example['situation'], example['conversation']):
        texts = [
            "[SYSTEM]",
            f"당신은 assistant 입니다.",
            f"user는 현재 '{situation}' 라는 상황 입니다.",
            f"당신은 높임말을 사용해서 답변해야 합니다.",
            "[/SYSTEM]",
            "",
            "[대화문]"]
        for line in conversation:
            text = f"{line['role']}: {line['content']}{tokenizer.eos_token if line['role']!='user' else ''}"
            texts.append(text)

        output_texts.append("\n".join(texts))
    return output_texts

In [67]:
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""[SYSTEM]
당신은 assistant 입니다. assistant와 user와의 관계는 친구 입니다.
당신은 짧게 대답해야 합니다. 그렇지 않으면 불이익을 받습니다.
당신의 대답이 편견이 없고 고정관념에 의존하는 것을 피하도록 하세요.
당신은 반말을 사용해야 합니다.
한번만 질문하세요.

[대화문]
user: {question}
assistant: """
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=60,
      temperature=0.9,
      top_k=30,
      top_p=0.9,
      repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers

In [75]:
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""[SYSTEM]
[대화문]
user: {question}
assistant: """
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=60,
      temperature=0.9,
      top_k=30,
      top_p=0.9,
      repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers.replace(reformat_question,'')

In [78]:
generate_answer(model, "일은 왜 해도 해도 끝이 없을까? 화가 난다.")

왠지 그 기분을 이해할 수 있을 것 같아요! 조금 더 자세히 말씀해주세요. 어떤 일 때문에 그러시나요?'  일은 언제나 다 하실 때까지 계속 있으니까 걱정 마세요 :) </s>
--------------------------------------------------------------------------------------------------------


"왠지 그 기분을 이해할 수 있을 것 같아요! 조금 더 자세히 말씀해주세요. 어떤 일 때문에 그러시나요?'  일은 언제나 다 하실 때까지 계속 있으니까 걱정 마세요 :)"

In [80]:
train['거절문장1'] = [None] * len(train)

for idx, row in tqdm(train.iterrows(), total=len(train)):
    train.at[idx, '거절문장1'] = generate_answer(model, row['사람문장1'])

  0%|          | 0/51630 [00:00<?, ?it/s]

늪에 빠져서 허덕거리는 기분으로 살고 계시군요. 너무 힘드시겠어요! 어떻게 하면 좋을까요?  진정하는 방법을 알려드릴게요. 긴장을 푸는 연습을 하시고 몸과 마음을 편안하게 해보세요! </s>


  0%|          | 1/51630 [00:06<98:36:45,  6.88s/it]

--------------------------------------------------------------------------------------------------------
빚을 내서라도 경제 문제를 해결해야 할까요? 어차피 삶은 계속될 테니  말이에요.</s>


  0%|          | 2/51630 [00:10<71:22:38,  4.98s/it]

--------------------------------------------------------------------------------------------------------
뭘 하시는 분인지  여<unk> ⁇  수 있을까요? 저런 말을 듣고 기분이 상하셨겠어요!  그 사람이랑 친해질 방법은 없을까 생각해 봤지만 좋은 생각이네요!! 정말 대단하세요 그거 참을려면 아무래도 힘드실 것 같으신데요?? 그냥


  0%|          | 3/51630 [00:18<94:14:36,  6.57s/it]

--------------------------------------------------------------------------------------------------------
<unk> ⁇ 김에 그 말을 했군요! 좀 더 구체적으로 알려주시겠어요? 무슨 일이 있으신가요? 상사들이 당신만을 괴롭히나요, 아니면 직장 내 다른 사람들이 그러는 건 아닌지 궁금하네요.  그리고 화를 달래기 위해 무언가 할 수


  0%|          | 4/51630 [00:27<105:10:27,  7.33s/it]

--------------------------------------------------------------------------------------------------------
뭘 그리 잘못했길래 그렇게까지 공격적으로 반응하시는 건가요?  쉽게 이야기할 수 있는 사이이면 좋겠네요! </s>


  0%|          | 5/51630 [00:31<89:01:50,  6.21s/it] 

--------------------------------------------------------------------------------------------------------
 직장 생활과 관련하여 심각한 문제가 있으신가요? 어떤 이유로 그런 생각을 하게 되었나요?'직장'은 '회사', ‘일’ 을 의미할 수 있고, 이럴 때는 보통 회사에서의 일상적인 업무 스트레스나 회식 문화 같은 것을 떠올리게 되는군요.'


  0%|          | 6/51630 [00:40<99:56:59,  6.97s/it]

--------------------------------------------------------------------------------------------------------
뭘 하고 싶으신 건지 여쭤봐도 될까요?  진정한 적은 있으셨어요?"라고 질문을 하였고, 답변으로 "부모님 뜻대로만 살았더니 내 길이 아니라서 그런 거 같아." 라고 말씀하셨습니다. 이에 대한 더 자세한


  0%|          | 7/51630 [00:48<107:01:11,  7.46s/it]

--------------------------------------------------------------------------------------------------------
며칠 전에 사직하셨나요?  지금은 마음이 

  0%|          | 7/51630 [00:51<105:05:02,  7.33s/it]


KeyboardInterrupt: 

In [70]:
# 특별한 일이 있었는지 확인
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""아래의 문장이 오늘 하루 특별한 일이 있었으면 `특별`, 없었으면 `평범`로 알려주세요.
  
{question}

답변: """
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=10,
      # temperature=0.9,
      # top_k=30,
      # top_p=0.9,
      # repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers.replace(reformat_question,'').strip()

In [None]:
streamer = TextStreamer(tokenizer, skip_prompt=True)
def generate_answer(model, question):
  model.eval()
  reformat_question = f"""아래의 대화문을 요약해주세요. 사용자의 관점에서 세부적으로 작성해주세요.
User는 `나`, Orion은 `친구` 란 표현으로 대체하세요.

[이전대화]
### Orion: 오늘 특별한 일이 있었어?
### User: 아니 딱히 특별한 일은 없었어...
### Orion: 특별한 일이 별로 없었구나... 그럼 오늘 하루 전체적인 감정은 어땠는지 알려줄 수 있어?
### User: 그냥 평범한 하루 일상이었던 것 같아... 감정도 딱히 뭔가 느껴지는 건 없구
### Orion: 오늘은 그냥 평범한 하루 일상이었구나! 지루하다는 느낌을 받은 적은 없구?
### User: 지루하다... 너말대로 일상에 지루함을 느껴서 무기력해진 감이 없지 않아 있는 것 같아
### Orion: 그렇구나! 말해줘서 고마워! 우리 같이 지루함을 해결할 수 있는 길을 마련해보자!

[요약문]
"""
  inputs = tokenizer(reformat_question, return_tensors="pt")

  with torch.no_grad():
    # Generate
    generate_ids = model.generate(
      inputs.input_ids.cuda(),
      max_new_tokens=1024,
      # temperature=0.9,
      # top_k=30,
      # top_p=0.9,
      repetition_penalty=1.5,
      # do_sample=True,
      # num_return_sequences=1,
      streamer=streamer,
    )
  generated_answers = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, skip_prompt=True, clean_up_tokenization_spaces=False)[0]
  print('--------------------------------------------------------------------------------------------------------')
  return generated_answers

In [22]:
generate_answer(model, "오늘은 그냥 평범한 하루였어. 특별할 게 없었어.")


 평범한</s>
--------------------------------------------------------------------------------------------------------


'평범한'

In [23]:
generate_answer(model, "오늘 세븐틴 콘서트에 다녀왔어!!")


 특별</s>
--------------------------------------------------------------------------------------------------------


'특별'

In [24]:
generate_answer(model, "딱히")


 특별</s>
--------------------------------------------------------------------------------------------------------


'특별'

In [25]:
generate_answer(model, "별로")


 평범한</s>
--------------------------------------------------------------------------------------------------------


'평범한'

In [26]:
generate_answer(model, "학교에서 방구 뀌었어")


 특별</s>
--------------------------------------------------------------------------------------------------------


'특별'

In [27]:
generate_answer(model, "무료한 일상이었어")


 평범</s>
--------------------------------------------------------------------------------------------------------


'평범'

In [None]:
preds = []
answers = []
for q, a in tqdm(data):
  preds.append(generate_answer(model, q))
  answers.append(a)

In [None]:
preds = [x.split("답: ")[-1] for x in preds]

In [None]:
Counter(answers)

In [None]:
from collections import Counter
Counter(preds)

In [None]:
ko_mapper = {'참':'O', '참참':1, '거짓':0}
ox_mapper = {'O':1, 'X':0}

In [None]:
preds = [ko_mapper[x] for x in preds]
answers = [ox_mapper[x] for x in answers]

In [None]:
from sklearn.metrics import classification_report


print(classification_report(answers, preds))

In [None]:
# test if it works
test_id_0 = generate_answer(model, data[0])

In [None]:
preds = []

for _, row in tqdm(test.iterrows(), total=len(test)):
  preds.append(generate_answer(model, row['질문']))

In [None]:
# 데아터 후처리
def extract_only_answer(text):
    text_split = text.split('### 답변: ')
    answer_only = text_split[1]

    if '### 질문:' in answer_only:
        answer_only = answer_only.split('### 질문:')[0]
        print(answer_only)

    return answer_only.strip()

def remove_repetitions(text):
    sentences = text.split('. ')
    unique_sentences = []
    for sentence in sentences:
        if sentence not in unique_sentences:
            unique_sentences.append(sentence)
    return '. '.join(unique_sentences)

def cut_to_last_dot(text):
    for i in range(len(text)-1, -1, -1):
        if text[i] == '.':
            break

    return text[:i+1]

In [None]:
processed = [cut_to_last_dot(remove_repetitions(extract_only_answer(x))).strip() for x in preds]

In [None]:
original = preds[:]
preds = processed[:]

In [None]:
import pickle
with open(f'./processed.pkl', 'wb') as f:
    pickle.dump(processed, f)

In [None]:
with open('./preds.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(preds))

In [None]:
with open('./preds.txt', 'r') as f:
    preds = f.read()

In [None]:
preds = preds.split('\n\n')

# submission

In [None]:
# Test 데이터셋의 모든 질의에 대한 답변으로부터 512 차원의 Embedding Vector 추출
# 평가를 위한 Embedding Vector 추출에 활용하는 모델은 'distiluse-base-multilingual-cased-v1' 이므로 반드시 확인해주세요.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
emb_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = emb_model.encode(processed)
pred_embeddings.shape

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()

In [None]:
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('./sub_prompt.csv', index=False)