In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass, field
from typing import Optional
import torch
from peft import LoraConfig
from tqdm import tqdm
import pandas as pd
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, AutoTokenizer, pipeline
from trl import SFTTrainer

from dataset_utils import load_data, label_to_num, tokenized_dataset, tokenized_dataset_xlm

tqdm.pandas()

train_data = load_data("../dataset/train/train.csv")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = train_data[:10]
train_data

Unnamed: 0,id,sentence,subject_entity,object_entity,label
0,0,〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey R...,'비틀즈','조지 해리슨',no_relation
1,1,호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으...,'민주평화당','대안신당',no_relation
2,2,K리그2에서 성적 1위를 달리고 있는 광주FC는 지난 26일 한국프로축구연맹으로부터...,'광주FC','한국프로축구연맹',org:member_of
3,3,균일가 생활용품점 (주)아성다이소(대표 박정부)는 코로나19 바이러스로 어려움을 겪...,'아성다이소','박정부',org:top_members/employees
4,4,1967년 프로 야구 드래프트 1순위로 요미우리 자이언츠에게 입단하면서 등번호는 8...,'요미우리 자이언츠','1967',no_relation
5,5,": 유엔, 유럽 의회, 북대서양 조약 기구 (NATO), 국제이주기구, 세계 보건 ...",'북대서양 조약 기구','NATO',org:alternate_names
6,6,그에 따라 나폴리와 계약을 연장한 마라도나는 1989년 팀을 UEFA컵 정상으로 인...,'AC 밀란','1989',no_relation
7,7,"박용오(朴容旿, 1937년 4월 29일(음력 3월 19일)(음력 3월 19일) ~ ...",'박용오','1937년 4월 29일',per:date_of_birth
8,8,중공군에게 온전히 대항할 수 없을 정도로 약해진 국민당은 타이베이로 수도를 옮기는 ...,'중화민국','타이베이',org:place_of_headquarters
9,9,"특히 김동연 전 경제부총리를 비롯한 김두관 국회의원, 안규백 국회의원, 김종민 국회...",'안규백','더불어민주당',per:employee_of


In [3]:
question_template = "### 질문: 다음 문장에서 주어진 두 단어의 관계를 분석하고 분류해줘. "
option_list = "보기: no_relation, org:member_of, org:top_members/employees, org:alternate_names, per:date_of_birth, org:place_of_headquarters, per:employee_of, per:origin, per:title, org:members, per:schools_attended, per:colleagues, per:alternate_names, per:spouse, org:founded_by, org:political/religious_affiliation, per:children, org:founded, org:number_of_employees/members, per:place_of_birth, org:dissolved, per:parents, per:religion, per:date_of_death, per:place_of_residence, per:other_family, org:product, per:siblings, per:product, per:place_of_death."

train_instructions = [
    f'{question_template}\n문장: "{sentence}"\n주체: "{x}"\n대상: "{y}"\n{option_list}\n\n### 보조: {label_to_num(z)}'
    for sentence, x, y, z in zip(train_data['sentence'], train_data['subject_entity'], train_data['object_entity'], train_data['label'].values)
]
ds_train = Dataset.from_dict({"text": train_instructions})
instructions_ds_dict = DatasetDict({"train": ds_train, "validation": ds_train})


model_name = "beomi/llama-2-ko-7b"


@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default=model_name, metadata={"help": "the model name"})
    dataset_text_field: Optional[str] = field(default="text", metadata={"help": "the text field of the dataset"})
    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
    learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
    batch_size: Optional[int] = field(default=1, metadata={"help": "the batch size"})
    seq_length: Optional[int] = field(default=256, metadata={"help": "Input sequence length"})
    gradient_accumulation_steps: Optional[int] = field(
        default=2, metadata={"help": "the number of gradient accumulation steps"}
    )
    load_in_8bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 8 bits precision"})
    load_in_4bit: Optional[bool] = field(default=True, metadata={"help": "load the model in 4 bits precision"})
    use_peft: Optional[bool] = field(default=True, metadata={"help": "Wether to use PEFT or not to train adapters"})
    trust_remote_code: Optional[bool] = field(default=True, metadata={"help": "Enable `trust_remote_code`"})
    output_dir: Optional[str] = field(default="output", metadata={"help": "the output directory"})
    peft_lora_r: Optional[int] = field(default=64, metadata={"help": "the r parameter of the LoRA adapters"})
    peft_lora_alpha: Optional[int] = field(default=16, metadata={"help": "the alpha parameter of the LoRA adapters"})
    logging_steps: Optional[int] = field(default=1, metadata={"help": "the number of logging steps"})
    use_auth_token: Optional[bool] = field(default=False, metadata={"help": "Use HF auth token to access the model"})
    num_train_epochs: Optional[int] = field(default=3, metadata={"help": "the number of training epochs"})
    max_steps: Optional[int] = field(default=-1, metadata={"help": "the number of training steps"})
    save_steps: Optional[int] = field(
        default=100, metadata={"help": "Number of updates steps before two checkpoint saves"}
    )
    save_total_limit: Optional[int] = field(default=10, metadata={"help": "Limits total number of checkpoints."})
    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the model to HF Hub"})
    hub_model_id: Optional[str] = field(default=None, metadata={"help": "The name of the model on HF Hub"})


script_args = ScriptArguments()


if script_args.load_in_8bit and script_args.load_in_4bit:
    raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
elif script_args.load_in_8bit or script_args.load_in_4bit:
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=script_args.load_in_8bit, load_in_4bit=script_args.load_in_4bit
    )
    device_map = {"": 0}
    torch_dtype = torch.float16#torch.bfloat16
else:
    device_map = None
    quantization_config = None
    torch_dtype = None

model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name,
    torch_dtype=torch_dtype,
    cache_dir='/data/ephemeral/home/tmp'
)
model.to(torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)

# tokenized_ds = tokenizer(ds_train['text'], padding=True, truncation=True, max_length=script_args.seq_length)

# dataset = tokenized_ds

# train_df, val_df = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

training_args = TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    learning_rate=script_args.learning_rate,
    logging_steps=script_args.logging_steps,
    num_train_epochs=script_args.num_train_epochs,
    max_steps=script_args.max_steps,
    report_to=script_args.log_with,
    save_steps=script_args.save_steps,
    save_total_limit=script_args.save_total_limit,
    push_to_hub=script_args.push_to_hub,
    hub_model_id=script_args.hub_model_id,
    disable_tqdm=False,
)

# if script_args.use_peft:
#     peft_config = LoraConfig(
#         r=script_args.peft_lora_r,
#         lora_alpha=script_args.peft_lora_alpha,
#         bias="none",
#         task_type="CAUSAL_LM",
#     )
# else:
peft_config = None

trainer = SFTTrainer(
    model=model,
    args=training_args,
    max_seq_length=script_args.seq_length,
    train_dataset=instructions_ds_dict['train'],
    eval_dataset=instructions_ds_dict['validation'],
    dataset_text_field=script_args.dataset_text_field,
    peft_config=peft_config,
)


#trainer.train()

#trainer.save_model(training_args.output_dir)

KeyError: 'n'

In [None]:
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,#bfloat16,
    device=0
)

In [None]:

query = instructions_ds_dict['validation']['text'][1].split('### Assistant: ')[0] + '### Assistant:'
queries = [instructions_ds_dict['validation']['text'][i].split('### Assistant: ')[0] + '### Assistant:' for i in range(len(instructions_ds_dict['validation']))]


In [None]:
tokenizer.eos_token_id

2

In [None]:
sequences = pipeline(
    queries,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=3,
    early_stopping=True,
    # do_sample=True,
)



In [None]:
results = []

for seq in sequences:
  print(seq)
  result = seq[0]['generated_text'].split('### Assistant:')[1]
  print(result)
  results.append(result)

[{'generated_text': "### Human: 다음 문장에서 두 단어를 관계를 분류해줘. \nsentence: 〈Something〉는 조지 해리슨이 쓰고 비틀즈가 1969년 앨범 《Abbey Road》에 담은 노래다.\n '비틀즈': '조지 해리슨'\n\n보기:no_relation, org:member_of, org:top_members/employees, org:alternate_names, per:date_of_birth, org:place_of_headquarters, per:employee_of, per:origin, per:title, org:members, per:schools_attended, per:colleagues, per:alternate_names, per:spouse, org:founded_by, org:political/religious_affiliation, per:children, org:founded, org:number_of_employees/members, per:place_of_birth, org:dissolved, per:parents, per:religion, per:date_of_death, per:place_of_residence, per:other_family, org:product, per:siblings, per:product, per:place_of_death\n\n### Assistant: ��"}]
 ��
[{'generated_text': "### Human: 다음 문장에서 두 단어를 관계를 분류해줘. \nsentence: 호남이 기반인 바른미래당·대안신당·민주평화당이 우여곡절 끝에 합당해 민생당(가칭)으로 재탄생한다.\n '민주평화당': '대안신당'\n\n보기:no_relation, org:member_of, org:top_members/employees, org:alternate_names, per:date_of_birth, org:place_of_headquarters, per:employ

In [None]:
labels = []

for label in instructions_ds_dict['validation']['text']:
  result = label.split('### Assistant:')[1]
  print(result)
  labels.append(result)

print("Accuracy: ", (len([1 for x, y in zip(results, labels) if y in x]) / len(labels)))

 0
 0
 20
 1
 0
 5
 0
 25
 7
 6
Accuracy:  0.0
