In [1]:
import wandb
wandb.init(mode="disabled")


import json
import random
import torch
from torch.utils.data import Dataset

import os
import copy
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BitsAndBytesConfig,
    LlamaPreTrainedModel,
    LlamaModel,
    AutoTokenizer,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

In [2]:
class Llama3ForSFT(LlamaPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]
    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids= None,
        attention_mask= None,
        position_ids = None,
        past_key_values= None,
        inputs_embeds= None,
        labels= None,
        use_cache= None,
        output_attentions= None,
        output_hidden_states = None,
        return_dict= None,
        cache_position = None,
    ):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)

            label_tokens_ids = torch.tensor(LABEL_IDS,device=shift_labels.device)
            index_mapping = {value.item(): idx for idx, value in enumerate(label_tokens_ids)}
            true_labels = shift_labels[torch.isin(shift_labels, label_tokens_ids)]
            true_labels = torch.tensor([index_mapping[label.item()] for label in true_labels], device=true_labels.device)
            true_logits = shift_logits[torch.isin(shift_labels, label_tokens_ids)][:,label_tokens_ids]
            loss = loss_fct(true_logits, true_labels)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=true_logits,
        )

In [3]:
import argparse
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
import numpy as np


import wandb

def str_to_bool(value):
    if value.lower() in ('true', '1'):
        return True
    elif value.lower() in ('false', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value}")

from sklearn.metrics import f1_score


def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(axis=-1)
    label_tokens_ids = np.array(LABEL_IDS)
    index_mapping = {value.item(): idx for idx, value in enumerate(label_tokens_ids)}
    labels = labels[np.isin(labels, label_tokens_ids)]
    labels = np.array([index_mapping[label.item()] for label in labels])
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

In [4]:
class TrainDataset(Dataset):
    def __init__(self, fname, tokenizer, k=10,fold_idx=0, mask_prob=0.):
        self.IGNORE_INDEX = -100
        self.data = []
        self.tokenizer = tokenizer
        self.mask_prob = mask_prob

        self.PROMPT = '''You are an AI assistant that helps users analyze conversations and solve related problems. Please read the conversation carefully and select the most appropriate answer to the question based on the given options.'''
        self.answer_dict = {
            "inference_1": 0,
            "inference_2": 1,
            "inference_3": 2
        }

        
        with open(fname, "r", encoding='utf-8') as f:
            self.data = json.load(f)

        fold_size = len(self.data) // k
        start = fold_size*fold_idx
        end = start + fold_size
        self.data = self.data[:start] + self.data[end:]
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        inp = example["input"]
        chat = ["[Conversation]"]

        for cvt in inp['conversation']:
            speaker = cvt['speaker']
            utterance = cvt['utterance']
            if random.random() < self.mask_prob:
                utterance = "[MASK]"
            chat.append(f"화자{speaker}: {utterance}")
        chat = "\n".join(chat)

        question = f"[Question]\n위 대화의 {inp['category']}"
        if (ord(inp['category'][-1]) - ord("가")) % 28 > 0:
            question += "으로"
        else:
            question = "로"
        question += " 올바른 지문은?"
                
        chat += "\n\n" + question + "\n\n[Option]\n"

        inferences = [
            inp['inference_1'],
            inp['inference_2'],
            inp['inference_3']
        ]
        label = self.answer_dict[example["output"]]

        order = list(range(len(inferences)))
        random.shuffle(order)
        
        shuffled_inferences = [inferences[i] for i in order]
        new_label = order.index(label)
        
        chat += f"A. {shuffled_inferences[0]}\n"
        chat += f"B. {shuffled_inferences[1]}\n"
        chat += f"C. {shuffled_inferences[2]}"

        message = [
            {"role": "system", "content": self.PROMPT},
            {"role": "user", "content": chat},
        ]

        source = self.tokenizer.apply_chat_template(
            message,
            add_generation_prompt=True,
            return_tensors="pt",
        )

        target = f"{['A', 'B', 'C'][new_label]}. {shuffled_inferences[new_label]}{self.tokenizer.eos_token}"

        target = self.tokenizer(target,
                                return_attention_mask=False,
                                add_special_tokens=False,
                                return_tensors="pt")
        target["input_ids"] = target["input_ids"].type(torch.int64)

        input_ids = torch.concat((source[0], target["input_ids"][0]))
        labels = torch.concat((torch.LongTensor([self.IGNORE_INDEX] * source[0].shape[0]), target["input_ids"][0]))
        
        return {
        'input_ids': input_ids,
        "labels": labels,
        }

In [5]:


class DevDataset(Dataset):
    def __init__(self, fname, tokenizer,k=10, fold_idx=0):
        IGNORE_INDEX=-100
        self.inp = []
        self.trg = []
        self.label = []

        PROMPT = '''You are an AI assistant that helps users analyze conversations and solve related problems. Please read the conversation carefully and select the most appropriate answer to the question based on the given options.'''
        answer_dict = {
            "": None,
            "inference_1": 0,
            "inference_2": 1,
            "inference_3": 2
        }

        with open(fname, "r", encoding='utf-8') as f:
            data = json.load(f)


        fold_size = len(data) // k
        start = fold_size*fold_idx
        end = start + fold_size
        data = data[start:end]

        
        def make_chat(inp):
            chat = ["[Conversation]"]
            for cvt in inp['conversation']:
                speaker = cvt['speaker']
                utterance = cvt['utterance']
                chat.append(f"화자{speaker}: {utterance}")
            chat = "\n".join(chat)

            question = f"[Question]\n위 대화의 {inp['category']}"
            if (ord(inp['category'][-1]) - ord("가")) % 28 > 0:
                question += "으로"
            else:
                question = "로"
            question += " 올바른 지문은?"
                
            chat = chat + "\n\n" + question + "\n\n[Option]\n"
            chat += f"A. {inp['inference_1']}\n"
            chat += f"B. {inp['inference_2']}\n"
            chat += f"C. {inp['inference_3']}"

            return chat
        
        for example in data:
            chat = make_chat(example["input"])
            message = [
                {"role": "system", "content": PROMPT},
                {"role": "user", "content": chat},
            ]
     
            source = tokenizer.apply_chat_template(
                message,
                add_generation_prompt=True,
                return_tensors="pt",
            )

            target = ""
            if example["output"] == "inference_1":
                target = f"A. {example['input']['inference_1']}{tokenizer.eos_token}"
            elif example["output"] == "inference_2":
                target = f"B. {example['input']['inference_2']}{tokenizer.eos_token}"
            elif example["output"] == "inference_3":
                target = f"C. {example['input']['inference_3']}{tokenizer.eos_token}"
                
            target = tokenizer(target,
                      return_attention_mask=False,
                      add_special_tokens=False,
                      return_tensors="pt")
            target["input_ids"] = target["input_ids"].type(torch.int64)

            input_ids = torch.concat((source[0], target["input_ids"][0]))
            labels = torch.concat((torch.LongTensor([IGNORE_INDEX] * source[0].shape[0]), target["input_ids"][0]))
            self.inp.append(input_ids)
            self.label.append(labels)
            #self.trg.append(answer_dict[example["output"]])

    def __len__(self):
        return len(self.inp)

    def __getitem__(self, idx):
        return self.inp[idx], self.label[idx]


class DataCollatorForSupervisedDataset(object):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, instances):
        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

In [6]:

import argparse

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig,TaskType
from trl import SFTTrainer, SFTConfig
from sklearn.metrics import accuracy_score

# fmt: off
parser = argparse.ArgumentParser(prog="train", description="Training about Conversational Context Inference.")

g = parser.add_argument_group("Common Parameter")
g.add_argument("--model_id", default='kihoonlee/STOCK_SOLAR-10.7B',type=str,  help="model file path")
g.add_argument("--tokenizer", default='kihoonlee/STOCK_SOLAR-10.7B',type=str, help="huggingface tokenizer path")
g.add_argument("--save_dir", type=str, default="fold6", help="model save path")
g.add_argument("--batch_size", type=int, default=1, help="batch size (both train and eval)")
g.add_argument("--gradient_accumulation_steps", type=int, default=4, help="gradient accumulation steps")
g.add_argument("--warmup_steps", default=204,type=int, help="scheduler warmup steps")
g.add_argument("--lr", type=float, default=5e-5, help="learning rate")
g.add_argument("--epoch", type=int, default=10, help="training epoch")


g.add_argument("--fold_k", type=int, default=10, help="k-fold")
g.add_argument("--fold_num", type=int, default=6, help="fold_idx")
# fmt: on


def main(args):

    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        # bnb_4bit_use_double_quant=True,
        # bnb_4bit_quant_type="nf4",
        # bnb_4bit_compute_dtype=torch.bfloat16
    )
    
    model = Llama3ForSFT.from_pretrained(
        args.model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        #quantization_config=bnb_config,
    )
    
    lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    #only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj"],
    #target_modules=["o_proj", "q_proj","k_proj","v_proj","gate_proj","up_proj","down_proj"],
    lora_dropout=0.0,
    bias='none',
    task_type=TaskType.CAUSAL_LM,
    )
    
    model = get_peft_model(model, lora_config)
    print(model.print_trainable_parameters())
    
    
    if args.tokenizer == None:
        args.tokenizer = args.model_id
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    #tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'
    tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n'  + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}"


    global LABEL_IDS, PAD
    LABEL_IDS= [tokenizer(i, add_special_tokens=False)['input_ids'][0] for i in ['A','B','C']]
    
    train_dataset = TrainDataset("merge.json", tokenizer,args.fold_k, args.fold_num)
    valid_dataset = DevDataset("merge.json", tokenizer,args.fold_k, args.fold_num)

    valid_dataset = Dataset.from_dict({
        'input_ids': valid_dataset.inp,
        "labels": valid_dataset.label,
        })
    
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

    training_args = SFTConfig(
        output_dir=args.save_dir,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        eval_strategy="steps",
        eval_steps=args.warmup_steps,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.lr,
        weight_decay=0.1,
        num_train_epochs=args.epoch,
        max_steps=-1,
        lr_scheduler_type="cosine",
        warmup_steps=args.warmup_steps,
        log_level="info",
        logging_steps=10,
        save_strategy="epoch",
        bf16=True,
        gradient_checkpointing=False,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        max_seq_length=2048,
        packing=True,
        seed=42,
        report_to="none",
        #optim_args='grokadamw'
        
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    
    

# Step	Training Loss	Validation Loss
# 189	0.026200	0.016993
# 378	0.000600	0.026899
# 567	0.000200	0.017611
# 756	0.000200	0.018808
# 945	0.000100	0.015489
# 1134	0.000100	0.015717
# 1323	0.000100	0.019951
# 1512	0.000000	0.018297
# 1701	0.000000	0.016658
# 1890	0.000100	0.016785

In [7]:
if __name__ == "__main__":
    exit(main(parser.parse_args([])))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 14,155,776 || all params: 10,745,679,872 || trainable%: 0.1317
None


You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
***** Running training *****
  Num examples = 819
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 2,040
  Number of trainable parameters = 14,155,776
  [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
  labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss,Accuracy
204,0.21,0.289756,0.911111
408,0.1547,0.264662,0.922222
612,0.2042,0.386464,0.933333
816,0.1672,0.230877,0.944444
1020,0.0,0.40657,0.944444
1224,0.0,0.297781,0.944444
1428,0.0,0.236264,0.966667
1632,0.0,0.265313,0.944444
1836,0.0,0.292797,0.955556
2040,0.0,0.320015,0.944444



***** Running Evaluation *****
  Num examples = 90
  Batch size = 1
  [torch.tensor(ids) for ids in input_ids], batch_first=True, padding_value=self.tokenizer.pad_token_id
  labels = torch.nn.utils.rnn.pad_sequence([torch.tensor(lbls) for lbls in labels], batch_first=True, padding_value=-100)
Saving model checkpoint to fold6\checkpoint-204
loading configuration file config.json from cache at C:\Users\Gachon\.cache\huggingface\hub\models--kihoonlee--STOCK_SOLAR-10.7B\snapshots\3e60d55d5e1c63191de31d629380488b9bb5f5b4\config.json
Model config LlamaConfig {
  "_name_or_path": "kihoonlee/STOCK_SOLAR-10.7B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 4096,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 48,

In [8]:
# 189	0.000000	0.901135	0.834437
# 378	0.004200	0.556197	0.880795
# 567	0.000000	0.480005	0.887417
# 756	0.000100	0.515926	0.894040
# 945	0.001500	0.549613	0.913907
# 1134	0.000000	0.530577	0.920530
# 1323	0.000000	0.536749	0.913907
# 1512	0.000000	0.514958	0.920530
# 1701	0.000000	0.511815	0.927152
# 1890	0.000000	0.515028	0.920530