In [40]:
import copy
from dataclasses import dataclass, field
import json
import pathlib
from typing import Dict, Optional, Sequence

import numpy as np
import torch
from torch.utils.data import Dataset
import transformers
from transformers.trainer_pt_utils import LabelSmoother #código para evitar overconfidence no modelo
from fastchat.conversation import SeparatorStyle
from fastchat.model.model_adapter import get_conversation_template
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
import json
from peft import PeftModel

from transformers import AutoTokenizer, AutoModelForCausalLM

In [46]:
# Identifier of the base checkpoint and the directory holding the adapter
# that is later handed to PeftModel.from_pretrained.
model_name = 'helloollel/vicuna-7b'
model_adapted = './output/'

# Load the causal-LM weights and switch off the `use_cache` flag on its config.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.use_cache = False

# The tokenizer comes from the same checkpoint as the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.10s/it]


In [47]:
# Wrap the base model with the adapter stored under `model_adapted`.
# NOTE(review): the repr of `model` echoed by the following cell already lists
# lora_A / lora_B sub-modules, so this call appears to modify `model` in place;
# `model` should not be treated as the untouched base model afterwards.
model_test = PeftModel.from_pretrained(model,model_adapted)

In [48]:
# Put the base model into evaluation mode; the returned module is what the
# cell echoes as its output.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(
            in_features=4096, out_features=4096, bias=False
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.0

In [49]:
# Put the adapter-wrapped model into evaluation mode; the returned module is
# what the cell echoes as its output.
model_test.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
              (v_proj): 

In [12]:
# Rank of the current process; stays None unless something assigns it.
local_rank = None


def rank0_print(*args):
    """Forward ``args`` to ``print`` only when the global ``local_rank`` equals 0.

    With the initial value ``local_rank = None`` nothing is printed.
    """
    if local_rank != 0:
        return
    print(*args)

In [13]:
# Parse the sample conversations. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it to the garbage collector.
with open("dummy.json") as source_file:
    source = json.load(source_file)

In [14]:
# Relabel the speakers of every conversation: turns tagged 'human' become
# "client", every other turn becomes "agent". The text is carried over as is.
new_source = [
    [
        {
            "from": "client" if turn['from'] == 'human' else "agent",
            "value": turn['value'],
        }
        for turn in record['conversations']
    ]
    for record in source
]
    

In [17]:
def preprocess(sources,
               tokenizer: transformers.PreTrainedTokenizer,
               max_length: int = 256,
               ) -> Dict:
    """Render conversations with the Vicuna template, tokenize them and build masked labels.

    Args:
        sources: Sequence of conversations. Each conversation is a sequence of
            dicts with a "from" key (speaker label) and a "value" key (text).
            The speaker labels "client"/"agent" and "human"/"gpt" are accepted.
        tokenizer: Tokenizer used for encoding. Its ``pad_token_id`` is used to
            count the real tokens of each row and to build the attention mask.
        max_length: Length every sequence is padded or truncated to. The same
            value decides whether a length mismatch is treated as an error.

    Returns:
        Dict with ``input_ids``, ``labels`` (a copy of ``input_ids`` in which
        masked positions hold ``IGNORE_TOKEN_ID``) and ``attention_mask``
        (``True`` wherever ``input_ids`` differs from the pad token id).

    Raises:
        KeyError: If a turn carries an unknown speaker label.
        AssertionError: If the turns of a conversation do not alternate between
            the two template roles, or the template does not use the
            ``ADD_COLON_TWO`` separator style.
    """
    conv = get_conversation_template("vicuna")
    # Both labelling schemes map onto the two roles of the template, so raw
    # files tagged "human"/"gpt" work as well as relabelled ones.
    roles = {
        "client": conv.roles[0],
        "agent": conv.roles[1],
        "human": conv.roles[0],
        "gpt": conv.roles[1],
    }

    # Apply prompt templates
    conversations = []
    for i, source in enumerate(sources):
        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first turn if it does not come from the first role
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            # Turns must strictly alternate; the message is the conversation index.
            assert role == conv.roles[j % 2], f"{i}"
            conv.append_message(role, sentence["value"])
        conversations.append(conv.get_prompt())

    # Tokenize conversations
    input_ids = tokenizer(
        conversations,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    ).input_ids
    targets = input_ids.clone()

    assert conv.sep_style == SeparatorStyle.ADD_COLON_TWO

    # Mask targets
    sep = conv.sep + conv.roles[1] + ": "
    for conversation, target in zip(conversations, targets):
        # Number of non-padding tokens in this row.
        total_len = int(target.ne(tokenizer.pad_token_id).sum())

        rounds = conversation.split(conv.sep2)
        # NOTE(review): starting at 1 presumably skips a leading BOS token - confirm.
        cur_len = 1
        target[:cur_len] = IGNORE_TOKEN_ID

        for i, rou in enumerate(rounds):
            if rou == "":
                break

            # A complete round splits into exactly [instruction, answer].
            parts = rou.split(sep)
            if len(parts) != 2:
                break

            parts[0] += sep
            round_len = len(tokenizer(rou).input_ids)
            # NOTE(review): the -2 offset looks tokenizer specific (hard-coded
            # correction of the instruction length) - confirm when changing tokenizer.
            instruction_len = len(tokenizer(parts[0]).input_ids) - 2

            # Hide the instruction part of the round from the loss.
            target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID

            cur_len += round_len
        # Everything after the last processed round (padding included) is ignored.
        target[cur_len:] = IGNORE_TOKEN_ID

        # Debugging aid: flip to True to print the unmasked part of each row.
        if False:
            z = target.clone()
            z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
            rank0_print(tokenizer.decode(z))

        # Compare against the length actually used for truncation. Rows that
        # were cut at `max_length` legitimately have cur_len >= max_length and
        # must not be discarded as tokenization mismatches.
        if cur_len < max_length:
            if cur_len != total_len:
                target[:] = IGNORE_TOKEN_ID
                rank0_print(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}"
                    f"(ignored)"
                )
    return dict(
        input_ids=input_ids,
        labels=targets,
        attention_mask=input_ids.ne(tokenizer.pad_token_id),
    )

In [27]:
class SupervisedDataset(Dataset):
    """Supervised fine-tuning dataset that tokenizes everything up front.

    All examples are run through ``preprocess`` once in the constructor and the
    resulting ``input_ids``, ``labels`` and ``attention_mask`` are kept on the
    instance; indexing only slices them.
    """

    def __init__(self, raw_data,tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()

        rank0_print("Formatting inputs...")
        # Each raw example carries its turns under the "conversations" key.
        conversations = []
        for example in raw_data:
            conversations.append(example["conversations"])
        encoded = preprocess(conversations, tokenizer)

        self.input_ids = encoded["input_ids"]
        self.labels = encoded["labels"]
        self.attention_mask = encoded["attention_mask"]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str,torch.Tensor]:
        """Return the ``i``-th entry of each stored field as a dict."""
        return {
            "input_ids": self.input_ids[i],
            "labels": self.labels[i],
            "attention_mask": self.attention_mask[i],
        }

In [28]:
class LazySupervisedDataset(Dataset):
    """Supervised fine-tuning dataset that tokenizes examples on first access.

    Raw examples are stored untouched; ``__getitem__`` runs ``preprocess`` on a
    single example the first time it is requested and memoizes the result.
    """

    def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()
        rank0_print("Formatting inputs... Skip in lazy mode")
        self.tokenizer = tokenizer
        self.raw_data = raw_data
        # Maps an index to its already preprocessed example.
        self.cached_data_dict = {}

    def __len__(self):
        """Return the number of raw examples."""
        return len(self.raw_data)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the preprocessed example ``i``, computing it at most once."""
        try:
            return self.cached_data_dict[i]
        except KeyError:
            pass

        # preprocess works on batches, so wrap the single conversation in a
        # list and take row 0 of every returned field.
        batch = preprocess([self.raw_data[i]["conversations"]], self.tokenizer)
        item = {
            key: batch[key][0]
            for key in ("input_ids", "labels", "attention_mask")
        }
        self.cached_data_dict[i] = item
        return item

In [29]:
def make_supervised_data_module(
        tokenizer: transformers.PreTrainedTokenizer,data_path, lazy_preprocess: bool = False,
        seed: Optional[int] = None,
) -> list:
    """Load a JSON file of conversations and build train/eval datasets.

    Args:
        tokenizer: Tokenizer handed to the dataset class.
        data_path: Path of the JSON file holding the raw examples.
        lazy_preprocess: Use ``LazySupervisedDataset`` when true, otherwise
            ``SupervisedDataset``.
        seed: Optional seed for the shuffle behind the split. ``None`` keeps the
            previous behaviour of drawing from NumPy's global random state.

    Returns:
        ``[train_dataset, eval_dataset]``: the first 98% of the shuffled
        examples (rounded down) and the remaining examples.
    """
    dataset_cls = (
        LazySupervisedDataset if lazy_preprocess else SupervisedDataset
    )
    rank0_print("Loading data...")
    # Close the file handle deterministically once the JSON is parsed.
    with open(data_path, "r") as data_file:
        raw_data = json.load(data_file)

    # Split train/test
    if seed is None:
        perm = np.random.permutation(len(raw_data))
    else:
        # A private generator makes the split reproducible without touching
        # the global random state.
        perm = np.random.RandomState(seed).permutation(len(raw_data))
    split = int(len(perm) * 0.98)
    train_indices = perm[:split]
    eval_indices = perm[split:]
    train_raw_data = [raw_data[i] for i in train_indices]
    eval_raw_data = [raw_data[i] for i in eval_indices]
    rank0_print(f'#train: {len(train_raw_data)} #eval: {len(eval_raw_data)}')

    train_dataset = dataset_cls(raw_data=train_raw_data, tokenizer=tokenizer)
    eval_dataset = dataset_cls(raw_data=eval_raw_data, tokenizer=tokenizer)
    return [train_dataset, eval_dataset]

In [30]:
# Make sure the tokenizer has a padding token before any padded batch is built.
if tokenizer.pad_token is not None:
    print(tokenizer.pad_token)
else:
    print("Adding pad token to tokenizer")
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    # Resize the embedding matrix to the new tokenizer length.
    model.resize_token_embeddings(len(tokenizer))

# Build the datasets eagerly from the sample file.
data_module = make_supervised_data_module(
    tokenizer=tokenizer,
    data_path='dummy.json',
    lazy_preprocess=False,
)

[PAD]
