In [None]:
# NOTE(review): the 'ignore' filter silences every Python warning raised after this
# cell runs, including deprecation warnings from the libraries imported below.
import warnings
warnings.filterwarnings('ignore')

In [1]:
# Load the Mistral-7B fine-tuning configuration via the project-local helper.
from utils import get_config
config = get_config("configs/mistral_7b.json")

In [2]:
# Display the loaded configuration (trainer, LoRA and tokenization-length settings).
config

{'trainer': {'evaluation_strategy': 'steps',
  'per_device_train_batch_size': 2,
  'per_device_eval_batch_size': 2,
  'gradient_accumulation_steps': 64,
  'eval_steps': 50,
  'save_steps': 50,
  'logging_steps': 5,
  'learning_rate': 0.00025,
  'num_train_epochs': 4,
  'lr_scheduler_type': 'cosine',
  'warmup_steps': 30,
  'fp16': False,
  'bf16': True,
  'torch_compile': False,
  'optim': 'adamw_torch'},
 'lora': {'r': 8,
  'lora_alpha': 16,
  'lora_dropout': 0.05,
  'bias': 'none',
  'target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj'],
  'task_type': 'CAUSAL_LM'},
 'load_in_8bit': True,
 'only_target_loss': True,
 'model_name': 'mistralai/Mistral-7B-Instruct-v0.1',
 'is_adapter': False,
 'max_source_tokens_count': 1000,
 'max_target_tokens_count': 1000}

In [3]:
from transformers import AutoTokenizer
from dotenv import load_dotenv
# Load variables from a .env file into the environment before touching the hub
# (presumably credentials for downloading the model — confirm against the .env).
load_dotenv()
model_name = config["model_name"]

# Tokenizer for the model named in the Mistral config loaded above.
tokenizer_mistral = AutoTokenizer.from_pretrained(model_name)

In [4]:
# NOTE(review): this rebinds `config` and `model_name`; after this cell they refer to
# the Llama-2 LoRA config, not the Mistral config displayed earlier in the notebook.
config = get_config("configs/llama2_7b_lora.json")
model_name = config["model_name"]
tokenizer_llama = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Names of the special-token attributes the tokenizer exposes (attribute names, not token values).
tokenizer_mistral.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [6]:
# Same attribute-name list for the Llama tokenizer, for comparison with the Mistral one.
tokenizer_llama.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [7]:
# NOTE(review): this displays the whole token -> id mapping (32000 entries per the
# length check further down), which makes for a very long cell output.
tokenizer_mistral.vocab


{'▁tragedy': 25466,
 'iss': 815,
 '皮': 31279,
 '▁profound': 19327,
 '▁medicine': 12502,
 '!)': 14280,
 '▁pain': 3358,
 '▁Bag': 17699,
 '▁flags': 6809,
 '▁cush': 25602,
 'itel': 13745,
 'ibility': 3032,
 'ে': 29914,
 '岩': 31581,
 '▁part': 744,
 '▁electricity': 17242,
 '▁bad': 2607,
 'iels': 23029,
 'бор': 12660,
 'rq': 11916,
 '▁Define': 27350,
 '▁twelve': 13153,
 'xml': 6459,
 '=[': 12303,
 'scrib': 23418,
 '深': 30330,
 'ᴛ': 31083,
 '▁Parker': 19673,
 'NR': 20409,
 '▁че': 4640,
 'Serialization': 21046,
 '▁Feder': 12969,
 '▁Where': 6926,
 '▁aboard': 22446,
 'notice': 26814,
 '▁git': 19163,
 '▁nl': 25745,
 'dots': 10234,
 'pgfpath': 11370,
 '▁happen': 4804,
 'Bal': 13450,
 'Zone': 13962,
 '▁fixture': 20284,
 'Changed': 8195,
 '▁учи': 22102,
 '▁Kor': 17872,
 '▁BAS': 11502,
 '▁Rights': 12744,
 '▁Mall': 20098,
 'тель': 4446,
 'iffe': 27007,
 'ponential': 25723,
 '▁Formula': 28532,
 'pgfscope': 28593,
 '▁Williams': 10606,
 'Men': 22963,
 'ძ': 31464,
 '▁shorter': 19367,
 'idden': 4131,
 '5': 

In [8]:
# Token ids for a sample name; the leading 1 in the output is the BOS id (`<s>`).
tokenizer_mistral.encode("Mohamed Amri")

[1, 13610, 3000, 2740, 373]

In [9]:
# Same text through the Llama tokenizer; the ids differ from the Mistral encoding above.
tokenizer_llama.encode("Mohamed Amri")

[1, 12929, 2795, 1913, 374]

In [10]:
# Compare vocabulary sizes of the two tokenizers.
len(tokenizer_mistral.vocab), len(tokenizer_llama.vocab)

(32000, 32000)

In [11]:
class FineTuner:
    """Bundle a fine-tuning config with the tokenizer of the model it names.

    The config is read with ``get_config`` and the tokenizer is loaded with
    ``AutoTokenizer.from_pretrained`` for ``config["model_name"]``.
    """

    def __init__(self, config_file: str):
        """Load the config at ``config_file`` and the matching tokenizer.

        Args:
            config_file: path handed to ``get_config``.

        Raises:
            KeyError: (for a dict-like config) if ``model_name``,
                ``max_source_tokens_count`` or ``max_target_tokens_count``
                is absent. ``lora`` and ``only_target_loss`` are optional.
        """
        self.config = get_config(config_file)
        self.model_name = self.config["model_name"]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.lora_config = self.config.get("lora")
        self.only_target_loss = self.config.get("only_target_loss", True)
        self.max_source_tokens_count = self.config["max_source_tokens_count"]
        self.max_target_tokens_count = self.config["max_target_tokens_count"]

        print(f"Model name -> {self.model_name}")
        print(f"lora_config -> {self.lora_config}")
        print(f"only_target_loss -> {self.only_target_loss}")
        print(f"max_source_tokens_count -> {self.max_source_tokens_count}")
        print(f"max_target_tokens_count -> {self.max_target_tokens_count}")

    def fix_tokenizer(self, scan_limit: int = 1000):
        """Fill in missing PAD/BOS/EOS/UNK/SEP special tokens on ``self.tokenizer``.

        A special token counts as missing when its id is ``None`` or equals
        ``tokenizer.vocab_size``. For each missing one, the first
        ``scan_limit`` ids are scanned for a token containing a marker
        substring (``pad``, ``<s>``, ``</s>``, ``unk``, ``sep``); the last
        match wins. Fallbacks afterwards:

        * SEP: a BOS token found by the scan, else the tokenizer's BOS
          token, else the new token ``<|sep|>``.
        * PAD: the tokenizer's UNK token, else the new token ``<|pad|>``.

        The result is registered with ``tokenizer.add_special_tokens`` and
        the resulting ids/tokens are printed.

        Args:
            scan_limit: number of leading vocabulary ids to inspect
                (defaults to the previously hard-coded 1000).
        """
        tokenizer = self.tokenizer

        def is_missing(token_id) -> bool:
            return token_id is None or token_id == tokenizer.vocab_size

        # (special-token attribute, substring that identifies a candidate token)
        markers = (
            ("pad_token", "pad"),
            ("bos_token", "<s>"),
            ("eos_token", "</s>"),
            ("unk_token", "unk"),
            ("sep_token", "sep"),
        )
        # The ids cannot change before add_special_tokens() below, so the
        # "missing" status is computed once instead of on every iteration.
        missing = {
            name: is_missing(getattr(tokenizer, f"{name}_id")) for name, _ in markers
        }

        special_tokens = dict()
        if any(missing.values()):
            for token_id in range(scan_limit):
                token = tokenizer.convert_ids_to_tokens(token_id)
                # Unassigned ids (small vocabularies, gaps) can come back as
                # None; a substring test on that would raise TypeError.
                if not isinstance(token, str):
                    continue
                for name, marker in markers:
                    if missing[name] and marker in token:
                        special_tokens[name] = token

        # A BOS token found by the scan takes precedence as the separator.
        if missing["sep_token"] and "bos_token" in special_tokens:
            special_tokens["sep_token"] = special_tokens["bos_token"]

        if missing["pad_token"] and "pad_token" not in special_tokens:
            if tokenizer.unk_token_id is not None:
                special_tokens["pad_token"] = tokenizer.unk_token
            else:
                special_tokens["pad_token"] = "<|pad|>"

        if missing["sep_token"] and "sep_token" not in special_tokens:
            if tokenizer.bos_token_id is not None:
                special_tokens["sep_token"] = tokenizer.bos_token
            else:
                special_tokens["sep_token"] = "<|sep|>"

        tokenizer.add_special_tokens(special_tokens)

        print("Vocab size: ", tokenizer.vocab_size)
        print("PAD: ", tokenizer.pad_token_id, tokenizer.pad_token)
        print("BOS: ", tokenizer.bos_token_id, tokenizer.bos_token)
        print("EOS: ", tokenizer.eos_token_id, tokenizer.eos_token)
        print("UNK: ", tokenizer.unk_token_id, tokenizer.unk_token)
        print("SEP: ", tokenizer.sep_token_id, tokenizer.sep_token)
        self.tokenizer = tokenizer
                    

In [12]:
# Build the helper from the Mistral config; the constructor prints the settings it read.
fine_tuner = FineTuner("configs/mistral_7b.json")

Model name -> mistralai/Mistral-7B-Instruct-v0.1
lora_config -> {'r': 8, 'lora_alpha': 16, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj'], 'task_type': 'CAUSAL_LM'}
only_target_loss -> True
max_source_tokens_count -> 1000
max_target_tokens_count -> 1000


In [13]:
# Fill in missing special tokens; per the printed result PAD reuses <unk> and SEP reuses <s>.
fine_tuner.fix_tokenizer()

Vocab size:  32000
PAD:  0 <unk>
BOS:  1 <s>
EOS:  2 </s>
UNK:  0 <unk>
SEP:  1 <s>


In [14]:
# Display the LoRA section of the loaded config.
fine_tuner.lora_config

{'r': 8,
 'lora_alpha': 16,
 'lora_dropout': 0.05,
 'bias': 'none',
 'target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj'],
 'task_type': 'CAUSAL_LM'}

In [39]:
from typing import Dict, Literal, List
from pydantic import BaseModel
from tqdm import tqdm

import torch
from torch.utils.data import Dataset

class Instruction(BaseModel):
    """One instruction-tuning example, validated by pydantic.

    Of these fields, the InstructDataset code in this notebook tokenizes only
    ``source`` (as the model input) and ``output`` (as the target).
    """
    # Task description text — presumably the instruction part of the prompt; TODO confirm.
    instruction: str
    # Raw input text the instruction applies to — presumably; TODO confirm.
    input: str
    # Target text the model is trained to produce.
    output: str
    # Text fed to the model as the source sequence.
    source: str
    # Mapping from a label to a list of strings (e.g. {"colors": ["white"]} in the demo cell).
    raw_entities: Dict[str, List[str]]
    
class InstructDataset(Dataset):
    """Map-style dataset of pre-tokenized instruction examples.

    Every instruction is converted to tensors eagerly in ``__init__``;
    ``__getitem__`` only returns the stored result. Causal model types
    (``llama``, ``mistral``, ``rwkv``) get one concatenated source+target
    sequence; ``t5`` gets separate encoder inputs and labels.
    """

    _CAUSAL_MODEL_TYPES = ("llama", "mistral", "rwkv")
    _SEQ2SEQ_MODEL_TYPES = ("t5",)

    def __init__(
            self,
            instructions: "List[Instruction]",
            tokenizer,
            max_source_tokens_count: int,
            max_target_tokens_count: int,
            model_type: Literal["llama", "mistral", "rwkv", "t5"] = "mistral",
            only_target_loss: bool = True,
            padding: bool = False
    ):
        """Tokenize all ``instructions`` up front.

        Args:
            instructions: examples exposing ``.source`` and ``.output``
                (annotation quoted so the class does not need ``Instruction``
                at definition time).
            tokenizer: callable tokenizer exposing ``bos_token_id``,
                ``eos_token_id`` and ``pad_token_id``.
            max_source_tokens_count: truncation length for the source text.
            max_target_tokens_count: truncation length for the target text.
            model_type: selects the causal or the seq2seq conversion.
            only_target_loss: causal only; mask the source positions in the labels.
            padding: causal only; right-pad every example to
                ``max_source_tokens_count + max_target_tokens_count + 2``.
                The seq2seq conversion does not read this flag.

        Raises:
            ValueError: if ``model_type`` is not one of the supported values.
        """
        if model_type not in self._CAUSAL_MODEL_TYPES + self._SEQ2SEQ_MODEL_TYPES:
            raise ValueError("model_type must be either llama or mistral or rwkv or t5")

        self.instructions = instructions
        self.tokenizer = tokenizer
        self.max_source_tokens_count = max_source_tokens_count
        self.max_target_tokens_count = max_target_tokens_count
        self.model_type = model_type
        self.only_target_loss = only_target_loss
        self.padding = padding

        if self.model_type in self._CAUSAL_MODEL_TYPES:
            convert = self.convert_instruction_causal
        else:
            convert = self.convert_instruction_seq2seq

        self.processed_instructions = []
        for instruction in tqdm(instructions):
            self.processed_instructions.append(convert(instruction))

    def __len__(self):
        """Number of processed examples."""
        return len(self.processed_instructions)

    def __getitem__(self, index):
        """Return the pre-computed tensors of example ``index``."""
        return self.processed_instructions[index]

    def _tokenize_plain(self, text: str, max_length: int):
        """Token ids of ``text`` without special tokens, truncated to ``max_length``."""
        encoded = self.tokenizer(
            text,
            add_special_tokens=False,
            max_length=max_length,
            padding=False,
            truncation=True,
        )
        return list(encoded["input_ids"])

    def convert_instruction_causal(self, instruction: "Instruction"):
        """Build ``input_ids``/``attention_mask``/``labels`` for a causal LM.

        Layout: ``[BOS] + source + target + [EOS]`` (+ PAD up to the fixed
        length when ``self.padding`` is set). Labels are a copy of
        ``input_ids`` with -100 on padding positions and, when
        ``self.only_target_loss`` is set, on the BOS+source positions.
        -100 is presumably the loss's ignore index — confirm against the trainer.

        Returns:
            dict with 1-D ``torch.LongTensor`` values of equal length.
        """
        source_tokens = self._tokenize_plain(
            instruction.source, self.max_source_tokens_count
        )

        # Compare against None: a BOS id of 0 is valid but falsy, and a plain
        # truthiness test would silently drop it.
        bos_token_id = self.tokenizer.bos_token_id
        if bos_token_id is not None:
            source_tokens.insert(0, bos_token_id)

        target_tokens = self._tokenize_plain(
            instruction.output, self.max_target_tokens_count
        )

        token_ids = source_tokens + target_tokens + [self.tokenizer.eos_token_id]
        actual_length = len(token_ids)

        if self.padding:
            # +2 leaves room for the BOS and EOS tokens around the truncated texts.
            max_length = self.max_source_tokens_count + self.max_target_tokens_count + 2
            pad_count = max(0, max_length - actual_length)
            token_ids.extend([self.tokenizer.pad_token_id] * pad_count)

        input_ids = torch.LongTensor(token_ids)
        labels = input_ids.clone()
        attention_mask = input_ids.new_ones(input_ids.size())

        if self.padding:
            labels[actual_length:] = -100
            attention_mask[actual_length:] = 0

        if self.only_target_loss:
            # source_tokens already includes BOS when one was prepended.
            labels[:len(source_tokens)] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    def convert_instruction_seq2seq(self, instruction: "Instruction"):
        """Build encoder inputs and ``labels`` for a seq2seq model.

        The source is tokenized with special tokens and returned as the
        tokenizer's tensors with the batch dimension squeezed out. The
        target ids become ``labels``; EOS is appended unless it is already
        the last id, so ``labels`` may be one longer than
        ``max_target_tokens_count``. No padding or loss masking is applied.

        Returns:
            dict of 1-D tensors: the tokenizer's outputs plus ``labels``.
        """
        encoded_source = self.tokenizer(
            instruction.source,
            add_special_tokens=True,
            max_length=self.max_source_tokens_count,
            padding=False,
            truncation=True,
            return_tensors="pt",
        )
        inputs = {key: value.squeeze(0) for key, value in encoded_source.items()}

        encoded_target = self.tokenizer(
            instruction.output,
            add_special_tokens=True,
            max_length=self.max_target_tokens_count,
            padding=False,
            truncation=True,
            return_tensors="pt",
        )
        labels = encoded_target["input_ids"].squeeze(0).tolist()
        # `not labels` guards the empty-target case, where labels[-1] would raise IndexError.
        if not labels or labels[-1] != self.tokenizer.eos_token_id:
            labels.append(self.tokenizer.eos_token_id)

        inputs["labels"] = torch.LongTensor(labels)
        return inputs

In [40]:
# Smoke test: push one hand-made Instruction through the seq2seq ("t5") conversion
# using the Mistral tokenizer held by fine_tuner. Only `source` and `output` are tokenized.
# NOTE(review): padding=True has no effect on this path — convert_instruction_seq2seq
# does not read self.padding, which is why the output below is unpadded.
instruct_dataset = InstructDataset(
    instructions=[Instruction(
        instruction="Extract entities",
        input="whiteub;lk;l ",
        output="lkn;l;';",
        source="unknown",
        raw_entities={"colors": ["white"]}
    )],
    tokenizer=fine_tuner.tokenizer,
    max_source_tokens_count=10,
    max_target_tokens_count=10,
    model_type="t5",
    only_target_loss=True,
    padding=True,
)
instruct_dataset[0]

100%|██████████| 1/1 [00:00<00:00, 168.18it/s]


{'input_ids': tensor([   1, 9038]),
 'attention_mask': tensor([1, 1]),
 'labels': tensor([    1,   305, 13223, 28745, 28714, 28745,  1775,     2])}

In [68]:
from datasets import load_dataset
