In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import onnxruntime.training.api as ort_api
import torch
from datasets import load_dataset
from functools import partial
import time


  from .autonotebook import tqdm as notebook_tqdm


[2024-05-20 23:28:04,815] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [5]:

# ---- Run configuration (everything a reader might want to tune) ----
artifacts_dir = "artifacts_no_batchsize"

# Hugging Face hub identifiers for the model and the dataset.
modelpath = "microsoft/Phi-3-mini-4k-instruct"
dataset_name = "g-ronimo/oasst2_top1_en"

# Optimisation hyper-parameters.
lr = 8e-6        # learning rate
bs = 1           # batch size
bs_eval = 16     # batch size for evals
ga_steps = 16    # gradient accumulation steps
epochs = 1

# Samples are cut to at most this many tokens during tokenization.
max_length = 1048
output_dir = "out"

# Load the tokenizer with use_fast=False: the fast tokenizer sometimes ignores added tokens.
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
# tokenizer.pad_token = "<PAD>"
print("tokenizer pad token: ", tokenizer.pad_token)
# tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

# Load Dataset
# dataset = load_dataset(dataset_name, download_mode="force_redownload")
dataset = load_dataset(dataset_name)

# Hold out 10% of the "train" split for evaluation. The split shuffles the
# rows, so pin the seed: without it every kernel restart yields a different
# train/test partition and results cannot be compared between runs.
SPLIT_SEED = 42
dataset = dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Per-message chat templates, indexed by "is this a user message?":
#   templates[False] / templates[0] -> assistant turn
#   templates[True]  / templates[1] -> user turn
# The tags used here are <|user|> / <|assistant|> / <|end|>.
templates = [
    "<|assistant|>\n{msg}<|end|>\n",
    "<|user|>\n{msg}<|end|>\n",
]

# Label value written over user tokens and padding positions.
# NOTE(review): presumably this must equal the ignore index the training loss
# was configured with -- confirm before changing.
IGNORE_INDEX = 32000

def get_position_ids(attention_mask):
    """Derive position ids from an attention mask.

    Each position receives the running count of non-zero mask entries along
    the last dimension, minus one. Positions whose mask value is 0 are then
    overwritten with 1.

    Args:
        attention_mask: torch tensor of 0/1 values.

    Returns:
        A new integer (long) tensor with the same shape as ``attention_mask``;
        the input tensor is not modified.
    """
    running_count = attention_mask.long().cumsum(-1)
    is_masked_out = attention_mask == 0

    # Shape: same as attention_mask, e.g. (batch_size, sequence_length)
    return (running_count - 1).masked_fill(is_masked_out, 1)

# Tokenize one conversation; labels are masked so that only assistant outputs are trained on.
def tokenize(input, max_length):
    """Turn one dataset row into token-level training features.

    Every message in ``input["conversation"]`` is rendered with the template
    for its role and tokenized without special tokens and without truncation.
    The per-message results are concatenated, then cut to ``max_length``.

    Args:
        input: dataset row with a ``"conversation"`` list of messages, each
            having ``"role"`` and ``"content"`` entries.
        max_length: maximum number of tokens kept per sample.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` as lists,
        and ``position_ids`` as a tensor built from the truncated attention
        mask. In ``labels``, tokens of user messages are replaced by
        ``IGNORE_INDEX``; all other tokens keep their id.
    """
    token_ids, mask, targets = [], [], []

    for turn in input["conversation"]:
        from_user = turn["role"] == "user"
        # A bool indexes the template list: False -> assistant, True -> user.
        rendered = templates[from_user].format(msg=turn["content"])
        encoded = tokenizer(rendered, truncation=False, add_special_tokens=False)
        turn_ids = encoded["input_ids"]

        token_ids.extend(turn_ids)
        mask.extend(encoded["attention_mask"])
        if from_user:
            targets.extend([IGNORE_INDEX] * len(turn_ids))
        else:
            targets.extend(turn_ids)

    truncated_mask = mask[:max_length]
    return {
        "input_ids": token_ids[:max_length],
        "attention_mask": truncated_mask,
        "position_ids": get_position_ids(torch.tensor(truncated_mask)),
        "labels": targets[:max_length],
    }

print('attempting tokenizing dataset')

# Apply tokenize() to every row, one row at a time, in every split of `dataset`.
# The original columns are dropped: only the token-level features are needed
# from here on.
# (Optional speed-up, currently disabled: num_proc=os.cpu_count())
dataset_tokenized = dataset.map(
    partial(tokenize, max_length=max_length),
    batched=False,
    remove_columns=dataset["train"].column_names,
)

print('after dataset_tokenized is generated yayy')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tokenizer pad token:  <|endoftext|>


attempting tokenizing dataset


Map:  71%|███████▏  | 3475/4877 [00:08<00:02, 481.86 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5216 > 4096). Running this sequence through the model will result in indexing errors
Map: 100%|██████████| 4877/4877 [00:11<00:00, 433.57 examples/s]
Map: 100%|██████████| 542/542 [00:01<00:00, 437.17 examples/s]

after dataset_tokenized is generated yayy





In [6]:

# collate function - transforms a list of dictionaries [ {input_ids: [123, ..]}, {.. ] into a
# single batch dictionary { input_ids: [..], labels: [..], position_ids: [..], attention_mask: [..] }
def collate(elements):
    """Pad tokenized samples to a common length and stack them into a batch.

    Every sample is right-padded to the length of the longest ``input_ids``
    in ``elements``. Padding values: ``tokenizer.pad_token_id`` for
    ``input_ids``, ``IGNORE_INDEX`` for ``labels``, ``1`` for
    ``position_ids`` and ``0`` for ``attention_mask``.

    The samples in ``elements`` are NOT modified: padded copies are built
    instead. Padding in place would leave any sample that is held in memory
    and reused permanently padded, inflating later batches.

    Args:
        elements: non-empty list of dicts, each with ``input_ids``,
            ``labels``, ``position_ids`` and ``attention_mask`` sequences.
            All four are assumed to have the same length within a sample.

    Returns:
        dict mapping each of the four keys to a numpy array with one row per
        sample, each row padded to the longest ``input_ids`` length.

    Raises:
        ValueError: if ``elements`` is empty.
    """
    if not elements:
        raise ValueError("collate() received an empty batch")

    tokens_maxlen = max(len(sample["input_ids"]) for sample in elements)

    # (feature name, value used for the padded tail); order defines the key
    # order of the returned dict.
    pad_values = (
        ("input_ids", tokenizer.pad_token_id),
        ("labels", IGNORE_INDEX),
        ("position_ids", 1),
        ("attention_mask", 0),
    )

    batch = {}
    for key, pad_value in pad_values:
        rows = []
        for sample in elements:
            # The pad length is always derived from input_ids, for every feature.
            pad_len = tokens_maxlen - len(sample["input_ids"])
            rows.append(list(sample[key]) + [pad_value] * pad_len)
        batch[key] = torch.tensor(rows).numpy()

    return batch

In [7]:
# Load the causal-LM weights for `modelpath`.
# NOTE(review): trust_remote_code=True executes modelling code downloaded from
# the hub repository; the loader's own warning suggests pinning a revision to
# avoid silently picking up new versions of that code.
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    trust_remote_code=True,
)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 2/2 [00:00<00:00,  7.28it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:30<00:00, 15.18s/it]


In [12]:
# Substrings used to pick out parameters by name. In the recorded run only the
# "lm_head" and "model.layers.31.mlp" patterns matched anything.
matches = ["layers.32", "lm_head", "model.layers.31.mlp"]

# Print the name of every model parameter that contains at least one pattern.
for param_name, _param in model.named_parameters():
    if not any(pattern in param_name for pattern in matches):
        continue
    print(param_name)

model.layers.31.mlp.gate_up_proj.weight
model.layers.31.mlp.down_proj.weight
lm_head.weight
