# Set parameters

In [14]:
# modelpath="models/TinyLlama-1.1B-intermediate-step-1431k-3T"
modelpath="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_name="g-ronimo/oasst2_top1_en"
lr=0.00002      # learning rate
bs=1            # batch size
bs_eval=16      # batch size for evals
ga_steps=16     # gradient acc. steps
epochs=4
max_length=2048      # samples max. length
output_dir="out"

# Load model and tokenizer

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# model = AutoModelForCausalLM.from_pretrained(
#     modelpath,    
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     # attn_implementation="flash_attention_2",
# )

tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)    # fast tokenizer sometimes ignores added tokens

# Add ChatML tokens 

In [16]:
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))

1

# Load and prepare OA2 dataset

In [17]:
from datasets import load_dataset
from functools import partial
import os

# Load Dataset
dataset = load_dataset(dataset_name)
dataset = dataset["train"].train_test_split(test_size=0.1)

# chatML Template and tokenize dataset
templates=[
    "<|im_start|>assistant\n{msg}<|im_end|>",
    "<|im_start|>user\n{msg}<|im_end|>"
]
IGNORE_INDEX=-100

# tokenize dataset, set input_ids and attention_mask to train on assistant outputs only
def tokenize(input, max_length):
    input_ids, attention_mask, labels = [], [], []

    for i,msg in enumerate(input["conversation"]):
        isHuman = msg["role"]=="user"
        msg_chatml=templates[isHuman].format(msg=msg["content"])
        msg_tokenized=tokenizer(msg_chatml, truncation=False, add_special_tokens=False)
    
        input_ids+=msg_tokenized["input_ids"]
        attention_mask+=msg_tokenized["attention_mask"]
        labels+=[IGNORE_INDEX]*len(msg_tokenized["input_ids"]) if isHuman else msg_tokenized["input_ids"]

    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

dataset_tokenized = dataset.map(
    partial(tokenize, max_length=max_length), 
    batched=False, 
    # num_proc=os.cpu_count(),    # multithreaded
    remove_columns=dataset["train"].column_names  # don't need this anymore, we have tokens from here on
)

Map:   0%|          | 0/4877 [00:00<?, ? examples/s]Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not recognized.
Keyword arguments {'add_special_tokens': False} not 

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conversation'],
        num_rows: 4877
    })
    test: Dataset({
        features: ['conversation'],
        num_rows: 542
    })
})

In [19]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4877
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 542
    })
})

In [20]:
# collate function - to transform list of dictionaries [ {input_ids: [123, ..]}, {.. ] to single batch dictionary { input_ids: [..], labels: [..], attention_mask: [..] }
def collate(elements):
    tokens=[e["input_ids"] for e in elements]
    tokens_maxlen=max([len(t) for t in tokens])

    for i,sample in enumerate(elements):
        input_ids=sample["input_ids"]
        labels=sample["labels"]
        attention_mask=sample["attention_mask"]

        pad_len=tokens_maxlen-len(input_ids)

        input_ids.extend( pad_len * [tokenizer.pad_token_id] )   
        labels.extend( pad_len * [IGNORE_INDEX] )    
        attention_mask.extend( pad_len * [0] ) 

    batch={
        "input_ids": torch.tensor( [e["input_ids"] for e in elements] ).numpy(),
        "labels": torch.tensor( [e["labels"] for e in elements] ).numpy(),
        "attention_mask": torch.tensor( [e["attention_mask"] for e in elements] ).numpy(),
    }

    return batch

# Generating artifacts

In [50]:

from onnxruntime.training import artifacts
import torch
import onnx
import transformers
import numpy as np
transformers_model = transformers.LlamaForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", ignore_mismatched_sizes=True)
tokenizer = transformers.AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
dataloader = torch.utils.data.DataLoader(dataset_tokenized["train"], batch_size=bs, shuffle=True, collate_fn = collate)
batch = {}
for batch_from_dl in dataloader:
    batch = batch_from_dl
    break
inputs = (torch.tensor(batch['input_ids'], dtype=torch.int), torch.tensor(batch['attention_mask'], dtype=torch.int))
print(inputs[0].shape)
class FlatModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *local_inputs):
        # return self.model(batch['input_ids'], batch['attention_mask'])
        return self.model(inputs[0], inputs[1])

model = FlatModel(transformers_model)
# inputs = (np.array(batch['input_ids']).tolist(), np.array(batch['attention_mask']).tolist())
# model = transformers_model
input_names = ["input_ids", "attention_mask"]
output_names = ["loss", "logits"]

torch.onnx.export(model,
                  inputs,
                #   (torch.Tensor(batch['input_ids']), torch.Tensor(batch['attention_mask'])),
                #   (inputs["input_kids"], inputs["attention_mask"]),
                #   batch,
                  "tinyllama_1_hidden.onnx",
                  input_names = input_names, 
                  output_names = output_names,
                  export_params=True,
                  opset_version=14,
                  training=torch.onnx.TrainingMode.TRAINING,
                  do_constant_folding=False,
                  dynamic_axes={
                    "input_ids": {0: "batch_size", 1: "sequence_length"},
                    "attention_mask": {0: "batch_size", 1: "sequence_length"},
                    "logits": {0: "batch_size", 1: "sequence_length"}
                  }
                  )

print('done with exporting')
onnx_model_path = "tinyllama_1_hidden.onnx"
# onnx_model_path = "mnist.onnx"
onnx_model = onnx.load(onnx_model_path)
requires_grad = [param.name for param in onnx_model.graph.initializer] # if param.name not in requires_grad]
frozen_params = []
artifacts.generate_artifacts(
    onnx_model,
    requires_grad=requires_grad,
    frozen_params=frozen_params,
    loss=artifacts.LossType.CrossEntropyLoss,
    artifact_directory="artifacts_generated_full",
    optimizer=artifacts.OptimType.AdamW,
    ort_format=False,
    loss_input_names=["loss"]
)

Some weights of the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 were not used when initializing LlamaForCausalLM: ['model.layers.1.input_layernorm.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.10.input_layernorm.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.10.mlp.gate_proj.weight', 'model.layers.10.mlp.up_proj.weight', 'model.layers.10.post_attention_layernorm.weight', 'model.layers.10.self_attn.k_proj.weight', 'model.layers.10.self_attn.o_proj.weight', 'model.layers.10.self_attn.q_proj.weight', 'model.layers.10.self_attn.v_proj.weight', 'model.layers.11.input_layernorm.weight', 'model.layers.11.mlp.down_proj.weight', 'model.layers.11.mlp.gate_proj.weig

torch.Size([1, 1102])


IndexError: index out of range in self

# Train

In [21]:
import torch
import onnxruntime.training.api as ort_api
import onnx

In [22]:
# state = ort_api.CheckpointState.load_checkpoint('artifacts_generated_l1/checkpoint')
# training_model = ort_api.Module('artifacts_generated_l1/training_model_corrected_labels.onnx', state, 'artifacts_generated_l1/eval_model.onnx')
# optimizer = ort_api.Optimizer('artifacts_generated_l1/optimizer_model.onnx', training_model)

state = ort_api.CheckpointState.load_checkpoint('artifacts_generated_full/checkpoint')
training_model = ort_api.Module('artifacts_generated_full/training_model.onnx', state, 'artifacts_generated_full/eval_model.onnx')
optimizer = ort_api.Optimizer('artifacts_generated_full/optimizer_model.onnx', training_model)

In [23]:
dataloader = torch.utils.data.DataLoader(dataset_tokenized["train"], batch_size=bs, shuffle=True, collate_fn = collate)

In [24]:
training_model.input_names()

['input_ids', 'attention_mask', 'labels']

In [48]:
def trainEpoch():
    training_model.train()
    losses = []
    i = 0
    for batch in dataloader:
        print(i, 'out of', len(dataloader))
        forward_inputs = [batch["input_ids"], batch["attention_mask"], batch["labels"]]
        print("input ids shape", batch["input_ids"].shape)
        print("attention mask shape", batch["attention_mask"].shape)
        print("labels shape", batch["labels"].shape)

        loss, _ = training_model(*forward_inputs)
        print('after training acll')
        optimizer.step()
        training_model.lazy_reset_grad()
        losses.append(loss)
        print(loss)
        i += 1

In [49]:
trainEpoch()

0 out of 4877


AttributeError: 'numpy.ndarray' object has no attribute 'type'

In [20]:
import onnx

model = onnx.load("artifacts_generated_l1/training_model.onnx")


In [21]:
print(model.graph.input[2])
import copy
labels_input = copy.deepcopy(model.graph.input[0])
labels_input.name = "labels"
labels_input.type.tensor_type.elem_type = onnx.TensorProto.INT64
model.graph.input[2].CopyFrom(labels_input)
print(model.graph.input[2].type.tensor_type.shape)

name: "labels"
type {
  tensor_type {
    elem_type: 7
    shape {
      dim {
        dim_param: "Castloss_dim_0"
      }
      dim {
        dim_value: 32000
      }
    }
  }
}

dim {
  dim_param: "batch_size"
}
dim {
  dim_param: "sequence_length"
}



In [22]:
onnx.save(model, "artifacts_generated_l1/training_model_corrected_labels.onnx")