## Generate training artifacts

In [1]:
from onnxruntime.training import artifacts
import torch
import onnx
import transformers
# from fastT5 import export_and_get_onnx_model
import onnxruntime.training.api as ort_api

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model = export_and_get_onnx_model("MBZUAI/LaMini-T5-61M")

In [3]:
# Load the pretrained "distilgpt2" checkpoint as a GPT2Model; this is the module
# that gets wrapped and exported to ONNX further down.
# The commented-out lines are earlier experiments with other checkpoints.
# NOTE(review): the export cell names the outputs "loss" and "logits" — confirm
# that GPT2Model (rather than a class with a language-modelling head) is intended.
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="distilbert/distilgpt2",
# )

# transformers_model = transformers.AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-Cerebras-111M")
# transformers_model = transformers.AutoModel.from_pretrained("facebook/blenderbot-400M-distill")
# transformers_model = transformers.AutoModel.from_pretrained("distilbert/distilgpt2")
transformers_model_gpt = transformers.GPT2Model.from_pretrained("distilgpt2")

In [4]:
# Tokenize one sample sentence into PyTorch tensors. The resulting
# `input_ids` / `attention_mask` are passed as example inputs to the ONNX export.
tokenizer = transformers.GPT2Tokenizer.from_pretrained("distilgpt2")
# tokenizer = transformers.GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 464, 3139,  286, 4881,  318, 6342,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [5]:
class FlatModel(torch.nn.Module):
    """Adapter that exposes a wrapped model through a flat positional signature.

    The exporter calls ``forward`` with a plain tuple of tensors; this wrapper
    forwards them to the wrapped model by keyword.
    """

    def __init__(self, model):
        """Store the wrapped module.

        Args:
            model: Module whose ``forward`` accepts ``input_ids`` and
                ``attention_mask`` keyword arguments.
        """
        super().__init__()
        self.model = model

    def forward(self, *local_inputs):
        """Run the wrapped model on the tensors passed by the caller.

        Args:
            *local_inputs: Exactly two tensors, ``(input_ids, attention_mask)``.

        Returns:
            Whatever the wrapped model returns.

        Raises:
            ValueError: If the number of positional tensors is not two.
        """
        if len(local_inputs) != 2:
            raise ValueError(
                "FlatModel.forward expects (input_ids, attention_mask), "
                f"got {len(local_inputs)} positional input(s)"
            )
        input_ids, attention_mask = local_inputs
        # Use the tensors handed in by the caller instead of a notebook global,
        # so the computation actually depends on the declared inputs. Keyword
        # arguments keep the mask from landing in a different positional slot
        # of the wrapped model's forward().
        return self.model(input_ids=input_ids, attention_mask=attention_mask)

# Wrap the GPT-2 module loaded above in FlatModel; `model` is what the export
# and parameter-collection cells below operate on.
# model = FlatModel(pipeline.model)
model = FlatModel(transformers_model_gpt)

In [6]:
# Names attached to the exported graph's inputs and outputs.
input_names = ["input_ids", "attention_mask"]
output_names = ["loss", "logits"]
# NOTE(review): confirm that "loss" / "logits" describe what the wrapped model
# actually returns; only "logits" receives dynamic axes below.

# The listed tensors get symbolic batch and sequence dimensions (axes 0 and 1).
dynamic_axes = {
    tensor_name: {0: "batch_size", 1: "sequence_length"}
    for tensor_name in ("input_ids", "attention_mask", "logits")
}

# Export in training mode with constant folding disabled and weights embedded.
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["attention_mask"]),
    "distilgpt2.onnx",
    export_params=True,
    training=torch.onnx.TrainingMode.TRAINING,
    do_constant_folding=False,
    opset_version=15,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes,
)

  if batch_size <= 0:


In [7]:
# Split the wrapped model's parameters into trainable and frozen name lists.
named_params = list(model.named_parameters())
num_named_params = len(named_params)
requires_grad = [param_name for param_name, tensor in named_params if tensor.requires_grad]
frozen_params = [param_name for param_name, tensor in named_params if not tensor.requires_grad]

# Buffers are always treated as frozen.
frozen_params.extend(buffer_name for buffer_name, _ in model.named_buffers())

# Reload the graph exported above and build the training artifacts from it.
# onnx_model = onnx.load("tinyllama.onnx")
onnx_model = onnx.load("distilgpt2.onnx")

# NOTE(review): no loss is passed (the CrossEntropyLoss line is commented out) and
# no output directory is given, while the test section loads files from
# "artifacts_generated_full/" — confirm which run produced those artifacts.
artifacts.generate_artifacts(
    onnx_model,
    requires_grad=requires_grad,
    frozen_params=frozen_params,
    optimizer=artifacts.OptimType.AdamW,
    # loss=artifacts.LossType.CrossEntropyLoss,
)



## Data pre-processing

In [4]:
import numpy as np
import pandas as pd
from datasets import Dataset

In [5]:
# Load the raw dialogue log; the cells below read its 'from' and 'text' columns.
df = pd.read_csv('../data/dialogueText.csv')

In [6]:
# Group the flat message log into conversations (lists of role/content dicts).
#
# Heuristic: the first speaker of a conversation is the "user" and anyone who
# answers is the "assistant". Once a responder exists, a message from a third
# participant (neither the current user nor the current responder) closes the
# conversation and starts a new one with that participant as its user.
current_convo = []
convos = []
current_user = ""
current_responder = ""

# Walk the two columns in lockstep rather than doing df[col][ind] lookups per row.
for sender, text in zip(df['from'], df['text']):
    sender, text = str(sender), str(text)

    if not current_user or sender == current_user:
        # Very first message, or the current user keeps talking.
        current_user = sender
        role = "user"
    elif current_responder and sender != current_responder:
        # A third participant speaks: close this conversation, open a new one.
        convos.append(current_convo)
        current_convo = []
        current_user = sender
        current_responder = ""
        role = "user"
    else:
        # First reply to the user, or the same responder continuing.
        current_responder = sender
        role = "assistant"

    current_convo.append({"role": role, "content": text})

# Flush the conversation still being built when the log ends; without this the
# final conversation never reaches `convos`.
if current_convo:
    convos.append(current_convo)

In [26]:
# Show the first grouped conversation as a sanity check.
convos[0]

[{'role': 'user',
  'content': "Hello folks, please help me a bit with the following sentence: 'Order here your personal photos or videos.' - I think the only allowed version is 'Order your personal videos or photos here.', but I'm not sure, are you?"},
 {'role': 'user',
  'content': 'Did I choose a bad channel? I ask because you seem to be dumb like windows user'},
 {'role': 'assistant',
  'content': 'the second sentence is better english   and we are not dumb'}]

In [33]:
# For each conversation, the last assistant message becomes the target and all
# messages before it become the input history. Conversations without any
# assistant message contribute nothing.
targets = []
inputs = []
for convo in convos:
    last_reply = next(
        (
            position
            for position in range(len(convo) - 1, -1, -1)
            if convo[position]["role"] == "assistant"
        ),
        None,
    )
    if last_reply is None:
        continue
    targets.append(convo[last_reply]['content'])
    inputs.append(convo[:last_reply])

In [34]:
# Show the first input history with its target, then check that both lists
# have the same length.
print(inputs[0])
print(targets[0])
print(len(inputs))
print(len(targets))

[{'role': 'user', 'content': "Hello folks, please help me a bit with the following sentence: 'Order here your personal photos or videos.' - I think the only allowed version is 'Order your personal videos or photos here.', but I'm not sure, are you?"}, {'role': 'user', 'content': 'Did I choose a bad channel? I ask because you seem to be dumb like windows user'}]
the second sentence is better english   and we are not dumb
345692
345692


In [42]:
# Tokenizer used below both for its chat template and for tokenizing the dataset.
# NOTE(review): this is a TinyLlama tokenizer while the model exported above is
# distilgpt2 — confirm the mismatch is intentional.
llama_tokenizer = transformers.AutoTokenizer.from_pretrained("Xenova/TinyLlama-1.1B-Chat-v1.0", padding_side="left")

In [36]:
def _render_prompt(example):
    """Render one example's message history as an untokenized chat prompt with the generation prompt appended."""
    prompt = llama_tokenizer.apply_chat_template(
        example["inputs"],
        tokenize=False,
        add_generation_prompt=True,
    )
    return {"formatted_inputs": prompt}


# Build a Dataset from the parallel lists and add a "formatted_inputs" column.
inputs_dict = Dataset.from_dict({"inputs": inputs, "targets": targets})
templated_convos_2 = inputs_dict.map(_render_prompt)
# templated_convos_2 = inputs_dict.map(lambda x: {"formatted_targets": llama_tokenizer.apply_chat_template(x["targets"], tokenize=False, add_generation_prompt=False)})

Map: 100%|██████████| 345692/345692 [00:33<00:00, 10430.49 examples/s]


In [37]:
# Display the dataset summary (column names and row count).
templated_convos_2

Dataset({
    features: ['inputs', 'targets', 'formatted_inputs'],
    num_rows: 345692
})

In [56]:
# Print the first two rendered prompts, each followed by an 'end' marker line.
for i in range(2):
    print(str(templated_convos_2["formatted_inputs"][i]))
    print('end')

<|user|>
Hello folks, please help me a bit with the following sentence: 'Order here your personal photos or videos.' - I think the only allowed version is 'Order your personal videos or photos here.', but I'm not sure, are you?</s>
<|user|>
Did I choose a bad channel? I ask because you seem to be dumb like windows user</s>
<|assistant|>

end
<|user|>
Sock Puppe?t</s>
<|user|>
WTF?</s>
<|assistant|>

end


In [57]:
# Dump the rendered prompts and their target replies to two text files.
# TODO: fix how the formatted chats are written to the file so that each convo is distinct
#       (a rendered prompt spans several lines, so line breaks alone do not delimit conversations).

# Fetch each column once up front instead of re-indexing the dataset column on
# every loop iteration.
formatted_inputs_column = templated_convos_2["formatted_inputs"]
targets_column = templated_convos_2["targets"]

# `with` closes the files even if a write fails or the cell is interrupted;
# an explicit UTF-8 encoding avoids depending on the platform default.
with open("formatted_chat_500_convos.txt", "w", encoding="utf-8") as data_file:
    for formatted_input in formatted_inputs_column:
        data_file.write(str(formatted_input) + "\n")

with open("targets_500_convos.txt", "w", encoding="utf-8") as data_file:
    # writelines() adds no separators, so terminate every target explicitly;
    # otherwise all targets run together in the file.
    data_file.writelines(str(target) + "\n" for target in targets_column)

KeyboardInterrupt: 

## Test training artifacts by using the Python training API

In [38]:
## test generated artifacts
# create checkpoint state

# Load the checkpoint state, then build the training module (training + eval
# graphs) and the optimizer bound to that module, all from the
# "artifacts_generated_full" directory.
state = ort_api.CheckpointState.load_checkpoint("artifacts_generated_full/checkpoint")

training_model = ort_api.Module('artifacts_generated_full/training_model.onnx', state, 'artifacts_generated_full/eval_model.onnx')

optimizer = ort_api.Optimizer('artifacts_generated_full/optimizer_model.onnx', training_model)
# Alternative artifact set from an earlier TinyLlama experiment (disabled):
# state = ort_api.CheckpointState.load_checkpoint("tinyllama_artifacts_single_layer/checkpoint.zip")

# training_model = ort_api.Module('tinyllama_artifacts_single_layer/training_model.onnx', state, 'tinyllama_artifacts_single_layer/eval_model.onnx')

# optimizer = ort_api.Optimizer('tinyllama_artifacts_single_layer/optimizer_model.onnx', training_model)

In [43]:
def _tokenize_example(example):
    """Tokenize one example's prompt, with its target as ``text_target``, padded/truncated to 1024 tokens."""
    return llama_tokenizer(
        example["formatted_inputs"],
        text_target=example["targets"],
        max_length=1024,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )


# Adds the tokenizer's output columns alongside the existing ones.
# tokenized_dataset = templated_convos_2.map(lambda x: tokenizer(x["formatted_chat"], max_length=1024, padding='max_length', truncation=True, return_tensors='np'))
tokenized_dataset = templated_convos_2.map(_tokenize_example)
# tokenized_target_dataset = tokenized_dataset.map(lambda x: llama_tokenizer(x["targets"], max_length=1024, padding='max_length', truncation=True, return_tensors='np'))

Map: 100%|██████████| 345692/345692 [04:24<00:00, 1304.60 examples/s]


In [44]:
# Display the dataset summary to confirm the tokenizer columns were added.
tokenized_dataset

Dataset({
    features: ['inputs', 'targets', 'formatted_inputs', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 345692
})

In [47]:
# Drop the raw-text columns so only the tokenizer outputs remain.
# NOTE(review): this rebinds `tokenized_dataset` to its own result, so the cell
# is not idempotent — a second run presumably fails because the columns are
# already gone; confirm.
tokenized_dataset = tokenized_dataset.remove_columns(["inputs", "targets", "formatted_inputs"])

In [48]:
# Batch the tokenized examples in groups of 8; no collate_fn is given, so the
# DataLoader's default collation is used.
dataloader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=8)

In [51]:
# Inspect the first batch only: print the shape of the transposed "input_ids"
# column next to the untransposed "labels" column to see which layout the
# collated batch has.
for batch in dataloader:
    print(np.transpose(np.array(batch["input_ids"][0])).shape)
    print(np.array(batch["labels"][0]).shape)
    break

(8, 1024)
(1024, 8)


In [53]:
def _batch_column(batch, key):
    """Return ``batch[key][0]`` converted to a NumPy array and transposed.

    The batch-inspection cell of this notebook printed (1024, 8) for an
    untransposed column and (8, 1024) for a transposed one, so the transpose
    puts the batch dimension first.
    """
    return np.transpose(np.array(batch[key][0]))


def train(epoch):
    """Run one pass over ``dataloader`` and return the per-batch losses.

    Uses the notebook-level ``training_model``, ``optimizer`` and ``dataloader``.

    Args:
        epoch: Epoch index supplied by the caller's loop; not used in the body.

    Returns:
        list: One loss value per processed batch, in order. Previously the
        losses were collected but discarded when the function returned.
    """
    training_model.train()
    losses = []
    num_batches = len(dataloader)
    for step, batch in enumerate(dataloader):
        print(step, 'out of ', num_batches)
        input_ids = _batch_column(batch, "input_ids")
        attention_mask = _batch_column(batch, "attention_mask")
        targets = _batch_column(batch, "labels")
        print('input ids shape', input_ids.shape)
        print('attention_mask shape', attention_mask.shape)

        # NOTE(review): the recorded run of this notebook failed inside TrainStep with
        # "Unexpected input data type. Actual: (tensor(int64)), expected: (tensor(float))".
        # The message does not say which input is affected — check the dtypes the
        # training graph declares for its inputs before casting anything here.
        forward_inputs = [input_ids, attention_mask, targets]
        loss, _ = training_model(*forward_inputs)
        print('after training')

        # Apply the update, then mark gradients for reset before the next step.
        optimizer.step()
        print('after optimizing')
        training_model.lazy_reset_grad()

        losses.append(loss.item())
        print(loss)
    return losses

In [52]:
# List the input names reported by the training module.
training_model.input_names()

['input_ids', 'attention_mask', 'target']

In [54]:
# Run three passes over the data; train() is called once per epoch.
for epoch in range(3):
    train(epoch)

0 out of  43212
input ids shape (8, 1024)
attention_mask shape (8, 1024)


RuntimeError: C:\a\_work\1\s\orttraining\orttraining\training_api\module.cc:538 onnxruntime::training::api::Module::TrainStep [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Unexpected input data type. Actual: (tensor(int64)) , expected: (tensor(float))
