## Generate training artifacts

In [1]:
from onnxruntime.training import artifacts
import torch
import onnx
import transformers
# from fastT5 import export_and_get_onnx_model
import onnxruntime.training.api as ort_api

  from .autonotebook import tqdm as notebook_tqdm


In [65]:
# model = export_and_get_onnx_model("MBZUAI/LaMini-T5-61M")

In [17]:
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="distilbert/distilgpt2",
# )

# transformers_model = transformers.AutoModelForCausalLM.from_pretrained("MBZUAI/LaMini-Cerebras-111M")
# transformers_model = transformers.AutoModel.from_pretrained("facebook/blenderbot-400M-distill")
transformers_model = transformers.AutoModel.from_pretrained("distilbert/distilgpt2")
# transformers_model_gpt = transformers.GPT2Model.from_pretrained("distilgpt2")

In [2]:
tokenizer = transformers.GPT2Tokenizer.from_pretrained("distilgpt2")
# tokenizer = transformers.GPT2Tokenizer.from_pretrained("distilbert/distilgpt2")
inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
print(inputs)

{'input_ids': tensor([[ 464, 3139,  286, 4881,  318, 6342,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [68]:
class FlatModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, *local_inputs):
        return self.model(inputs.input_ids, inputs.attention_mask)

# model = FlatModel(pipeline.model)
model = FlatModel(transformers_model)

In [69]:
input_names = ["input_ids", "attention_mask"]
output_names = ["loss", "logits"]

torch.onnx.export(model,
                  (inputs["input_ids"], inputs["attention_mask"]),
                  "distilgpt2.onnx",
                  input_names = input_names, 
                  output_names = output_names,
                  export_params=True,
                  training=torch.onnx.TrainingMode.TRAINING,
                  do_constant_folding=False,
                  opset_version=15,
                  dynamic_axes={
                    "input_ids": {0: "batch_size", 1: "sequence_length"},
                    "attention_mask": {0: "batch_size", 1: "sequence_length"},
                    "logits": {0: "batch_size", 1: "sequence_length"}
                  }
                  )

input_ids: tensor([[ 464, 3139,  286, 4881,  318, 6342,   13]])
past key values: None
attention_mask tensor([[1, 1, 1, 1, 1, 1, 1]])


  if batch_size <= 0:


In [70]:
requires_grad = []
frozen_params = []
num_named_params = 0
for name, param in model.named_parameters():
    num_named_params += 1
    if param.requires_grad:
        requires_grad.append(name)
    else:
        frozen_params.append(name)

for name, param in model.named_buffers():
    frozen_params.append(name)

# onnx_model = onnx.load("tinyllama.onnx")
onnx_model = onnx.load("distilgpt2.onnx")

artifacts.generate_artifacts(
    onnx_model,
    requires_grad=requires_grad,
    frozen_params=frozen_params,
    optimizer=artifacts.OptimType.AdamW,
    # loss=artifacts.LossType.CrossEntropyLoss,
)



args ('loss', 'logits', 'value.3', 'key.11', 'value.11', 'key.19', 'value.19', 'key.27', 'value.27', 'key.35', 'value.35', 'key.43', 'value.43')
args after edit ['loss', 'logits']


InferenceError: [ShapeInferenceError] (op_type:SoftmaxCrossEntropyLoss, node name: onnx::SoftmaxCrossEntropyLoss::14): labels typestr: Tind, has unsupported type: tensor(float)

## Data pre-processing

In [3]:
import numpy as np
import pandas as pd
from datasets import Dataset

In [4]:
df = pd.read_csv('../data/dialogueText.csv')

In [5]:
current_convo =[]
convos = []
current_user = ""
current_responder = ""

for ind in df.index:
    if len(current_user) == 0 or str(df['from'][ind]) == current_user:
        # first convo OR continuing current user
        current_user = str(df['from'][ind])
        current_convo.append(
            {
                "role": "user", 
                "content": str(df['text'][ind])
            }
        )
    elif str(df['from'][ind]) != current_user and str(df['from'][ind]) != current_responder and len(current_responder) != 0:
        # new user & new convo
        convos.append(current_convo)
        current_convo = []
        current_user = str(df['from'][ind])
        current_responder = ""
        current_convo.append(
            {
                "role": "user", 
                "content": str(df['text'][ind])
            }
        )
    elif str(df['from'][ind]) != current_user:
        if str(df['from'][ind]) != current_responder:
            current_responder = str(df['from'][ind])
        current_convo.append(
            {
                "role": "assistant", 
                "content": str(df['text'][ind])
            }
        )
    else:
        print("ERROR: fall through -- you missed a case!", df['from'][ind], current_user, current_responder)

In [6]:
llama_tokenizer = transformers.AutoTokenizer.from_pretrained("Xenova/TinyLlama-1.1B-Chat-v1.0")

In [7]:
dataset_dict = Dataset.from_dict({"chat": convos})
templated_convos_2 = dataset_dict.map(lambda x: {"formatted_chat": llama_tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})

Map: 100%|██████████| 345692/345692 [00:37<00:00, 9197.05 examples/s]


In [53]:
data_file = open("formatted_chat_500_convos.txt", "w")

data_file.writelines(templated_convos_2["formatted_chat"])

data_file.close()

## Test training artifacts by using the Python training API

In [8]:
## test generated artifacts
# create checkpoint state

state = ort_api.CheckpointState.load_checkpoint("checkpoint")

training_model = ort_api.Module('training_model.onnx', state, 'eval_model.onnx')

optimizer = ort_api.Optimizer('optimizer_model.onnx', training_model)

In [9]:
tokenizer.pad_token = tokenizer.eos_token

In [10]:
tokenized_dataset = templated_convos_2.map(lambda x: tokenizer(x["formatted_chat"], max_length=1024, padding='max_length', truncation=True, return_tensors='np'))

Map: 100%|██████████| 345692/345692 [03:46<00:00, 1524.20 examples/s]


In [11]:
tokenized_dataset = tokenized_dataset.remove_columns(['chat', 'formatted_chat'])

In [25]:
dataloader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=8)

In [26]:
for batch in dataloader:
    print(np.transpose(np.array(batch["input_ids"][0])).shape)
    break

(8, 1024)


In [27]:
def train(epoch):
    training_model.train()
    losses = []
    i = 0
    for batch in dataloader:
        print(i, 'out of ', len(dataloader))
        # input_ids = np.transpose(np.array(batch["input_ids"][0]))
        input_ids = np.array(batch["input_ids"][0])
        attention_mask = np.array(batch["attention_mask"][0])
        # attention_mask = np.transpose(np.array(batch["attention_mask"][0]))
        print('input ids shape', input_ids.shape)
        print('attention_mask shape', attention_mask.shape)
        forward_inputs = [input_ids, attention_mask]
        loss, _ = training_model(*forward_inputs)
        print('after training')
        optimizer.step()
        print('after optimizing')
        training_model.lazy_reset_grad()
        losses.append(loss.item())
        print(loss)
        i += 1

In [28]:
for epoch in range(3):
    train(epoch)

0 out of  43212
input ids shape (1024, 8)
attention_mask shape (1024, 8)


RuntimeError: C:\a\_work\1\s\orttraining\orttraining\training_api\module.cc:538 onnxruntime::training::api::Module::TrainStep [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Reshape node. Name:'/model/Reshape_2_Grad/Reshape_1' Status Message: C:\a\_work\1\s\onnxruntime\core\providers\cpu\tensor\reshape_helper.h:45 onnxruntime::ReshapeHelper::ReshapeHelper input_shape_size == size was false. The input tensor cannot be reshaped to the requested shape. Input shape:{}, requested shape:{1024,8,768}

