In [1]:
import torch
import torch.nn as nn
import numpy as np
import onnx
import onnxruntime
from onnxruntime.training import artifacts
import torch.nn.functional as F


In [2]:
torch.__version__, onnx.__version__, onnxruntime.__version__

('2.1.0', '1.14.1', '1.16.3')

In [3]:
def generate_training_data(data_size):
    # Generate random integers for input data X
    X = np.random.uniform(0.0, 10.0, (data_size, 6))

    # Compute output data y
    y1 = (np.sum(X, axis=1) / 20)
    y1 = y1.astype(int)

    y2 = (np.sum(X, axis=1) / 10)
    y2 = y2.astype(int)
    # Compute Y as the sum of each row in X divided by 3

    return torch.tensor(X, dtype=torch.float32), torch.tensor(y1, dtype=torch.long), torch.tensor(y2, dtype=torch.long)

In [8]:
class LSTMNumberPredictor(nn.Module):
    def __init__(self, num_classes1, num_classes2, hidden_dim, num_layers):
        super(LSTMNumberPredictor, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # LSTM layer
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc1 = nn.Linear(hidden_dim, num_classes1)
        self.fc2 = nn.Linear(hidden_dim, num_classes2)
    
    def forward(self, x):
        # Initializing hidden state for first input
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_dim)
        # out, _ = self.lstm(x)
        
        # Decode the hidden state of the last time step
        out1 = self.fc1(out[:, -1, :])
        out2 = self.fc2(out[:, -1, :])
        return out1, out2

# Example usage:
model = LSTMNumberPredictor(num_classes1=3, num_classes2=6, hidden_dim=50, num_layers=1)

In [9]:
import torch.optim as optim

# Parameters
data_size = 500  # Length of the subsequences

# Generate data
X, y1, y2 = generate_training_data(data_size)

X_train = X[:400]
y_train1 = y1[:400]
y_train2 = y2[:400]
X_test = X[400:]
y_test1 = y1[400:]
y_test2 = y2[400:]

In [10]:
learning_rate = 0.001
num_epochs = 300     # Number of epochs for training


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [11]:
# Reshape input suitable for LSTM (batch_size, seq_length, input_size)
X_train = X_train.unsqueeze(-1)
X_test = X_test.unsqueeze(-1)


# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs1, outputs2 = model(X_train)
    loss_train = criterion(outputs1, y_train1) + criterion(outputs2, y_train2)

    outputs_test1, outputs_test2 = model(X_test)
    loss_test = criterion(outputs_test1, y_test1) + criterion(outputs_test2, y_test2)

    # Backward and optimize
    loss_train.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss_train.item():.4f}, Testing Loss: {loss_test.item():.4f}')


Epoch [100/300], Training Loss: 1.3704, Testing Loss: 1.4296
Epoch [200/300], Training Loss: 0.5470, Testing Loss: 0.5380
Epoch [300/300], Training Loss: 0.2967, Testing Loss: 0.3233


In [12]:
example_input_for_trace = X_train[:1]


In [13]:

# Define input / output names
input_names = ["seq_input"]
output_names = ["my_output1", "my_output2"]

# Convert the PyTorch model to ONNX
torch.onnx.export(model,
                  (example_input_for_trace,),
                  "lstm_model.onnx",
                  verbose=False,
                  input_names=input_names,
                  output_names=output_names,
                  dynamic_axes={'seq_input' : {0: 'batch'},    # variable length axes
                                'my_output' : {0: 'batch'},
                                'my_output2' : {0: 'batch'}},
                                )



In [14]:
import onnx
import onnxruntime as ort

# Load the ONNX model
model = onnx.load("lstm_model.onnx")
onnx.checker.check_model(model)

ort_session = ort.InferenceSession("lstm_model.onnx")

seq = example_input_for_trace[:1].numpy()

onnx_pred = ort_session.run(
    ["my_output1", "my_output2"],
    {"seq_input": seq},
)

In [15]:
onnx_pred

[array([[-3.696101 ,  2.9922402, -0.6057701]], dtype=float32),
 array([[-3.5088613 , -3.4856412 ,  0.40965787,  4.0779767 ,  0.92218184,
         -2.6109126 ]], dtype=float32)]

In [17]:
# Sample input for the model
sample_input = X_train[:3]

In [18]:

# Assuming the path to the forward-only ONNX model is provided
path_to_forward_only_onnx_model = 'lstm_model.onnx'

# Load the forward-only ONNX model
model = onnx.load(path_to_forward_only_onnx_model)

# Extract model's parameters
all_params = [param.name for param in model.graph.initializer]

In [19]:
model.graph.output

[name: "my_output1"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "Gemmmy_output1_dim_0"
      }
      dim {
        dim_value: 3
      }
    }
  }
}
, name: "my_output2"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_value: 6
      }
    }
  }
}
]

In [20]:
all_params

['fc1.weight',
 'fc1.bias',
 'fc2.weight',
 'fc2.bias',
 'onnx::LSTM_112',
 'onnx::LSTM_113',
 'onnx::LSTM_114']

In [21]:
import onnxruntime.training.onnxblock as onnxblock
from onnxruntime.training import artifacts

class WeightedAverageLoss(onnxblock.Block):
    def __init__(self):
        self._loss1 = onnxblock.loss.CrossEntropyLoss()
        self._loss2 = onnxblock.loss.CrossEntropyLoss()
        self._w1 = onnxblock.blocks.Constant(0.9)
        self._w2 = onnxblock.blocks.Constant(0.0)
        self._add = onnxblock.blocks.Add()
        self._mul = onnxblock.blocks.Mul()

    def build(self, loss_input_name1, loss_input_name2):
        # The build method defines how the block should be stacked on top of
        # loss_input_name1 and loss_input_name2

        # Returns weighted average of the two losses
        print('loss_input_name1', '--------------', loss_input_name1)
        print('loss_input_name2', '--------------', loss_input_name2)
        return self._add(
            self._mul(self._w1(), self._loss1(loss_input_name1, labels_name="target1")),
            self._mul(self._w2(), self._loss2(loss_input_name2, labels_name="target2"))
        )


# class LSTMWithLoss(onnxblock.TrainingBlock):
#     def __init__(self):
#         super().__init__()
#         self.loss = WeightedAverageLoss()

#     def build(self, output_name1, output_name2):
#         return self.loss(output_name1, output_name2)
    
# lstm_with_loss = LSTMWithLoss()
    
 # define the loss function
class CustomCELoss(onnxblock.Block):
     def __init__(self):
         super().__init__()
         self.celoss = onnxblock.loss.CrossEntropyLoss()

     def build(self, logits, *args):
         return self.celoss(logits)


class LSTMWithLoss(onnxblock.TrainingBlock):
    def __init__(self):
        super().__init__()
        self.loss = onnxblock.loss.CrossEntropyLoss()

    def build(self, output_name):
        return self.loss(output_name)
    
lstm_with_loss = LSTMWithLoss()
lstm_with_loss.requires_grad

<bound method TrainingBlock.requires_grad of <__main__.LSTMWithLoss object at 0x16a573ee0>>

In [22]:
# name_list_loss = [output.name for output in model.graph.output]

In [23]:
# name_list_loss

In [24]:
# with onnxblock.base(model):
 
#     # _ = lstm_with_loss(*[output.name for output in model.graph.output])
#     _ = lstm_with_loss(*['my_output'])
#     # eval_model = model_accessor.eval_model
#     training_model, eval_model = lstm_with_loss.to_model_proto()

In [25]:
# lstm_with_loss.parameters()

In [26]:
# # Build the optimizer graph
# optimizer = onnxblock.optim.AdamW()
# # with onnxblock.onnx_model() as accessor:
# with onnxblock.empty_base() as accessor:
#     _ = optimizer(lstm_with_loss.parameters())
#     # optimizer_model = accessor.model
#     optimizer_model = optimizer.to_model_proto()


# path_to_the_checkpoint_artifact = 'training_artifacts/checkpoint'
# path_to_the_training_model = 'training_artifacts/training_model.onnx'
# path_to_the_eval_model = 'training_artifacts/eval_model.onnx'
# path_to_the_optimizer_model = 'training_artifacts/optimizer_model.onnx'
# # Create the training artifacts
# onnx.save(training_model, path_to_the_training_model)

# onnx.save(eval_model, path_to_the_eval_model)

# onnx.save(optimizer_model, path_to_the_optimizer_model)

# trainable_params, non_trainable_params = lstm_with_loss.parameters()
# onnxblock.save_checkpoint((trainable_params, non_trainable_params), path_to_the_checkpoint_artifact)

In [27]:
# training_model.graph

In [28]:
trainable_layers = ['fc', 'onnx']
requires_grad = [param for param in all_params if any(layer in param for layer in trainable_layers)]
frozen_params = [param for param in all_params if param not in requires_grad]


In [29]:
requires_grad, frozen_params

(['fc1.weight',
  'fc1.bias',
  'fc2.weight',
  'fc2.bias',
  'onnx::LSTM_112',
  'onnx::LSTM_113',
  'onnx::LSTM_114'],
 [])

In [30]:
model.graph.output

[name: "my_output1"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "Gemmmy_output1_dim_0"
      }
      dim {
        dim_value: 3
      }
    }
  }
}
, name: "my_output2"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_value: 6
      }
    }
  }
}
]

In [31]:

# Generate the training artifacts
path_to_output_artifact_directory = 'training_artifacts'
artifacts.generate_artifacts(model,
                             requires_grad=requires_grad,
                             frozen_params=frozen_params,
                            #  loss=artifacts.LossType.CrossEntropyLoss,
                            # loss = CustomCELoss(),
                            loss = WeightedAverageLoss(),
                             optimizer=artifacts.OptimType.AdamW,
                             artifact_directory=path_to_output_artifact_directory)

# Return the path to output artifact directory
path_to_output_artifact_directory

2023-12-25 21:31:38,237 root [INFO] - Custom loss block provided: WeightedAverageLoss
2023-12-25 21:31:38,239 root [DEBUG] - Building training block _TrainingBlock
2023-12-25 21:31:38,240 root [DEBUG] - Building block: WeightedAverageLoss
2023-12-25 21:31:38,240 root [DEBUG] - Building block: Constant
2023-12-25 21:31:38,244 root [DEBUG] - Building block: CrossEntropyLoss
2023-12-25 21:31:38,245 root [DEBUG] - Building block: Mul
2023-12-25 21:31:38,247 root [DEBUG] - Building block: Constant
2023-12-25 21:31:38,248 root [DEBUG] - Building block: CrossEntropyLoss
2023-12-25 21:31:38,249 root [DEBUG] - Building block: Mul
2023-12-25 21:31:38,250 root [DEBUG] - Building block: Add
2023-12-25 21:31:38,255 root [DEBUG] - Building gradient graph for training block _TrainingBlock
2023-12-25 21:31:38.260842 [I:onnxruntime:Default, constant_sharing.cc:256 ApplyImpl] Total shared scalar initializer count: 1
2023-12-25 21:31:38,264 root [DEBUG] - The loss output is onnx::add_output::15. The grad

loss_input_name1 -------------- my_output1
loss_input_name2 -------------- my_output2


'training_artifacts'

In [32]:
from onnxruntime.training.api import CheckpointState, Module, Optimizer
import torch

# Assuming the paths are correctly defined
path_to_the_checkpoint_artifact = 'training_artifacts/checkpoint'
path_to_the_training_model = 'training_artifacts/training_model.onnx'
path_to_the_eval_model = 'training_artifacts/eval_model.onnx'
path_to_the_optimizer_model = 'training_artifacts/optimizer_model.onnx'

# Load the checkpoint state
state = CheckpointState.load_checkpoint(path_to_the_checkpoint_artifact)

# Create the module
module = Module(path_to_the_training_model,
                state,
                path_to_the_eval_model,
                device="cpu")

optimizer = Optimizer(path_to_the_optimizer_model, module)

In [33]:
import numpy as np

X, y1, y2 = generate_training_data(500)

X_train = X.detach().numpy().reshape(-1, 6, 1)
y_train1 = y1.detach().numpy()
y_train2 = y2.detach().numpy()


In [34]:

# Define epochs and batch size
epochs = 100
batch_size = 8  # You can adjust the batch size as needed

# Training loop

for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        # Extract batches
        batch_X = X_train[i:i + batch_size]
        batch_y1 = y_train1[i:i + batch_size]
        batch_y2 = y_train2[i:i + batch_size]

        # Set the module to training mode
        module.train()

        # Forward pass (assuming the module accepts input and target)
        training_loss = module(batch_X, batch_y1, batch_y2)

        # Backward pass and optimization
        optimizer.step()

        # Reset gradients
        module.lazy_reset_grad()

    # Print epoch statistics, etc.
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {training_loss}')


Epoch 10/100, Training Loss: [0.00605627]
Epoch 20/100, Training Loss: [0.00095272]
Epoch 30/100, Training Loss: [0.00043647]
Epoch 40/100, Training Loss: [0.00013762]
Epoch 50/100, Training Loss: [6.466161e-05]
Epoch 60/100, Training Loss: [4.309988e-05]
Epoch 70/100, Training Loss: [3.0173167e-05]
Epoch 80/100, Training Loss: [1.4617692e-05]
Epoch 90/100, Training Loss: [1.7890048e-05]
Epoch 100/100, Training Loss: [1.3571763e-05]


In [47]:
CheckpointState.save_checkpoint(state, path_to_the_checkpoint_artifact)


In [35]:
module.output_names()

['onnx::add_output::15']

In [37]:
# Assuming 'output' is the name of your model's output
output_names = ['my_output1', 'my_output2']  # List of output names

# Export the model for inferencing
module.export_model_for_inferencing('inference_model.onnx', output_names)


In [38]:
import onnxruntime

ort_session = onnxruntime.InferenceSession("inference_model.onnx", providers=["CPUExecutionProvider"])
# ort_session = onnxruntime.InferenceSession("classification_model.onnx", providers=["CPUExecutionProvider"])


ort_inputs = {ort_session.get_inputs()[0].name: X_train[0:5]}

ort_outs = ort_session.run(None, ort_inputs)

In [40]:
ort_outs[0].argmax(axis=1)

array([1, 0, 1, 2, 1])

In [41]:
y_train1[0:5]

array([1, 0, 1, 2, 1])

In [42]:
ort_outs[1].argmax(axis=1)

array([3, 1, 3, 4, 3])

In [43]:
y_train2[0:5]

array([2, 1, 3, 4, 3])