In [1]:
import torch
import torch.nn as nn
import numpy as np
import onnx
from onnxruntime.training import artifacts
import torch.nn.functional as F


In [2]:
# Display the installed PyTorch and ONNX versions next to the results.
torch.__version__, onnx.__version__

('2.1.0', '1.14.1')

In [3]:
import onnxruntime
# Display the installed ONNX Runtime version.
onnxruntime.__version__

'1.16.3'

In [4]:

class LSTMNumberPredictor(nn.Module):
    """LSTM followed by a linear layer that scores a sequence of scalars.

    Args:
        num_classes: Number of scores produced by the final linear layer.
        hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, num_classes, hidden_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # LSTM layer: one feature per time step, batch dimension first
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)

        # Fully connected layer mapping a hidden vector to class scores
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_length, 1).

        Returns:
            Tensor of shape (batch_size, num_classes) computed from the LSTM
            output at the last time step.
        """
        # Zero initial states. They are created with the dtype and device of
        # the input so the module keeps working after .double()/.half() or
        # after being moved to another device (a plain torch.zeros(...) is
        # always a default-dtype CPU tensor and would then mismatch).
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim, dtype=x.dtype, device=x.device)

        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_dim)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

# Example usage:
# Seed PyTorch's RNG before the weights are created so that the random
# initialisation (and the losses printed by the training loop) are the same
# on every Restart & Run All.
torch.manual_seed(0)
model = LSTMNumberPredictor(num_classes=3, hidden_dim=50, num_layers=1)

In [5]:

# class GRUNumberPredictor(nn.Module):
#     def __init__(self, num_classes, hidden_dim, num_layers):
#         super(GRUNumberPredictor, self).__init__()
#         self.hidden_dim = hidden_dim
#         self.num_layers = num_layers

#         # GRU layer
#         self.gru = nn.GRU(input_size=1, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True)
        
#         # Fully connected layer
#         self.fc = nn.Linear(hidden_dim, num_classes)
    
#     def forward(self, x):
#         # Initializing hidden state for first input
#         h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        
#         # Forward propagate GRU
#         out, _ = self.gru(x, h0)  # out: tensor of shape (batch_size, seq_length, hidden_dim)
        
#         # Decode the hidden state of the last time step
#         out = self.fc(out[:, -1, :])
#         return out

# # Example usage:
# model = GRUNumberPredictor(num_classes=3, hidden_dim=50, num_layers=1)


In [6]:


# class FCSequencePredictor(nn.Module):
#     def __init__(self, input_size, hidden_dim, num_classes):
#         super(FCSequencePredictor, self).__init__()
        
#         self.embedding = nn.Embedding(num_classes, 2)
#         # Fully connected layers
#         self.fc1 = nn.Linear(2*16, hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, hidden_dim)
#         self.fc3 = nn.Linear(hidden_dim, num_classes)
    
#     def forward(self, x):
#         # Flatten the sequence
#         x = x.view(x.size(0), -1)
#         x = self.embedding(x)
#         x = x.view(x.size(0), -1)

#         # Forward through fully connected layers
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         return self.fc3(x)


# # Example usage:
# # Assuming each number in the sequence is represented by a single feature
# sequence_length = 16
# num_features_per_number = 1
# input_size = sequence_length * num_features_per_number
# hidden_dim = 50
# num_classes = 3

# model = FCSequencePredictor(input_size, hidden_dim, num_classes)

In [7]:
def generate_sequence_data(sequence, sequence_length=16):
    """
    Build next-element training pairs from the growing prefixes of a sequence.

    For every i in 1..len(sequence) the input row is the prefix sequence[:i],
    left-padded with zeros to exactly `sequence_length` entries. When the prefix
    is longer than `sequence_length`, only its last `sequence_length` elements
    are kept, so every row has the same length. The label is the element that
    follows the prefix; for the full sequence the label wraps around to
    sequence[0].

    Parameters:
    sequence (sequence of numbers): The full sequence of numbers (list, tuple, ...).
    sequence_length (int): Fixed length of the subsequences to be generated (default is 16).

    Returns:
    torch.Tensor: float32 tensor of shape (len(sequence), sequence_length) with the subsequences.
    torch.Tensor: int64 tensor of shape (len(sequence),) with the label of each subsequence.

    Raises:
    ValueError: If sequence_length is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # Work on a list so tuples and other sequences can be sliced and concatenated.
    sequence = list(sequence)
    subsequences = []
    labels = []

    for i in range(1, len(sequence) + 1):
        # Last `sequence_length` elements of the prefix sequence[:i]; the whole
        # prefix while it is still short enough.
        window = sequence[max(0, i - sequence_length):i]
        subsequence = [0] * (sequence_length - len(window)) + window
        label = sequence[i] if i < len(sequence) else sequence[0]

        subsequences.append(subsequence)
        labels.append(label)

    # The explicit reshape keeps the result 2-D even when `sequence` is empty.
    inputs = torch.tensor(subsequences, dtype=torch.float32).reshape(len(subsequences), sequence_length)
    return inputs, torch.tensor(labels, dtype=torch.long)


In [8]:
import torch.optim as optim

# Hyper-parameters
sequence_length = 16   # every input row is padded to this many time steps
learning_rate = 0.001  # Adam step size
num_epochs = 100       # full-batch passes over the data

# Toy data: the pattern 0, 1, 2 repeated three times
full_sequence = [0, 1, 2] * 3
X, y = generate_sequence_data(full_sequence, sequence_length)

# Objective and optimiser for the PyTorch model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [9]:
# Check the dtypes: generate_sequence_data builds X as float32 and y as long.
X.dtype, y.dtype

(torch.float32, torch.int64)

In [10]:
# One row per generated subsequence, sequence_length columns.
X.shape

torch.Size([9, 16])

In [11]:

# Reshape input suitable for LSTM (batch_size, seq_length, input_size).
# reshape(batch, -1, 1) gives the same result as unsqueeze(-1) on the 2-D X,
# but is idempotent: re-running this cell leaves an already 3-D X unchanged
# instead of stacking another trailing dimension.
X = X.reshape(X.size(0), -1, 1)
X = X.to('cpu')
model.to('cpu')

LSTMNumberPredictor(
  (lstm): LSTM(1, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=3, bias=True)
)

In [12]:

# Training loop: every epoch is one full-batch update on (X, y)
for epoch in range(num_epochs):
    model.train()

    # Forward pass
    outputs = model(X)
    loss = criterion(outputs, y)

    # Drop the previous step's gradients, backpropagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Only report every 10th epoch
    if (epoch+1) % 10 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [10/100], Loss: 1.0832
Epoch [20/100], Loss: 1.0664
Epoch [30/100], Loss: 1.0419
Epoch [40/100], Loss: 1.0035
Epoch [50/100], Loss: 0.9690
Epoch [60/100], Loss: 0.9164
Epoch [70/100], Loss: 0.8341
Epoch [80/100], Loss: 0.7342
Epoch [90/100], Loss: 0.6069
Epoch [100/100], Loss: 0.4682


In [13]:
import torch.onnx

# Set the model to evaluation mode
model.eval()

# Sample input for the model: the first three rows of X, passed to
# torch.onnx.export and reused to run the exported model.
sample_input = X[:3]

In [14]:
# Export the model to ONNX format
onnx_model_path = 'classification_model.onnx'
torch.onnx.export(
    model,
    sample_input,
    onnx_model_path,
    export_params=True,
    opset_version=17,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['output'],
    # Axis 0 of the input and of the output is exported as the symbolic
    # dimension 'batch' (variable length).
    dynamic_axes={'input': {0: 'batch'}, 'output': {0: 'batch'}},
)

onnx_model_path



'classification_model.onnx'

In [15]:

# Validate the exported file. The loaded proto gets its own name so the
# PyTorch `model` is not overwritten here and the training/export cells above
# remain re-runnable.
exported_onnx_model = onnx.load("classification_model.onnx")
onnx.checker.check_model(exported_onnx_model)

In [16]:
import onnxruntime

# Run the exported model with ONNX Runtime (CPU) on the sample batch
ort_session = onnxruntime.InferenceSession(
    "classification_model.onnx",
    providers=["CPUExecutionProvider"],
)

# Feed the batch under the name of the session's first input
input_name = ort_session.get_inputs()[0].name
ort_inputs = {input_name: sample_input.detach().numpy()}

# None -> fetch every output of the model
ort_outs = ort_session.run(None, ort_inputs)

In [17]:
# Show the sample batch next to the outputs ONNX Runtime returned for it.
sample_input, ort_outs

(tensor([[[0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.]],
 
         [[0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [1.]],
 
         [[0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [0.],
          [1.],
          [2.]]]),
 [array([[-0.6776786 ,  0.7952337 ,  0.60444725],
         [-0.02636533,  0.19971812,  0.42790872],
         [ 0.69669235, -0.132533  , -0.2734946 ]], dtype=float32)])

In [18]:


# Forward-only ONNX model written by the export cell
path_to_forward_only_onnx_model = 'classification_model.onnx'

# Load the forward-only ONNX model
model = onnx.load(path_to_forward_only_onnx_model)

# Collect the name of every initializer stored in the graph
all_params = []
for initializer in model.graph.initializer:
    all_params.append(initializer.name)

In [19]:
# Initializer names found in the ONNX graph.
all_params

['fc.weight', 'fc.bias', 'onnx::LSTM_109', 'onnx::LSTM_110', 'onnx::LSTM_111']

In [20]:
import onnxruntime.training.onnxblock as onnxblock
from onnxruntime.training import artifacts

#  # define the loss function
#  class CustomCELoss(onnxblock.Block):
#      def __init__(self):
#          super().__init__()
#          self.celoss = onnxblock.loss.MSELoss()


#      def build(self, logits, *args):
#          return self.celoss(logits)

# class MNISTWithLoss(onnxblock.TrainingBlock):
#     def __init__(self):
#         super().__init__()
#         self.loss = onnxblock.loss.CrossEntropyLoss()

#     def build(self, output_name):
#         return self.loss(output_name)
    
# mnist_with_loss = MNISTWithLoss()

In [21]:
# with onnxblock.base(model):
 
#     # _ = mnist_with_loss(*[output.name for output in model.graph.output])
#     mnist_with_loss(*[output.name for output in model.graph.output])
#     # eval_model = model_accessor.eval_model
#     training_model, eval_model = mnist_with_loss.to_model_proto()

In [22]:
# A parameter is trainable when its name contains any of these substrings
trainable_layers = ['fc', 'onnx']


def _is_trainable(param_name):
    """Return True if param_name contains one of the trainable_layers substrings."""
    return any(tag in param_name for tag in trainable_layers)


# Partition the initializer names, keeping their original order
requires_grad = [name for name in all_params if _is_trainable(name)]
frozen_params = [name for name in all_params if not _is_trainable(name)]


In [23]:
# Inspect the split: trainable names first, frozen names second.
requires_grad, frozen_params

(['fc.weight',
  'fc.bias',
  'onnx::LSTM_109',
  'onnx::LSTM_110',
  'onnx::LSTM_111'],
 [])

In [24]:
# Inspect the graph outputs of the loaded ONNX model.
model.graph.output

[name: "output"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "batch"
      }
      dim {
        dim_value: 3
      }
    }
  }
}
]

In [25]:

# Generate the training artifacts
path_to_output_artifact_directory = 'training_artifacts'

# Everything except the forward-only model is passed by keyword
artifact_options = dict(
    requires_grad=requires_grad,
    frozen_params=frozen_params,
    loss=artifacts.LossType.CrossEntropyLoss,
    optimizer=artifacts.OptimType.AdamW,
    artifact_directory=path_to_output_artifact_directory,
)
artifacts.generate_artifacts(model, **artifact_options)

# Display the path to the output artifact directory
path_to_output_artifact_directory

2023-12-25 14:10:47,329 root [INFO] - Loss function enum provided: CrossEntropyLoss
2023-12-25 14:10:47,331 root [DEBUG] - Building training block _TrainingBlock
2023-12-25 14:10:47,332 root [DEBUG] - Building block: CrossEntropyLoss
2023-12-25 14:10:47,335 root [DEBUG] - Building gradient graph for training block _TrainingBlock
2023-12-25 14:10:47,341 root [DEBUG] - The loss output is onnx::loss::2. The gradient graph will be built starting from onnx::loss::2_grad.
2023-12-25 14:10:47.339274 [I:onnxruntime:Default, constant_sharing.cc:256 ApplyImpl] Total shared scalar initializer count: 1
2023-12-25 14:10:47,346 root [DEBUG] - Adding gradient accumulation nodes for training block _TrainingBlock
2023-12-25 14:10:47,349 root [INFO] - Training model path training_artifacts/training_model.onnx already exists. Overwriting.
2023-12-25 14:10:47,353 root [INFO] - Saved training model to training_artifacts/training_model.onnx
2023-12-25 14:10:47,353 root [INFO] - Eval model path training_arti

'training_artifacts'

In [26]:
from onnxruntime.training.api import CheckpointState, Module, Optimizer
import torch

# Files inside the 'training_artifacts' directory
path_to_the_checkpoint_artifact = 'training_artifacts/checkpoint'
path_to_the_training_model = 'training_artifacts/training_model.onnx'
path_to_the_eval_model = 'training_artifacts/eval_model.onnx'
path_to_the_optimizer_model = 'training_artifacts/optimizer_model.onnx'

# Load the checkpoint state
state = CheckpointState.load_checkpoint(path_to_the_checkpoint_artifact)

# Create the training module on CPU from the training model, the checkpoint
# state and the eval model
module = Module(path_to_the_training_model, state, path_to_the_eval_model, device="cpu")

# Optimizer built from the optimizer model and tied to the module
optimizer = Optimizer(path_to_the_optimizer_model, module)

In [27]:
import numpy as np
full_sequence = [0, 1, 2, 0, 1, 2, 0, 1, 2]

# Generate data
X, y = generate_sequence_data(full_sequence, sequence_length)

# NumPy copies for the ONNX Runtime training module: float32 inputs shaped
# (batch, sequence_length, 1) and int64 labels. The reshape uses
# sequence_length rather than a literal 16 so it stays in sync with the
# generator call above.
X_train = X.detach().numpy().reshape(-1, sequence_length, 1)
y_train = y.detach().numpy()


In [28]:

# Define epochs and batch size
epochs = 100
batch_size = 8  # You can adjust the batch size as needed

# Training loop over mini-batches of (X_train, y_train)
for epoch in range(epochs):
    for start in range(0, len(X_train), batch_size):
        # Slice out the current mini-batch (the last one may be shorter)
        stop = start + batch_size
        batch_X, batch_y = X_train[start:stop], y_train[start:stop]

        # Set the module to training mode
        module.train()

        # Call the module with inputs and targets; the result is the loss
        # (assuming the module accepts input and target)
        training_loss = module(batch_X, batch_y)

        # Apply the optimizer update
        optimizer.step()

        # Reset gradients
        module.lazy_reset_grad()

    # Every 10th epoch, report the loss of that epoch's last mini-batch
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {training_loss}')


Epoch 10/100, Training Loss: 0.053898733109235764
Epoch 20/100, Training Loss: 0.026195356622338295
Epoch 30/100, Training Loss: 0.016082679852843285
Epoch 40/100, Training Loss: 0.011428603902459145
Epoch 50/100, Training Loss: 0.008008749224245548
Epoch 60/100, Training Loss: 0.0059097083285450935
Epoch 70/100, Training Loss: 0.004517824854701757
Epoch 80/100, Training Loss: 0.0035393708385527134
Epoch 90/100, Training Loss: 0.0028287656605243683
Epoch 100/100, Training Loss: 0.002304638037458062


In [29]:
# Write the checkpoint state back to the same path it was loaded from.
CheckpointState.save_checkpoint(state, path_to_the_checkpoint_artifact)


In [30]:
# List the output names reported by the training module.
module.output_names()

['onnx::loss::2']

In [31]:
# Graph output to keep in the inference model. 'output' is the name given via
# output_names=['output'] when the PyTorch model was exported to ONNX.
output_names = ['output']  # List of output names

# Export the model for inferencing; the file is written as 'inference.onnx'
module.export_model_for_inferencing('inference.onnx', output_names)


In [32]:
# First three training inputs and their labels.
X_train[0:3], y_train[0:3]

(array([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]],
 
        [[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.]],
 
        [[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.],
         [2.]]], dtype=float32),
 array([1, 2, 0]))

In [98]:
import onnxruntime

# Load the model written by module.export_model_for_inferencing. That call
# saves to 'inference.onnx'; the previous name 'inference_model.onnx' is never
# produced by this notebook and only worked if a stale file was lying around.
ort_session = onnxruntime.InferenceSession("inference.onnx", providers=["CPUExecutionProvider"])
# ort_session = onnxruntime.InferenceSession("classification_model.onnx", providers=["CPUExecutionProvider"])


# Feed the first five training rows under the name of the session's first input
ort_inputs = {ort_session.get_inputs()[0].name: X_train[0:5]}

ort_outs = ort_session.run(None, ort_inputs)

In [99]:
# Predicted class per row: index of the largest value along axis 1 of the first output.
ort_outs[0].argmax(axis=1)

array([1, 2, 0, 1, 2])

In [100]:
# Labels of the same five rows, for comparison with the predictions.
y_train[0:5]

array([1, 2, 0, 1, 2])