# LSTM Training with PyTorch

- https://arxiv.org/pdf/1605.06921.pdf
- https://kcimc.medium.com/discrete-figures-7d9e9c275c47
- [BVH File Format](https://research.cs.wisc.edu/graphics/Courses/cs-838-1999/Jeff/BVH.html)


In [1]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 4090 (UUID: GPU-59ba7a4d-461d-6c44-7eea-a4200c322183)


In [2]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [5]:
!mkdir -p train_data
!curl -o train_data/flute2.bvh https://enigmeta.s3.amazonaws.com/2023-hands/mocap/Flute2Slower_mixamo.bvh
!shasum train_data/flute2.bvh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 26.9M  100 26.9M    0     0  5917k      0  0:00:04  0:00:04 --:--:-- 6666k
9ce72d7339c5b93d89952168f0d82aeaf62f4480  train_data/flute2.bvh


In [3]:
def find_line(lines, prefix):
  for line in lines:
    if line.startswith('Frames:'):
      return line

In [4]:
def read_bvh_file(file_path):
    # Read file contents
    with open(file_path, 'r') as f:
        file_contents = f.read()
        
    # Split file contents by newline characters
    lines = file_contents.split('\n')

    # Find the channel names
    channel_names = []
    joint_name = None
    for line in lines:
      line = line.strip()
      if line.startswith('JOINT') or line.startswith('ROOT'):
        # Joint line looks like this:
        # JOINT Spine2
        joint_data = line.split(' ')
        joint_name = joint_data[1]
      if line.startswith('CHANNELS'):
        # Channels line looks like this:
        # CHANNELS 3 Yrotation Xrotation Zrotation
        channel_data = line.split(' ')
        for channel in channel_data[2:]:
          channel_names.append(f'{joint_name}_{channel}')
    
    # Find the number of frames and the start of the motion data
    num_frames_line = find_line(lines, 'Frames:')
    num_frames = int(num_frames_line.split(' ')[1])
    motion_data_index = lines.index('MOTION') + 3
    header = '\n'.join(lines[:motion_data_index])
    
    # Find the number of channels in the file
    first_frame_data = lines[motion_data_index].strip().split(' ')
    num_channels = len(first_frame_data)
    print('Channels:', num_channels)

    # Extract the motion data as a string
    motion_data_str = ''.join(lines[motion_data_index:motion_data_index+num_frames])
    
    # Convert the motion data to a numpy array
    motion_data = np.fromstring(motion_data_str, sep=' ')
    motion_data = motion_data.reshape((num_frames, -1))
    
    # Convert the numpy array to a PyTorch tensor
    motion_tensor = torch.tensor(motion_data, dtype=torch.float32)
    
    return motion_tensor, header, channel_names

In [6]:
raw_data, header, channel_names = read_bvh_file('train_data/flute2.bvh')
print(raw_data.shape)
print(channel_names)

Channels: 183
torch.Size([19298, 183])
['Hips_Xposition', 'Hips_Yposition', 'Hips_Zposition', 'Hips_Yrotation', 'Hips_Xrotation', 'Hips_Zrotation', 'Spine_Yrotation', 'Spine_Xrotation', 'Spine_Zrotation', 'Spine1_Yrotation', 'Spine1_Xrotation', 'Spine1_Zrotation', 'Spine2_Yrotation', 'Spine2_Xrotation', 'Spine2_Zrotation', 'LeftShoulder_Yrotation', 'LeftShoulder_Xrotation', 'LeftShoulder_Zrotation', 'LeftArm_Yrotation', 'LeftArm_Xrotation', 'LeftArm_Zrotation', 'LeftForeArm_Yrotation', 'LeftForeArm_Xrotation', 'LeftForeArm_Zrotation', 'LeftHand_Yrotation', 'LeftHand_Xrotation', 'LeftHand_Zrotation', 'LeftInHandIndex_Yrotation', 'LeftInHandIndex_Xrotation', 'LeftInHandIndex_Zrotation', 'LeftHandIndex1_Yrotation', 'LeftHandIndex1_Xrotation', 'LeftHandIndex1_Zrotation', 'LeftHandIndex2_Yrotation', 'LeftHandIndex2_Xrotation', 'LeftHandIndex2_Zrotation', 'LeftHandIndex3_Yrotation', 'LeftHandIndex3_Xrotation', 'LeftHandIndex3_Zrotation', 'LeftInHandMiddle_Yrotation', 'LeftInHandMiddle_Xrotat

In [9]:
# Save the data back to CSV format
#np.savetxt('flute2.csv', raw_data, delimiter=',', header=','.join(channel_names), comments='', fmt='%.5f')

In [10]:
class BVHDataset(Dataset):
    def __init__(self, file_path, input_size, output_size, seq_length, future_delta):
        self.file_path = file_path
        self.input_size = input_size
        self.output_size = output_size
        self.seq_length = seq_length
        self.future_delta = future_delta

        # Read BVH file
        self.motion_tensor, self.header, self.channel_names = read_bvh_file(file_path)

        # Compute the total number of sequences in the file
        self.total_sequences = len(self.motion_tensor) - self.seq_length - self.future_delta

        # Compute input_mean and input_std
        self.input_mean = torch.mean(self.motion_tensor, dim=(0,))
        self.input_std = torch.std(self.motion_tensor, dim=(0,))
        self.input_std = torch.where(self.input_std == 0, torch.tensor(1e-7), self.input_std)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        # Compute the sequence index for the given index
        seq_idx = idx + self.seq_length

        # Get the sequence of length seq_length as input x
        input_tensor = self.motion_tensor[seq_idx - self.seq_length:seq_idx, :self.input_size]

        # Get the frame future_delta frames into the future as output y
        output_tensor = self.motion_tensor[seq_idx + self.future_delta, :self.output_size]

        # Normalize input and output tensors
        input_tensor = (input_tensor - self.input_mean) / self.input_std
        output_tensor = (output_tensor - self.input_mean[:self.output_size]) / self.input_std[:self.output_size]

        return input_tensor, output_tensor


In [11]:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super(LSTM, self).__init__()        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()        
                
    def forward(self, x):
        # x shape: [batch_size, seq_len, input_size]

        # LSTM layer
        lstm_out, _ = self.lstm(x)

        # Linear output
        output = self.fc(lstm_out[:, -1, :]) # get the last output from the LSTM sequence
        output = self.relu(output)   

        return output


In [12]:
# Define hyperparameters
num_channels = raw_data.shape[-1]
input_size = num_channels
hidden_size = 256
output_size = num_channels
batch_size = 32
learning_rate = 0.001
seq_length = 100
future_delta = 200

# Define model, loss function, and optimizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTM(input_size=input_size, hidden_size=hidden_size, output_size=output_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Create data loaders
train_dataset = BVHDataset('train_data/flute2.bvh', input_size, output_size, seq_length, future_delta)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Channels: 183


In [13]:
# Train the model
num_epochs = 100
for epoch in range(num_epochs):
    print('Epoch: %3d' % (epoch + 1), end="")
    running_loss = 0.0
    for i, data in enumerate(train_loader):
        # Get the inputs and targets
        inputs, targets = data
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:  # print a dot after every 100 mini-batches
            print(".", end="")
            running_loss = 0.0

    # Print epoch statistics
    print(' [Epoch %d] Loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))
    running_loss = 0.0



Epoch:   1..... [Epoch 1] Loss: 0.065
Epoch:   2..... [Epoch 2] Loss: 0.072
Epoch:   3..... [Epoch 3] Loss: 0.071
Epoch:   4..... [Epoch 4] Loss: 0.068
Epoch:   5..... [Epoch 5] Loss: 0.070
Epoch:   6..... [Epoch 6] Loss: 0.066
Epoch:   7..... [Epoch 7] Loss: 0.067
Epoch:   8..... [Epoch 8] Loss: 0.075
Epoch:   9..... [Epoch 9] Loss: 0.065
Epoch:  10..... [Epoch 10] Loss: 0.065
Epoch:  11..... [Epoch 11] Loss: 0.065
Epoch:  12..... [Epoch 12] Loss: 0.064
Epoch:  13..... [Epoch 13] Loss: 0.064
Epoch:  14..... [Epoch 14] Loss: 0.065
Epoch:  15..... [Epoch 15] Loss: 0.065
Epoch:  16..... [Epoch 16] Loss: 0.065
Epoch:  17..... [Epoch 17] Loss: 0.063
Epoch:  18..... [Epoch 18] Loss: 0.065
Epoch:  19..... [Epoch 19] Loss: 0.068
Epoch:  20..... [Epoch 20] Loss: 0.063
Epoch:  21..... [Epoch 21] Loss: 0.068
Epoch:  22..... [Epoch 22] Loss: 0.063
Epoch:  23..... [Epoch 23] Loss: 0.066
Epoch:  24..... [Epoch 24] Loss: 0.063
Epoch:  25..... [Epoch 25] Loss: 0.066
Epoch:  26..... [Epoch 26] Loss: 0

In [14]:
# Save the model
torch.save(model.state_dict(), 'my_model.pt')

In [15]:
def write_bvh_file(file_name, header, predicted_motion):
    # Open the file for writing
    with open(file_name, 'w') as f:
        # Write the header
        f.write(header)
        f.write('\n')
        # Write the motion data
        for frame in predicted_motion:
            frame_str = ' '.join(str(x) for x in frame)
            f.write(frame_str + '\n')


In [16]:
# Generate predicted motion sequence
with torch.no_grad():
    model.eval() # set the model to evaluation mode
    predicted_motion = [] # initialize list for predicted motion
    # Use the last frame in the training data as the starting point for the prediction
    current_frames = train_dataset.motion_tensor[-seq_length:]
    # Normalize the starting frame
    current_frames = (current_frames - train_dataset.input_mean) / train_dataset.input_std
    for i in range(1000):
        # Reshape the current frame for the model
        input_tensor = torch.Tensor(current_frames).unsqueeze(0).to(device)
        print(input_tensor.shape)
        # Get the predicted frame from the model
        output_tensor = model(input_tensor).squeeze(0)
        # Unnormalize the predicted frame
        output_tensor = output_tensor.cpu() * train_dataset.input_std + train_dataset.input_mean
        # Add the predicted frame to the list
        predicted_motion.append(output_tensor.numpy())
        # Update the current frame
        current_frames = np.concatenate((current_frames[1:], output_tensor[None, :]), axis=0)
        current_frames = current_frames[-100:]
        #current_frames = np.concatenate((current_frame[1:], output_tensor.cpu().numpy()), axis=0)
    # Convert the predicted motion to a numpy array
    predicted_motion = np.array(predicted_motion)


torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([

torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([

torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([1, 100, 183])
torch.Size([

In [17]:
predicted_motion.shape

(1000, 183)

In [18]:
# Save predicted motion to BVH file
write_bvh_file('out.bvh', train_dataset.header, predicted_motion)
