# Transformer Training on Pose Data

In [1]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce RTX 4090 (UUID: GPU-59ba7a4d-461d-6c44-7eea-a4200c322183)


In [2]:
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.distributions as D
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler


## Loading the Data

In [3]:
def find_line(lines, prefix):
  for line in lines:
    if line.startswith('Frames:'):
      return line

In [4]:
def read_bvh_file(file_path):
    # Read file contents
    with open(file_path, 'r') as f:
        file_contents = f.read()
        
    # Split file contents by newline characters
    lines = file_contents.split('\n')

    # Find the channel names
    channel_names = []
    joint_name = None
    for line in lines:
      line = line.strip()
      if line.startswith('JOINT') or line.startswith('ROOT'):
        # Joint line looks like this:
        # JOINT Spine2
        joint_data = line.split(' ')
        joint_name = joint_data[1]
      if line.startswith('CHANNELS'):
        # Channels line looks like this:
        # CHANNELS 3 Yrotation Xrotation Zrotation
        channel_data = line.split(' ')
        for channel in channel_data[2:]:
          channel_names.append(f'{joint_name}_{channel}')
    
    # Find the number of frames and the start of the motion data
    num_frames_line = find_line(lines, 'Frames:')
    num_frames = int(num_frames_line.split(' ')[1])
    motion_data_index = lines.index('MOTION') + 3
    header = '\n'.join(lines[:motion_data_index])
    
    # Find the number of channels in the file
    first_frame_data = lines[motion_data_index].strip().split(' ')
    num_channels = len(first_frame_data)
    print('Channels:', num_channels)

    # Extract the motion data as a string
    motion_data_str = ''.join(lines[motion_data_index:motion_data_index+num_frames])
    
    # Convert the motion data to a numpy array
    motion_data = np.fromstring(motion_data_str, sep=' ')
    motion_data = motion_data.reshape((num_frames, -1))
    
    # Convert the numpy array to a PyTorch tensor
    motion_tensor = torch.tensor(motion_data, dtype=torch.float32)
    
    return motion_tensor, header, channel_names

In [5]:
raw_data, header, channel_names = read_bvh_file('train_data/flute2.bvh')
print(raw_data.shape)
#print(channel_names)

Channels: 183
torch.Size([19298, 183])


## Building a PyTorch Dataset

In [6]:
class BVHDataset(Dataset):
    def __init__(self, file_path, input_size, output_size, seq_length, future_delta):
        self.file_path = file_path
        self.input_size = input_size
        self.output_size = output_size
        self.seq_length = seq_length
        self.future_delta = future_delta

        # Read BVH file
        self.motion_tensor, self.header, self.channel_names = read_bvh_file(file_path)

        # Compute the total number of sequences in the file
        self.total_sequences = len(self.motion_tensor) - self.seq_length - self.future_delta

        # Compute input_mean and input_std
        self.input_mean = torch.mean(self.motion_tensor, dim=(0,))
        self.input_std = torch.std(self.motion_tensor, dim=(0,))
        self.input_std = torch.where(self.input_std == 0, torch.tensor(1e-7), self.input_std)

    def __len__(self):
        return self.total_sequences

    def __getitem__(self, idx):
        # Compute the sequence index for the given index
        seq_idx = idx + self.seq_length

        # Get the sequence of length seq_length as input x
        input_tensor = self.motion_tensor[seq_idx - self.seq_length:seq_idx, :self.input_size]

        # Get the frame future_delta frames into the future as output y
        output_tensor = self.motion_tensor[seq_idx + self.future_delta, :self.output_size]

        # Normalize input and output tensors
        input_tensor = (input_tensor - self.input_mean) / self.input_std
        output_tensor = (output_tensor - self.input_mean[:self.output_size]) / self.input_std[:self.output_size]

        return input_tensor, output_tensor


In [33]:
# Create data loaders

num_channels = raw_data.shape[-1]
input_size = num_channels
output_size = num_channels
seq_length = 100
future_delta = 1
batch_size = 32


dataset = BVHDataset('train_data/flute2.bvh', input_size, output_size, seq_length, future_delta)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Channels: 183


## Defining the Model

In [42]:
import torch
import torch.nn as nn

def create_position_encoding(sequence_length, model_dim):
    position_encoding = torch.zeros(sequence_length, model_dim)
    positions = torch.arange(0, sequence_length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (-math.log(10000.0) / model_dim))
    position_encoding[:, 0::2] = torch.sin(positions * div_term)
    position_encoding[:, 1::2] = torch.cos(positions * div_term)
    position_encoding = position_encoding.unsqueeze(0).transpose(0, 1)
    return position_encoding.to(device)


class PoseTransformer(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, output_dim, sequence_length):
        super(PoseTransformer, self).__init__()

        self.model_dim = model_dim
        self.sequence_length = sequence_length

        self.embedding = nn.Linear(input_dim, model_dim)
        self.position_encoding = create_position_encoding(sequence_length, model_dim)

        self.transformer = nn.Transformer(
            d_model=model_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=model_dim * 4,
            dropout=0.1
        )
        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, x, tgt):
        # x shape: (batch_size, sequence_length, input_dim)
        #print("x shape:", x.shape)
        #print("self.embedding(x) shape:", self.embedding(x).shape)
        #print("self.position_encoding shape:", self.position_encoding.shape)


        x = self.embedding(x) + self.position_encoding.repeat(1, x.size(0), 1).permute(1, 0, 2)
        #x = self.embedding(x) + self.position_encoding.repeat(x.size(0), 1, 1)
        # x shape: (sequence_length, batch_size, model_dim)
        #x = x.permute(1, 0, 2)
        
        # tgt shape: (batch_size, sequence_length, input_dim)
        #tgt = self.embedding(tgt) + self.position_encoding.repeat(tgt.size(0), 1, 1)
        # tgt shape: (sequence_length, batch_size, model_dim)
        #tgt = tgt.permute(1, 0, 2)
        
        # Pass the input through the transformer
        # If tgt is None, the decoder will generate an output sequence based on the encoder's output
        output = self.transformer(x, x)

        # Output shape: (batch_size, sequence_length, input_dim)
        output = self.fc_out(output).permute(1, 0, 2)

        return output

# Hyperparameters
input_dim = 183
output_dim = 183
model_dim = 512
num_heads = 8
num_layers = 6
sequence_length = 100

# Create the model
model = PoseTransformer(input_dim, model_dim, num_heads, num_layers, output_dim, sequence_length)


In [43]:
# Loss function
criterion = nn.MSELoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate scheduler (optional)
scheduler = StepLR(optimizer, step_size=10, gamma=0.9)

# Training parameters
num_epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Training on {device}')

# Move the model to the device
model.to(device)

# Main training loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    for batch_idx, (input_seq, target_seq) in enumerate(tqdm(train_loader)):
        input_seq, target_seq = input_seq.to(device), target_seq.to(device)

        optimizer.zero_grad()

        # Pass only the input_seq to the model during training
        output = model(input_seq, input_seq)

        loss = criterion(output, target_seq)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.6f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_idx, (input_seq, target_seq) in enumerate(val_loader):
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            output = model(input_seq, input_seq)
            loss = criterion(output, target_seq)
            val_loss += loss.item()
            
        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.6f}")

    # Update learning rate
    scheduler.step()


Training on cuda


  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 480/480 [00:22<00:00, 21.73it/s]


Epoch 1/50, Train Loss: 0.857700
Epoch 1/50, Validation Loss: 0.825883


100%|██████████| 480/480 [00:22<00:00, 21.36it/s]


Epoch 2/50, Train Loss: 0.837075
Epoch 2/50, Validation Loss: 0.824164


100%|██████████| 480/480 [00:22<00:00, 21.59it/s]


Epoch 3/50, Train Loss: 0.835649
Epoch 3/50, Validation Loss: 0.823603


100%|██████████| 480/480 [00:22<00:00, 21.81it/s]


Epoch 4/50, Train Loss: 0.835242
Epoch 4/50, Validation Loss: 0.823926


100%|██████████| 480/480 [00:21<00:00, 21.89it/s]


Epoch 5/50, Train Loss: 0.835118
Epoch 5/50, Validation Loss: 0.823721


100%|██████████| 480/480 [00:22<00:00, 21.81it/s]


Epoch 6/50, Train Loss: 0.834997
Epoch 6/50, Validation Loss: 0.823709


100%|██████████| 480/480 [00:21<00:00, 21.98it/s]


Epoch 7/50, Train Loss: 0.834992
Epoch 7/50, Validation Loss: 0.823612


100%|██████████| 480/480 [00:21<00:00, 22.04it/s]


Epoch 8/50, Train Loss: 0.834824
Epoch 8/50, Validation Loss: 0.823737


100%|██████████| 480/480 [00:21<00:00, 22.14it/s]


Epoch 9/50, Train Loss: 0.834935
Epoch 9/50, Validation Loss: 0.823576


100%|██████████| 480/480 [00:21<00:00, 22.02it/s]


Epoch 10/50, Train Loss: 0.834874
Epoch 10/50, Validation Loss: 0.823503


100%|██████████| 480/480 [00:21<00:00, 22.17it/s]


Epoch 11/50, Train Loss: 0.834857
Epoch 11/50, Validation Loss: 0.823549


100%|██████████| 480/480 [00:21<00:00, 22.15it/s]


Epoch 12/50, Train Loss: 0.834795
Epoch 12/50, Validation Loss: 0.823672


100%|██████████| 480/480 [00:21<00:00, 22.08it/s]


Epoch 13/50, Train Loss: 0.834740
Epoch 13/50, Validation Loss: 0.823589


100%|██████████| 480/480 [00:21<00:00, 22.22it/s]


Epoch 14/50, Train Loss: 0.834821
Epoch 14/50, Validation Loss: 0.823517


100%|██████████| 480/480 [00:21<00:00, 22.19it/s]


Epoch 15/50, Train Loss: 0.834848
Epoch 15/50, Validation Loss: 0.823656


100%|██████████| 480/480 [00:21<00:00, 21.97it/s]


Epoch 16/50, Train Loss: 0.834951
Epoch 16/50, Validation Loss: 0.823569


100%|██████████| 480/480 [00:22<00:00, 21.70it/s]


Epoch 17/50, Train Loss: 0.834887
Epoch 17/50, Validation Loss: 0.823589


100%|██████████| 480/480 [00:21<00:00, 21.96it/s]


Epoch 18/50, Train Loss: 0.834805
Epoch 18/50, Validation Loss: 0.823801


100%|██████████| 480/480 [00:21<00:00, 21.91it/s]


Epoch 19/50, Train Loss: 0.834919
Epoch 19/50, Validation Loss: 0.823706


100%|██████████| 480/480 [00:21<00:00, 22.00it/s]


Epoch 20/50, Train Loss: 0.834883
Epoch 20/50, Validation Loss: 0.823574


100%|██████████| 480/480 [00:22<00:00, 21.42it/s]


Epoch 21/50, Train Loss: 0.834885
Epoch 21/50, Validation Loss: 0.823459


100%|██████████| 480/480 [00:22<00:00, 21.42it/s]


Epoch 22/50, Train Loss: 0.834825
Epoch 22/50, Validation Loss: 0.823557


100%|██████████| 480/480 [00:22<00:00, 21.41it/s]


Epoch 23/50, Train Loss: 0.834850
Epoch 23/50, Validation Loss: 0.823548


100%|██████████| 480/480 [00:22<00:00, 21.41it/s]


Epoch 24/50, Train Loss: 0.834796
Epoch 24/50, Validation Loss: 0.823524


100%|██████████| 480/480 [00:22<00:00, 21.39it/s]


Epoch 25/50, Train Loss: 0.834881
Epoch 25/50, Validation Loss: 0.823495


100%|██████████| 480/480 [00:22<00:00, 21.38it/s]


Epoch 26/50, Train Loss: 0.834773
Epoch 26/50, Validation Loss: 0.823476


100%|██████████| 480/480 [00:22<00:00, 21.35it/s]


Epoch 27/50, Train Loss: 0.834776
Epoch 27/50, Validation Loss: 0.823531


100%|██████████| 480/480 [00:22<00:00, 21.49it/s]


Epoch 28/50, Train Loss: 0.834879
Epoch 28/50, Validation Loss: 0.823744


100%|██████████| 480/480 [00:22<00:00, 21.58it/s]


Epoch 29/50, Train Loss: 0.834805
Epoch 29/50, Validation Loss: 0.823554


100%|██████████| 480/480 [00:22<00:00, 21.72it/s]


Epoch 30/50, Train Loss: 0.834740
Epoch 30/50, Validation Loss: 0.823730


100%|██████████| 480/480 [00:22<00:00, 21.72it/s]


Epoch 31/50, Train Loss: 0.834731
Epoch 31/50, Validation Loss: 0.823474


100%|██████████| 480/480 [00:22<00:00, 21.80it/s]


Epoch 32/50, Train Loss: 0.834729
Epoch 32/50, Validation Loss: 0.823442


100%|██████████| 480/480 [00:22<00:00, 21.71it/s]


Epoch 33/50, Train Loss: 0.834775
Epoch 33/50, Validation Loss: 0.823503


100%|██████████| 480/480 [00:22<00:00, 21.78it/s]


Epoch 34/50, Train Loss: 0.834894
Epoch 34/50, Validation Loss: 0.824408


100%|██████████| 480/480 [00:21<00:00, 21.82it/s]


Epoch 35/50, Train Loss: 0.834807
Epoch 35/50, Validation Loss: 0.823817


100%|██████████| 480/480 [00:21<00:00, 21.90it/s]


Epoch 36/50, Train Loss: 0.834777
Epoch 36/50, Validation Loss: 0.823557


100%|██████████| 480/480 [00:22<00:00, 21.42it/s]


Epoch 37/50, Train Loss: 0.834798
Epoch 37/50, Validation Loss: 0.823776


100%|██████████| 480/480 [00:22<00:00, 21.64it/s]


Epoch 38/50, Train Loss: 0.834738
Epoch 38/50, Validation Loss: 0.823503


100%|██████████| 480/480 [00:22<00:00, 21.42it/s]


Epoch 39/50, Train Loss: 0.834692
Epoch 39/50, Validation Loss: 0.823542


100%|██████████| 480/480 [00:22<00:00, 21.29it/s]


Epoch 40/50, Train Loss: 0.834727
Epoch 40/50, Validation Loss: 0.823616


100%|██████████| 480/480 [00:21<00:00, 21.97it/s]


Epoch 41/50, Train Loss: 0.834735
Epoch 41/50, Validation Loss: 0.823553


100%|██████████| 480/480 [00:22<00:00, 21.75it/s]


Epoch 42/50, Train Loss: 0.834829
Epoch 42/50, Validation Loss: 0.823406


100%|██████████| 480/480 [00:22<00:00, 21.77it/s]


Epoch 43/50, Train Loss: 0.834725
Epoch 43/50, Validation Loss: 0.823711


100%|██████████| 480/480 [00:22<00:00, 21.76it/s]


Epoch 44/50, Train Loss: 0.834754
Epoch 44/50, Validation Loss: 0.823525


100%|██████████| 480/480 [00:21<00:00, 21.90it/s]


Epoch 45/50, Train Loss: 0.834723
Epoch 45/50, Validation Loss: 0.823460


100%|██████████| 480/480 [00:22<00:00, 21.80it/s]


Epoch 46/50, Train Loss: 0.834724
Epoch 46/50, Validation Loss: 0.823459


100%|██████████| 480/480 [00:21<00:00, 21.82it/s]


Epoch 47/50, Train Loss: 0.834691
Epoch 47/50, Validation Loss: 0.823535


100%|██████████| 480/480 [00:21<00:00, 21.87it/s]


Epoch 48/50, Train Loss: 0.834849
Epoch 48/50, Validation Loss: 0.823424


100%|██████████| 480/480 [00:22<00:00, 21.25it/s]


Epoch 49/50, Train Loss: 0.834766
Epoch 49/50, Validation Loss: 0.823473


100%|██████████| 480/480 [00:22<00:00, 21.43it/s]


Epoch 50/50, Train Loss: 0.834709
Epoch 50/50, Validation Loss: 0.823756


In [64]:
torch.save(model.state_dict(), 'transformer_model.pt')

## Inference

In [80]:

generated_sequence = seed_sequence.clone()

    # with torch.no_grad():
    #     for _ in range(num_frames_to_generate):
    #         # Get the last sequence_length frames from the generated_sequence
    #         input_seq = generated_sequence[:, -sequence_length:, :]

    #         # Predict the next frame
    #         next_frame = model(input_seq)

    #         # Reshape the predicted frame to have the same dimensions as input_seq
    #         next_frame = next_frame.view(1, 1, -1)

    #         # Append the predicted frame to the generated_sequence
    #         generated_sequence = torch.cat((generated_sequence, next_frame), dim=1)

    # return generated_sequence


def generate_sequence(model, seed_sequence, num_frames_to_generate):
    model.eval()
    generated_sequence = seed_sequence.clone()
    #input_seq = seed_sequence.clone()

    with torch.no_grad():
        # Get the last sequence_length frames from the generated_sequence
        input_seq = generated_sequence[:, -sequence_length:, :]
        print('last', input_seq.shape)

        # Shift the input_seq by one time step to the left and remove the last frame
        #tgt_seq = input_seq[:, :-1, :] # .contiguous()
        tgt_seq = input_seq[:, -1, :] # .contiguous()
        print('tgt', tgt_seq.shape)


        # Predict the next frames
        next_frame = model(input_seq, tgt_seq)
        print('nxt', next_frame.shape)

        # Reshape the predicted frame to have the same dimensions as input_seq
        #next_frame = next_frame.view(1, 1, -1)

        # Append the predicted frame to the generated_sequence
        generated_sequence = torch.cat((generated_sequence, next_frame), dim=1)


    # Extract the generated sequence from the model's output
    generated_sequence = output_seq[:, -num_frames_to_generate:, :]

    return generated_sequence


In [81]:
#seed_sequence = torch.randn(1, sequence_length, input_dim).to(device)  # Random seed sequence

# Use the first sequence_length frames from the dataset as the seed sequence
seed_sequence = raw_data[1000:1000+sequence_length]
#print(seed_sequence.shape)
seed_sequence = (seed_sequence - dataset.input_mean) / dataset.input_std 
seed_sequence = seed_sequence.unsqueeze(0).to(device)  
#print(seed_sequence.shape)
#print(raw_data.shape)
num_frames_to_generate = 1000

generated_sequence = generate_sequence(model, seed_sequence, num_frames_to_generate)
generated_sequence = generated_sequence.squeeze(0)
generated_sequence = generated_sequence.cpu() * dataset.input_std + dataset.input_mean
generated_sequence = generated_sequence.numpy()
print(seed_sequence.shape)


last torch.Size([1, 100, 183])
tgt torch.Size([1, 183])
nxt torch.Size([100, 1, 183])


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 1 but got size 100 for tensor number 1 in the list.

In [57]:
generated_sequence.shape

(100, 1, 183)

In [16]:
def write_bvh_file(file_name, header, predicted_motion):
    # Open the file for writing
    with open(file_name, 'w') as f:
        # Write the header
        f.write(header)
        f.write('\n')
        # Write the motion data
        for frame in predicted_motion:
            frame_str = ' '.join(str(x) for x in frame)
            f.write(frame_str + '\n')

In [17]:
write_bvh_file('out_07.bvh', dataset.header, generated_sequence)
