In [2]:
import os
import numpy as np
import torch
import pickle
import time

import torch
from torch.utils.data import DataLoader, Dataset
from models import Pose2AudioTransformer
from transformers import EncodecModel
from utils import DanceToMusic
from datetime import datetime
from torch.optim import Adam

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# assign GPU or CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")

model_id = "facebook/encodec_24khz"
encodec_model = EncodecModel.from_pretrained(model_id)
encodec_model.to(device)
codebook_size = encodec_model.quantizer.codebook_size
sample_rate = 24000

data_dir = "/Users/azeez/Documents/pose_estimation/DanceToMusic/data/samples/5sec_min_data"
dataset = DanceToMusic(data_dir, encoder = encodec_model, sample_rate = sample_rate, device=device)
print("Dataset size: ", len(dataset))



Dataset size:  419


In [14]:
src_pad_idx = 0
trg_pad_idx = 0
# learned_weights = '/Users/azeez/Documents/pose_estimation/DanceToMusic/weights/5_sec_best_model_weights_loss_6.733452348148122.pth' 
learned_weights = '/Users/azeez/Documents/pose_estimation/DanceToMusic/weights/5_sec_transformer_expando__best_model_5.3600.pt'
# device = torch.device("mps")
embed_size = dataset.data['poses'].shape[2] * dataset.data['poses'].shape[3]
pose_model = Pose2AudioTransformer(codebook_size, src_pad_idx, trg_pad_idx, device=device, num_layers=4, heads = 4, embed_size=embed_size, dropout = 0.1)
pose_model.load_state_dict(torch.load(learned_weights, map_location=device))
pose_model.to(device)

Pose2AudioTransformer(
  (encoder): Encoder(
    (position_embedding): Embedding(2000, 96)
    (layers): ModuleList(
      (0-3): 4 x TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=24, out_features=24, bias=False)
          (keys): Linear(in_features=24, out_features=24, bias=False)
          (queries): Linear(in_features=24, out_features=24, bias=False)
          (fc_out): Linear(in_features=96, out_features=96, bias=True)
        )
        (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=96, out_features=384, bias=True)
          (1): ReLU()
          (2): Linear(in_features=384, out_features=96, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (codebook_embedding): Embedding(1024

In [15]:
from IPython.display import Audio, display


def audioCodeToWav(audio_code, encodec_model, sample_rate = 24000, device='cpu'):
    audio_code = audio_code.reshape(1,1,2,int(audio_code.size(2)/2))
    audio_code = audio_code.to(device)
    audio_scale = [None]
    wav = encodec_model.decode(audio_code, audio_scale)
    return wav

In [17]:
audio_codes, pose, pose_mask, wav, wav_mask, _, _ = dataset[0]
output = pose_model.generate(pose.unsqueeze(0).to(device), pose_mask.to(device), max_length = audio_codes.shape[1]+1, temperature = 1)
print(output[0][:20])
print(output.shape)
wav = audioCodeToWav(output.unsqueeze(0), encodec_model, sample_rate = 24000, device=device)['audio_values']
display(Audio(wav[0].detach().numpy(), rate=24000))

tensor([295, 666, 133, 212, 861, 117,  96, 564, 916, 317, 913, 161, 909, 371,
        951, 558, 386, 421, 273, 100])
torch.Size([1, 754])


In [12]:
for i in range(5):
    audio_codes, pose, pose_mask, wav, wav_mask, _, _ = dataset[i]
    sample = pose_model.generate(pose.unsqueeze(0).to(device), pose_mask.to(device), max_length = 100)
    print(sample[0,:10])

tensor([559, 915, 985, 611, 383, 915, 915, 565, 649, 639])
tensor([615, 632, 527, 262, 765, 838, 416, 552, 293, 908])
tensor([243, 156, 388, 372, 272, 989, 810, 576, 631, 810])
tensor([258,  32, 525, 468, 264, 820, 426, 299, 734, 268])
tensor([525, 302, 319, 331, 440, 970, 319, 998, 736,  49])


In [None]:
encodec_model = encodec_model.to('cpu')
wav = audioCodeToWav(output[0], encodec_model, sample_rate = 24000)['audio_values']
print(wav.shape)
display(Audio(wav[0][0].detach().numpy(), rate=24000))

torch.Size([1, 1, 32000])


In [None]:
output[0].shape, audio_codes[0].shape

(torch.Size([60]), torch.Size([2, 759]))

In [None]:
output[0], audio_codes[0][0]

(tensor([401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401,
         401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401,
         401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401,
         401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401, 401,
         401, 401, 401, 401], device='mps:0'),
 tensor([ 121,  395,  537,  537,  662,  401,   34,  568,  844,  572,  231,  758,
          715,  637,  790,  568,  446,  657, 1021,  657,  419,  713,  322,  568,
          568,  568,  924,  560,  713,  384,  445,  754,  509,  362,  568,  434,
          797,  352,  246,  189,  568,  713,  659,  568,  568,  568,  568,  659,
          560,  169,  560,  701,  788,  659,  817,  437,  560,  531,  560,  782,
          568,  568,  568,  560,  543,  654,  631,  152,  152,  715,  388,  388,
          388,  366,  844,  568,  388,  388,  388,  388,  213,  213,  213,  560,
          388,  388,  659,  790,  830,  713, 1021,  790,  322,  560,  

In [None]:
print(output.shape)
print(output[0].reshape(1,1,2,int(output.size(1)/2)).shape)
wav = audioCodeToWav(output.unsqueeze(0), encodec_model, sample_rate = 24000, device=device)['audio_values']
display(Audio(wav[0].detach().numpy(), rate=24000))

torch.Size([1, 754])
torch.Size([1, 1, 2, 377])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)