In [1]:
import os
import numpy as np
import torch
import pickle
import time
import sys

current_directory = os.getcwd()
models_dir = os.path.join(current_directory, '..')
print(models_dir)
sys.path.append(models_dir)

import torch
from torch.utils.data import DataLoader, Dataset
from models import Pose2AudioTransformer
from transformers import EncodecModel
from utils import DanceToMusic
from datetime import datetime
from torch.optim import Adam

/home/azeez/Documents/projects/DanceToMusicApp/ml/notebooks/..


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# assign GPU or CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")

model_id = "facebook/encodec_24khz"
encodec_model = EncodecModel.from_pretrained(model_id)
encodec_model.to(device)
codebook_size = encodec_model.quantizer.codebook_size
sample_rate = 24000

# data_dir = "/Users/azeez/Documents/pose_estimation/DanceToMusicApp/ml/data/samples/5sec_expando_dnb"
data_dir = "/home/azeez/Documents/projects/DanceToMusicApp/ml/data/samples/5sec_expando_dnb_min_training_data"
dataset = DanceToMusic(data_dir, encoder = encodec_model, sample_rate = sample_rate, device=device)
print("Dataset size: ", len(dataset))



Dataset size:  1265


In [3]:
total_params_encodec = sum(p.numel() for p in encodec_model.parameters())
print(f"Total parameters in encodec model: {total_params_encodec}")

Total parameters in encodec model: 14851810


In [4]:
src_pad_idx = 0
trg_pad_idx = 0
learned_weights = '/Users/azeez/Documents/pose_estimation/DanceToMusicApp/ml/model_weights/gen_5_sec_dnb__best_model_11258.7658.pt'
learned_weights  = '/home/azeez/Documents/projects/DanceToMusicApp/ml/model_weights/gen_5_sec_dnb__best_model_0.0328.pt'
# device = torch.device("mps")
embed_size = dataset.data['poses'].shape[2] * dataset.data['poses'].shape[3]
print(dataset.data['poses'].shape)
pose_model = Pose2AudioTransformer(codebook_size, src_pad_idx, trg_pad_idx, device=device, num_layers=4, heads = 4, embed_size=embed_size, dropout = 0.1)
print("Total parameters: ", sum(p.numel() for p in pose_model.parameters()))
pose_model.load_state_dict(torch.load(learned_weights, map_location=device))
pose_model.to(device)

torch.Size([1265, 120, 32, 3])
Total parameters:  1429472


Pose2AudioTransformer(
  (encoder): Encoder(
    (position_embedding): Embedding(2000, 96)
    (layers): ModuleList(
      (0-3): 4 x TransformerBlock(
        (attention): SelfAttention(
          (values): Linear(in_features=24, out_features=24, bias=False)
          (keys): Linear(in_features=24, out_features=24, bias=False)
          (queries): Linear(in_features=24, out_features=24, bias=False)
          (fc_out): Linear(in_features=96, out_features=96, bias=True)
        )
        (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=96, out_features=384, bias=True)
          (1): ReLU()
          (2): Linear(in_features=384, out_features=96, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): Decoder(
    (codebook_embedding): Embedding(1024

In [26]:
from IPython.display import Audio, display


def audioCodeToWav(audio_code, encodec_model, sample_rate = 24000, device='cpu'):
    audio_code = audio_code.reshape(1,1,2,int(audio_code.shape[2]))
    audio_code = audio_code.to(device)
    audio_scale = [None]
    print(audio_code.shape)
    print(audio_code.device, encodec_model.device)
    wav = encodec_model.decode(audio_code, audio_scale)
    return wav

In [28]:
audio_codes, pose, pose_mask, wav, wav_mask, wav_path, sr = dataset[0]
print(f"Pose shape: {pose.shape}, Pose mask shape: {pose_mask.shape}, Audio code shape: {audio_codes.shape}, Audio wav shape: {wav.shape}, Audio wav mask shape: {wav_mask.shape}")
print(audio_codes.shape, )
output = pose_model.generate(pose.unsqueeze(0).to(device), pose_mask.to(device), max_length = audio_codes.shape[0], temperature = 1)
print(output[0][:20])
print(output.shape)
wav = audioCodeToWav(output.unsqueeze(0), encodec_model, sample_rate = 24000, device=device)['audio_values']
display(Audio(wav[0].detach().to('cpu').numpy(), rate=24000))

Pose shape: torch.Size([120, 32, 3]), Pose mask shape: torch.Size([120]), Audio code shape: (377, 2), Audio wav shape: torch.Size([1, 120001]), Audio wav mask shape: torch.Size([120001])
(377, 2)
tensor([[ 59, 291],
        [148, 953],
        [446, 878],
        [817,  60],
        [922, 720],
        [945, 877],
        [903, 367],
        [769, 582],
        [248, 589],
        [413,  23],
        [  4, 111],
        [419, 481],
        [528,   9],
        [661, 419],
        [948, 663],
        [231, 946],
        [164, 440],
        [129, 249],
        [884, 117],
        [978, 375]], device='cuda:0')
torch.Size([1, 377, 2])
torch.Size([1, 1, 2, 377])
cuda:0 cuda:0


In [None]:
from IPython.display import Video

vid_path = wav_path.split('.')[0]+'_with_audio.mp4'
print(vid_path)
Video(vid_path)

/Users/azeez/Documents/pose_estimation/DanceToMusic/data/samples/5sec_expando_dataset/youtube_links_67_14_sample/youtube_links_67_14_with_audio.mp4


In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import numpy as np
import soundfile as sf
from scipy.io import wavfile
import os

# Assuming `wav` is a PyTorch tensor with your new audio data
# and `vid_path` is the path to your original video file.

# Specify the path to save the output video and the temporary audio
generated_output_video = '../assets/sample_generated_audio.mp4'
og_output_video = '../assets/sample_original.mp4'
temp_audio_dir = '../assets'
temp_audio_path = os.path.join(temp_audio_dir, 'generated_audio.wav')

os.makedirs(temp_audio_dir, exist_ok=True)

wav_np = wav[0].detach().cpu().numpy()
max_val = np.max(np.abs(wav_np))
normalized_wav = wav_np / max_val
scaled_wav = np.int16(normalized_wav * 32767)
wavfile.write(filename=temp_audio_path, rate=24000, data=scaled_wav.T)

# Save the original video clip to the assets folder 
original_video_clip = VideoFileClip(vid_path)
original_video_clip.write_videofile(og_output_video)

# Now create the video clip with the new audio
video_clip = VideoFileClip(vid_path)
new_audio_clip = AudioFileClip(temp_audio_path)

new_audio_clip = CompositeAudioClip([new_audio_clip])
video_clip.audio = new_audio_clip
video_clip.write_videofile(generated_output_video)

# Close the clips to release their resources
video_clip.close()
new_audio_clip.close()
original_video_clip.close()

# os.remove(temp_audio_path)

Video(generated_output_video)

Moviepy - Building video ../assets/sample_original.mp4.
MoviePy - Writing audio in sample_originalTEMP_MPY_wvf_snd.mp3


                                                        

MoviePy - Done.
Moviepy - Writing video ../assets/sample_original.mp4



                                                               

Moviepy - Done !
Moviepy - video ready ../assets/sample_original.mp4
Moviepy - Building video ../assets/sample_generated_audio.mp4.
MoviePy - Writing audio in sample_generated_audioTEMP_MPY_wvf_snd.mp3


                                                        

MoviePy - Done.
Moviepy - Writing video ../assets/sample_generated_audio.mp4



                                                               

Moviepy - Done !
Moviepy - video ready ../assets/sample_generated_audio.mp4


