In [None]:
from layers.preprocess import *
#tensor,sr = load_mp3_files("../dataset")
# Load one clip as a tensor. The config keys are interpreted by `load_audio`
# (star-imported from layers.preprocess); the values are unchanged.
audio_config = {
    "sr": 8000,
    "hop_length": 512,
    "cut_first": 8000,
}
tensor = load_audio("../dataset/no8/0/audio0.mp3", config=audio_config)

# Multi-file variant (goes with the commented-out load_mp3_files call above):
#for i in tensor:
    #print(f"tensor{i}.shape: {i.shape}")
print(tensor.shape)


In [None]:
from layers.tools.audios import RevSTFT
from layers.diffusion.karras import KarrasNoiseAdder
# --- Normalise the waveform and instantiate the noise adder -----------------
# The waveform is rescaled so that its overall standard deviation equals
# TARGET_STD; the pre-scaling value is printed so it can be used to undo the
# scaling when audio is written back to disk.
TARGET_STD = 0.5
raw_std = tensor.std()
print(raw_std)
tensor = tensor * TARGET_STD / raw_std
print(tensor.std())
noise_adder = KarrasNoiseAdder(sigma_data=tensor.std(dim=-1))

# --- Forward STFT -----------------------------------------------------------
f = RevSTFT(config={
    "n_fft":1024,
    "win_len":1024,
    "hop_length":512
})
# `transform` returns two tensors; both are transposed on their last two axes
# and concatenated along the last axis to form the model input `x`.
a,b = f.transform(tensor)
x = torch.cat((a.transpose(-1,-2),b.transpose(-1,-2)),dim=-1)
print(x.shape)

# --- Noise schedule ---------------------------------------------------------
# sigma_i = (SIGMA_MAX^(1/RHO) + i/(N-1) * (SIGMA_MIN^(1/RHO) - SIGMA_MAX^(1/RHO)))^RHO
# The exponent is computed once as an exact 1/RHO instead of three differently
# truncated decimal approximations of 1/3.
num_steps = 30  # Example number of steps
SIGMA_MAX = 30
SIGMA_MIN = 1e-6
RHO = 3
inv_rho = 1.0 / RHO
#sigmas = KarrasSchedule(sigma_data=tensor.std(dim=-1)).forward(num_steps,device=torch.device("cpu"))  # Create a noise schedule

steps = torch.arange(num_steps, dtype=torch.float32)
scheduled_sigmas = (
    SIGMA_MAX ** inv_rho
    + (steps / (num_steps - 1)) * (SIGMA_MIN ** inv_rho - SIGMA_MAX ** inv_rho)
) ** RHO
# Original (misspelled) name kept as an alias for any cell that still uses it.
schuduled_sigmas = scheduled_sigmas

# Sigmas run from maximum to minimum because sampling goes backward (T to 0);
# a final 0 is appended after the last scheduled value.
# Although the original paper suggested maximum=80, the expected noise range in
# training is around 0.8~3, so a maximum in that range should be used (this
# also reduces cost).
# NOTE(review): SIGMA_MAX is currently 30, which does not match the 0.8~3 range
# recommended above -- confirm which value is intended.
sigmas = torch.cat((scheduled_sigmas,scheduled_sigmas.new_zeros([1])))


In [None]:
# Show the input and schedule shapes, then run the noise adder on the
# original input tensor `x` with the full sigma schedule.
print(x.shape, sigmas.shape, sep="\n")
x_noisy = noise_adder(x, sigmas)


In [None]:

# Take the noised tensors stored on the noise adder and turn each one back
# into audio with the inverse transform.
noised_returns = noise_adder.noised_x
print(noised_returns[0].shape)

# The last axis holds the two STFT outputs concatenated side by side; the
# first 513 entries go to the first argument of `inverse`, the rest to the
# second. (513 is presumably n_fft // 2 + 1 for n_fft = 1024 -- TODO confirm.)
n_bins = 513
stft_to_audio = []
for noised in noised_returns:
    first_part = noised[:, :, :n_bins]
    second_part = noised[:, :, n_bins:]
    stft_to_audio.append(f.inverse(first_part, second_part))




In [None]:
# Shape of the first reconstructed audio tensor.
first_audio = stft_to_audio[0]
print(first_audio.shape)

import torchaudio

def save_audio(tensor, filename, sample_rate=8000, original_std=0.0563, normalized_std=0.5):
    """Undo the training-time rescaling of a waveform and write it to disk.

    The waveform is multiplied by ``original_std / normalized_std`` before it
    is saved. The defaults reproduce the previously hard-coded gain
    ``0.0563 / 0.5``.

    Args:
        tensor: Waveform tensor. A 1-D tensor gets a leading channel
            dimension added; tensors of any other rank are passed through
            unchanged.
        filename: Destination path handed to ``torchaudio.save``.
        sample_rate: Sample rate written to the file.
        original_std: Standard deviation to restore (numerator of the gain).
        normalized_std: Standard deviation the data was normalised to
            (denominator of the gain).
    """
    # Detach from autograd and move to host memory so tensors that require
    # grad or live on another device can be written as well.
    tensor = tensor.detach().cpu()
    tensor = tensor * original_std / normalized_std
    if tensor.dim() == 1:
        tensor = tensor.unsqueeze(0)  # Add a channel dimension

    # Save the audio file
    torchaudio.save(filename, tensor, sample_rate)

import os
# Write every reconstructed clip to audio_samples/, one file per list index.
os.makedirs("audio_samples", exist_ok=True)
for i in range(len(stft_to_audio)):
    audio = stft_to_audio[i]
    save_audio(audio, f"audio_samples/output_audio{i}.wav", sample_rate=8000)



In [None]:
# Standard deviation of the waveform tensor along its last dimension.
std_last_dim = tensor.std(dim=-1)
print(std_last_dim)

In [None]:
# Cut the waveform into fixed-length chunks, shuffle them reproducibly and
# build the train / test DataLoaders.
from torch.utils.data import TensorDataset,DataLoader

#tensor_stack = torch.cat(tensor,dim=-1)
tensor_stack = tensor

print(f"tensor_stack.shape: {tensor_stack.shape}")

# Chunk length in samples: 512 * 20 = 10240, i.e. 1.28 s at the 8000 Hz
# sample rate passed to load_audio.
ck_len = 512*20

chunks = create_overlapping_chunks_tensor(tensor_stack,chunk_len=ck_len)
print(chunks.shape)
x= chunks
print(f"x: {x.shape}")

# Split by fraction instead of fixed row indices: the previous slices
# ([:3350] / [3450:]) dropped 100 chunks and produced an empty test set
# whenever fewer than 3450 chunks were available.
TRAIN_FRACTION = 0.9
SPLIT_SEED = 0

n_chunks = x.size(0)
if n_chunks < 2:
    raise ValueError(f"Need at least 2 chunks to build a train/test split, got {n_chunks}")
# Clamp so that both splits always contain at least one chunk.
n_train = min(max(int(n_chunks * TRAIN_FRACTION), 1), n_chunks - 1)

# A dedicated, seeded generator makes the split identical across kernel
# restarts without touching the global RNG state.
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
indices = torch.randperm(n_chunks, generator=split_rng)

shuffled_x = x[indices]

dSet = {
    'x': shuffled_x[:n_train,:],
    'x_test': shuffled_x[n_train:,:],
}
# Each dataset wraps a single tensor, so every item / batch is a 1-tuple.
trainDataset,testDataset = TensorDataset(dSet['x']),TensorDataset(dSet['x_test'])
dLoader,dLoader_test = DataLoader(trainDataset,batch_size=32,shuffle=True),DataLoader(testDataset,batch_size=32,shuffle=False)


In [None]:

from layers.core import net
# Hyper-parameters.
LR = 1e-4
num_epochs = 500
device = torch.device('cuda:0')

# Model, loss and optimizer. sequence_length is 512*20 samples.
model = net(sequence_length=512*20, num_blocks=6, activation='silu')
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Per-epoch loss history, filled in by the training loop.
train_losses, test_losses = [], []



In [None]:

import os
from datetime import datetime

import wandb

current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Checkpoints are written under models/; create the directory up front so the
# first torch.save cannot fail on a missing folder.
os.makedirs("models", exist_ok=True)

wandb.init(project="audio-gen", config={
    "epochs": num_epochs,
    # Log the real per-step batch size; len(dLoader) is the number of batches.
    "batch_size": dLoader.batch_size,
    "learning_rate": LR,
    "device": "cuda RTX 3080 Ti",
    "ck_len": ck_len,
    "num_blocks": 6,
    },
    name=f"run_{current_time}"
)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    # NOTE(review): the DataLoader built earlier in this notebook wraps a
    # single tensor (TensorDataset(dSet['x'])), which yields 1-tuples, while
    # this loop unpacks (inputs, labels) pairs -- confirm the dataset really
    # provides labels before running.
    for inputs, labels in dLoader:

        # Warn (but keep going) if the batch already contains NaNs.
        if torch.any(torch.isnan(inputs)) or torch.any(torch.isnan(labels)):
            print("Input or labels contain NaN values.")

        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization (gradient norm clipped to 10)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
        optimizer.step()
        wandb.log({'single_loss': loss.item()})
        # Accumulate loss
        running_loss += loss.item()

    # Average loss for the epoch
    epoch_loss = running_loss / len(dLoader)
    train_losses.append(epoch_loss)
    wandb.log({"epoch_train_loss": epoch_loss})
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}')

    # Every 5th epoch (epoch index 0, 5, 10, ...): checkpoint and evaluate.
    # File names use epoch+1, so the suffixes are 1, 6, 11, ...
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f'models/modelDict_epoch_{epoch+1}.pth')
        torch.save(model, f'models/model_epoch_{epoch+1}.pth')

        # Gradients are left over from the last training step and do not
        # change during evaluation, so scan them once per evaluation instead
        # of once per test batch.
        for name, param in model.named_parameters():
            if param.grad is not None and torch.any(torch.isnan(param.grad)):
                print(f"Gradient for {name} contains NaN values.")

        model.eval()
        running_loss_test = 0.0

        with torch.no_grad():
            for inputs, labels in dLoader_test:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                running_loss_test += loss.item()

        # Average validation loss for the epoch
        epoch_test_loss = running_loss_test / len(dLoader_test)
        test_losses.append(epoch_test_loss)
        wandb.log({"eval_loss":epoch_test_loss})

        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {epoch_test_loss:.4f}')
    
# At the end, you can plot the losses if needed

In [None]:
import torch
import torch.nn as nn
import torchaudio
import os

from layers.core import net

device = torch.device('cpu')

# Sample rate used when writing the evaluation audio; matches the "sr" value
# passed to load_audio and the default of save_audio in this notebook.
SR = 8000

# The checkpoint was written with torch.save(model, ...), i.e. the whole
# module is pickled. map_location moves it onto `device` even if it was saved
# from a CUDA model; weights_only=False is required to unpickle a full module
# on recent PyTorch releases. Unpickling executes arbitrary code, so only load
# checkpoints you created yourself.
# NOTE(review): the training cell saves to models/model_epoch_{epoch+1}.pth
# for epoch % 5 == 0 (suffixes 1, 6, 11, ...) -- confirm this file name and
# location exist.
model = torch.load("model_epoch_45.pth", map_location=device, weights_only=False)
criterion = nn.MSELoss()

output_folder = "eval_audio"  # Folder to store audio files
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Evaluation mode
model.eval()

# NOTE(review): the shape comments below assume each batch is (1, L), but
# dLoader_test is built with batch_size=32 and torchaudio.save would then
# treat the batch dimension as channels -- confirm. The (inputs, labels)
# unpack also assumes the dataset yields pairs.
with torch.no_grad():
    for batch_idx, (inputs, labels) in enumerate(dLoader_test):
        inputs = inputs.to(device)  # Shape (1, L)
        labels = labels.to(device)  # Shape (1, L)

        outputs = model(inputs)  # Shape (1, L)
        # Input, prediction and target are joined along dim=1 into one clip.
        concatenated_audio = torch.cat((inputs, outputs,labels), dim=1)  # Shape (1, 3*L)

        concatenated_audio = concatenated_audio.cpu().detach()
        file_name = f"{output_folder}/audio_batch_{batch_idx}.wav"
        torchaudio.save(file_name, concatenated_audio, sample_rate=SR)

        print(f"Saved concatenated audio for batch {batch_idx} to {file_name}")


In [None]:

from encoder.utils import convert_audio
import torchaudio
import torch
from decoder.pretrained import WavTokenizer

# Encode one audio file into discrete codes with a pretrained WavTokenizer.
device=torch.device('cpu')

# Placeholder paths -- replace "xxx" with the real config / checkpoint files.
config_path = "./configs/xxx.yaml"
model_path = "./xxx.ckpt"
# `audio_path` was never defined, so this cell raised NameError on a fresh
# kernel; default to the clip loaded at the top of this notebook.
audio_path = "../dataset/no8/0/audio0.mp3"

wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path)
wavtokenizer = wavtokenizer.to(device)

wav, sr = torchaudio.load(audio_path)
# convert_audio is called with (wav, sr, 24000, 1); presumably this resamples
# to 24 kHz and down-mixes to one channel -- TODO confirm against encoder.utils.
wav = convert_audio(wav, sr, 24000, 1)
bandwidth_id = torch.tensor([0])
wav=wav.to(device)
# encode_infer returns two values; only the second (the codes) is kept.
_,discrete_code= wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id)
print(discrete_code)


In [7]:
import torch
import torch.nn as nn
def print_model_size(model):
    """
    Print the number of trainable parameters in a PyTorch model.

    Only parameters whose ``requires_grad`` flag is set contribute to the
    count; frozen parameters are skipped.

    Args:
        model (nn.Module): The PyTorch model whose size you want to print.
    """
    trainable_count = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_count += param.numel()
    print(f"Total number of parameters: {trainable_count}")

from layers.core2 import UNetWithMHA
from layers.core import net
# Build both architectures from the same sequence / STFT settings and report
# their sizes. `net` additionally receives 'num_blocks'.
shared_config = {
    'seq_len': 380000,
    'n_fft':2048,
    'win_len':2048,
    'hop_length':20,
}

# Each constructor gets its own dict so neither can affect the other's config.
model = UNetWithMHA(config=dict(shared_config))
model2 = net(config={**shared_config, 'num_blocks': 6})

# Print the parameter count of each model, in the same order as before.
for candidate in (model, model2):
    print_model_size(candidate)

Total number of parameters: 95267657
Total number of parameters: 239092376


In [24]:
import torch
from rotary_embedding_torch import RotaryEmbedding

# Rotary positional embedding demo (dim=32): build the embedding once; in a
# transformer the same instance is passed to every attention layer.
rotary_emb = RotaryEmbedding(dim=32)

# Mock queries and keys. The trailing dimensions must be
# (seq_len, feature dimension); any number of leading dimensions
# (batch, heads, ...) is allowed. Here: (batch, heads, seq len, head dim).
qk_shape = (1, 8, 1024, 64)
q = torch.randn(*qk_shape)  # queries
k = torch.randn(*qk_shape)  # keys

# Rotations are applied after the heads are split out, but before the dot
# product and softmax of attention. One element is printed before and after
# to show the effect.
sample_index = (0, 1, 1, 1)
print(q[sample_index])
qq, kk = (rotary_emb.rotate_queries_or_keys(t) for t in (q, k))
print(qq[sample_index])
# Attention would then be computed from the rotated tensors (qq, kk) as usual.
# then do your attention with your queries (q) and keys (k) as usual



tensor(-0.3740)
tensor(-0.5399)


In [22]:
# Element-wise change introduced by the rotary embedding. These were
# previously stored in `f` / `ff`; `f` is also the name of the RevSTFT
# instance used by the inverse-STFT cell earlier in this notebook, so
# assigning to it here broke that cell on re-run. Distinct names avoid the
# clash.
q_rot_delta = (qq-q)
k_rot_delta = (kk - k)

In [30]:
import torch
from rotary_embedding_torch import RotaryEmbedding

# Rotary positional embedding demo (dim=16): compare one sample value and the
# overall value range of the queries before and after rotation.
rotary_emb = RotaryEmbedding(dim=16)

# Mock queries and keys
qk_shape = (1, 8, 1024, 64)
q = torch.randn(*qk_shape)  # queries
k = torch.randn(*qk_shape)  # keys

sample_index = (0, 1, 1, 1)

# Before rotation
print("Initial q sample value:", q[sample_index].item())
print("Initial q range:", (q.min().item(), q.max().item()))

# Apply rotations (queries first, then keys)
qq, kk = (rotary_emb.rotate_queries_or_keys(t) for t in (q, k))

# After rotation
print("Rotated q sample value:", qq[sample_index].item())
print("Rotated q range:", (qq.min().item(), qq.max().item()))


Initial q sample value: 2.1315560340881348
Initial q range: (-4.752716064453125, 4.391259670257568)
Rotated q sample value: 0.9357888698577881
Rotated q range: (-4.752716064453125, 4.988086700439453)
