In [None]:
from preprocess import *
# Load every MP3 found under ../dataset. `load_mp3_files` comes from the
# star-import of `preprocess`; as used here it returns an iterable of tensors
# together with a sampling rate.
tensor,sr = load_mp3_files("../dataset")

# Report each clip by its position in the collection. The previous f-string
# interpolated the tensor itself into the label, which printed the full
# contents of every clip instead of an index.
for clip_idx, clip in enumerate(tensor):
    print(f"tensor[{clip_idx}].shape: {clip.shape}")
print(f"Sampling rate: {sr}")


In [None]:
# Join all loaded clips end to end along their last axis so they form one
# long tensor. (No x/y batching happens here; the chunking cell does that.)
tensor_stack = torch.cat(tensor, -1)
print(f"tensor_stack.shape: {tensor_stack.shape}")

In [None]:
# Length, in samples, of one full chunk. Each chunk is cut in half below: the
# first half is the model input and the second half is the prediction target.
# NOTE(review): 512*600 = 307,200 samples, i.e. 153,600 per half. At the 8 kHz
# rate mentioned in the original note that is about 19.2 s per half, not 32 s;
# confirm against the `sr` returned by the loader.
ck_len = 512*600

# `create_overlapping_chunks_tensor` is provided by the `preprocess` star-import.
chunks = create_overlapping_chunks_tensor(tensor_stack, chunk_len=ck_len)
print(chunks.shape)  # the shape recorded in the old comment was stale; trust this print

# Split every chunk at its midpoint along the second axis.
split_at = ck_len//2
x = chunks[:, :split_at]
y = chunks[:, split_at:]
print(f"x: {x.shape}")
print(f"y: {y.shape}")




In [None]:

import warnings

from torch.utils.data import TensorDataset,DataLoader

# Number of shuffled chunks kept for training; every chunk after this index
# goes to the test split. Previously repeated as a bare literal four times.
TRAIN_SIZE = 3450

# Shuffle inputs and targets with the same permutation so pairs stay aligned.
indices = torch.randperm(x.size(0))
shuffled_x,shuffled_y = x[indices],y[indices]

# An empty test split would only surface much later, as a ZeroDivisionError
# when the training cell averages the validation loss. Flag it here instead.
if x.size(0) <= TRAIN_SIZE:
    warnings.warn(
        f"Only {x.size(0)} chunks available but TRAIN_SIZE={TRAIN_SIZE}: "
        "the test split will be empty."
    )

dSet = {
    'x': shuffled_x[:TRAIN_SIZE,:],
    'y': shuffled_y[:TRAIN_SIZE,:],
    'x_test': shuffled_x[TRAIN_SIZE:,:],
    'y_test': shuffled_y[TRAIN_SIZE:,:],
}

trainDataset = TensorDataset(dSet['x'],dSet['y'])
testDataset = TensorDataset(dSet['x_test'],dSet['y_test'])

# One sample per step; only the training loader reshuffles every epoch.
dLoader = DataLoader(trainDataset,batch_size=1,shuffle=True)
dLoader_test = DataLoader(testDataset,batch_size=1,shuffle=False)

In [None]:

from layers.main_model import net
LR=1e-4
# Use the first CUDA device when one is present; otherwise fall back to the
# CPU so this cell does not fail on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# 512*300 equals ck_len//2, the length of each x / y half built in the
# chunking cell. NOTE(review): presumably intentional; keep the two in sync.
model = net(sequence_length=512*300,num_blocks=6,activation='silu').to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=LR,)
num_epochs = 500

# Per-epoch loss histories filled in by the training cell.
train_losses = []
test_losses = []



In [None]:

import os

import wandb
from datetime import datetime
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Checkpoints are written under models/ below; create the directory up front
# so the first torch.save cannot fail on a missing path.
os.makedirs('models', exist_ok=True)

wandb.init(project="audio-gen", config={
    "epochs": num_epochs,
    # Samples per optimizer step. The previous value, len(dLoader), is the
    # number of batches per epoch and is now logged under its own key.
    "batch_size": dLoader.batch_size,
    "steps_per_epoch": len(dLoader),
    "learning_rate": LR,
    "device": "cuda RTX 3080 Ti",
    "ck_len": ck_len,
    "num_blocks": 6,
    },
    name=f"run_{current_time}"
)

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    for i,(inputs, labels) in enumerate(dLoader):

        # Report (but do not skip) batches that already contain NaNs
        if torch.any(torch.isnan(inputs)) or torch.any(torch.isnan(labels)):
            print("Input or labels contain NaN values.")

        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization (gradient norm capped at 10)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=10.0)
        optimizer.step()

        step_loss = loss.item()
        wandb.log({'single_loss': step_loss})
        # Accumulate loss
        running_loss += step_loss

    # Average loss for the epoch
    epoch_loss = running_loss / len(dLoader)
    train_losses.append(epoch_loss)
    wandb.log({"epoch_train_loss": epoch_loss})
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}')

    # Every fifth epoch (0, 5, 10, ...): checkpoint and evaluate.
    if epoch % 5 == 0:
        # Files are numbered epoch+1, i.e. 1, 6, 11, ...
        torch.save(model.state_dict(), f'models/modelDict_epoch_{epoch+1}.pth')
        torch.save(model, f'models/model_epoch_{epoch+1}.pth')

        # The gradients still held by the parameters come from the last
        # training step and do not change during evaluation, so scan them once
        # here rather than once per test batch inside the no_grad loop.
        for name, param in model.named_parameters():
            if param.grad is not None and torch.any(torch.isnan(param.grad)):
                print(f"Gradient for {name} contains NaN values.")

        model.eval()
        running_loss_test = 0.0

        with torch.no_grad():
            for inputs, labels in dLoader_test:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                running_loss_test += loss.item()

        # Average validation loss for the epoch
        epoch_test_loss = running_loss_test / len(dLoader_test)
        test_losses.append(epoch_test_loss)
        wandb.log({"eval_loss":epoch_test_loss})

        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {epoch_test_loss:.4f}')
    
# At the end, you can plot the losses if needed

In [None]:
import torch
import torch.nn as nn
import torchaudio
import os

from layers.main_model import net

device = torch.device('cpu')

# map_location moves a checkpoint that was saved from a CUDA model onto
# `device`; without it the weights stay on (or require) the GPU while the
# inputs below are on the CPU.
# NOTE(review): the training cell writes 'models/model_epoch_{epoch+1}.pth' for
# epoch % 5 == 0 (1, 6, ..., 46); confirm this path points at a real file.
# NOTE(review): torch.load unpickles a whole module here, so only load trusted
# checkpoints. Recent PyTorch releases may also need weights_only=False.
model = torch.load("model_epoch_45.pth", map_location=device)
criterion = nn.MSELoss()

# Folder that receives one WAV file per test batch
output_folder = "eval_audio"
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Evaluation mode
model.eval()

with torch.no_grad():
    for batch_idx, (inputs, labels) in enumerate(dLoader_test):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Lay the three signals out back to back along dim 1:
        # input, then model prediction, then ground-truth continuation.
        concatenated_audio = torch.cat((inputs, outputs,labels), dim=1)

        # Convert to CPU and detach (if necessary)
        concatenated_audio = concatenated_audio.cpu().detach()

        # Save as audio. `sr` is the sampling rate returned by load_mp3_files
        # in the first cell; the name `SR` used previously is never defined in
        # this notebook.
        file_name = f"{output_folder}/audio_batch_{batch_idx}.wav"
        torchaudio.save(file_name, concatenated_audio, sample_rate=sr)

        print(f"Saved concatenated audio for batch {batch_idx} to {file_name}")


In [None]:
import torch
from layers.main_model import net
def print_model_info(model):
    """Print the parameter count and in-memory parameter size of ``model``.

    The size is computed from each parameter's own element size, so models
    held in float16/bfloat16/float64 are reported correctly instead of being
    assumed to be float32 (4 bytes per value).

    Args:
        model: any object whose ``parameters()`` yields tensors exposing
            ``numel()`` and ``element_size()`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The two summary lines are written to stdout.
    """
    total_params = 0
    param_size_bytes = 0
    for p in model.parameters():
        count = p.numel()
        total_params += count
        # element_size() is the number of bytes per value for this parameter's dtype
        param_size_bytes += count * p.element_size()

    param_size_gb = param_size_bytes / (1024 ** 3)  # bytes -> GiB

    print(f"Total number of parameters: {total_params:,}")
    print(f"Model size: {param_size_gb:.2f} GB")

# Build a larger configuration purely to report its parameter footprint.
# NOTE: this rebinds the notebook-level name `model`.
model = net(
    sequence_length=8000*30,
    num_blocks=12,
    activation='silu',
    lstm_option=True,
)
print_model_info(model)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Define the diffusion model
class DiffusionModel(nn.Module):
    """Toy GRU-based noise predictor for the diffusion demo in this cell.

    Two kinds of input are accepted:

    * integer tensors ``(batch, length)`` with values in ``[0, sequence_length)``
      are looked up in ``embedding`` and produce ``(batch, length, sequence_length)``
      outputs, as before;
    * floating-point tensors ``(batch, length)``, which is what
      ``forward_diffusion`` returns and ``reverse_diffusion`` feeds back in,
      are projected sample-by-sample and produce ``(batch, length)`` outputs
      that can be subtracted from the input. Previously such inputs raised
      inside ``nn.Embedding``.

    The timestep ``t`` is now used: a learned per-timestep vector is added to
    every position before the GRU. Previously ``t`` was ignored, so the
    network could not distinguish noise levels.

    Args:
        sequence_length: size of the index vocabulary for integer inputs and
            width of the integer-path output.
        model_dim: hidden width of the embeddings and the GRU.
        timesteps: largest timestep index that will be passed to ``forward``.
    """

    def __init__(self, sequence_length, model_dim, timesteps):
        super().__init__()
        self.sequence_length = sequence_length
        self.model_dim = model_dim
        self.timesteps = timesteps

        # Input layers: index lookup for integer inputs, scalar projection for
        # continuous inputs.
        self.embedding = nn.Embedding(sequence_length, model_dim)
        self.value_proj = nn.Linear(1, model_dim)

        # One learned vector per timestep index 0..timesteps (inclusive).
        self.time_embedding = nn.Embedding(timesteps + 1, model_dim)

        # Sequence model shared by both input kinds
        self.seq_model = nn.GRU(model_dim, model_dim, batch_first=True)

        # Output heads: the original per-position head of width sequence_length,
        # and a scalar head for continuous inputs.
        self.predict_noise = nn.Linear(model_dim, sequence_length)
        self.predict_value_noise = nn.Linear(model_dim, 1)

    def forward(self, x, t):
        """Predict the noise in ``x`` at timestep ``t``.

        Args:
            x: ``(batch, length)`` tensor, integer indices or floating-point values.
            t: timestep as a Python int, 0-dim tensor, or ``(batch,)`` tensor
                with values in ``[0, timesteps]``.

        Returns:
            ``(batch, length, sequence_length)`` for integer ``x``;
            ``(batch, length)`` for floating-point ``x``.
        """
        continuous = torch.is_floating_point(x)
        if continuous:
            # (batch, length) -> (batch, length, 1) -> (batch, length, model_dim)
            x_emb = self.value_proj(x.unsqueeze(-1))
        else:
            x_emb = self.embedding(x)

        # Broadcast a scalar timestep over the batch, then add its embedding
        # to every position of the sequence.
        t_idx = torch.as_tensor(t, dtype=torch.long, device=x_emb.device)
        if t_idx.dim() == 0:
            t_idx = t_idx.expand(x_emb.size(0))
        x_emb = x_emb + self.time_embedding(t_idx).unsqueeze(1)

        seq_output, _ = self.seq_model(x_emb)

        if continuous:
            return self.predict_value_noise(seq_output).squeeze(-1)
        return self.predict_noise(seq_output)

# Diffusion process
def forward_diffusion(sequence, timesteps, beta_schedule):
    """Sample a noised version of ``sequence`` at a given diffusion step.

    Uses the closed-form DDPM forward process
    ``x_t = sqrt(abar_t) * x_0 + sqrt(1 - abar_t) * eps`` where ``abar_t`` is
    the cumulative product of ``1 - beta``. The previous version mixed with
    ``abar_t`` and ``1 - abar_t`` directly (no square roots), which does not
    match that formula, and it raised on integer input because
    ``torch.randn_like`` is not defined for integer dtypes.

    Args:
        sequence: ``(batch_size, sequence_length)`` tensor. Integer tensors are
            converted to the default floating dtype first.
        timesteps: index into the schedule; a Python int or an integer tensor
            of shape ``(batch_size,)`` for per-sample steps. Must be smaller
            than ``len(beta_schedule)``.
        beta_schedule: 1-D tensor of per-step noise variances.

    Returns:
        ``(noised_sequence, noise)``: the noised floating-point sequence and
        the Gaussian noise that was mixed in.
    """
    if not torch.is_floating_point(sequence):
        sequence = sequence.to(torch.get_default_dtype())

    noise = torch.randn_like(sequence)
    alphas = 1 - beta_schedule
    alphas_cumprod = torch.cumprod(alphas, dim=0).to(device=sequence.device, dtype=sequence.dtype)

    alpha_bar = alphas_cumprod[timesteps]
    if alpha_bar.dim() > 0:
        # Per-sample timesteps: reshape to (batch, 1, ...) so it broadcasts
        # across the remaining axes of `sequence`.
        alpha_bar = alpha_bar.reshape(-1, *([1] * (sequence.dim() - 1)))

    noised_sequence = alpha_bar.sqrt() * sequence + (1 - alpha_bar).sqrt() * noise
    return noised_sequence, noise

# Reverse process (denoising)
def reverse_diffusion(model, noised_sequence, timesteps):
    """Iteratively denoise ``noised_sequence`` by subtracting predicted noise.

    For ``t = timesteps, timesteps-1, ..., 1`` the model is called as
    ``model(current, t)`` and its output is subtracted from the current
    sequence. The loop runs under ``torch.no_grad()``: this is inference, and
    without it every step would be recorded in one growing autograd graph.

    Args:
        model: callable ``(sequence, t) -> noise_pred``; the prediction must be
            subtractable from the sequence.
        noised_sequence: ``(batch_size, sequence_length)`` starting point.
        timesteps: number of denoising steps; ``0`` returns the input unchanged.

    Returns:
        The sequence after all steps, detached from the autograd graph.
    """
    with torch.no_grad():
        for t in range(timesteps, 0, -1):
            noise_pred = model(noised_sequence, t)
            noised_sequence = noised_sequence - noise_pred
    return noised_sequence

# Beta schedule (linear for simplicity)
def get_beta_schedule(timesteps, start=0.0001, end=0.02):
    """Return ``timesteps`` evenly spaced beta values from ``start`` to ``end`` (inclusive)."""
    betas = torch.linspace(start, end, steps=timesteps)
    return betas

# Demo configuration
sequence_length, model_dim = 10, 32
timesteps = 100
batch_size = 4

# Untrained network used only to exercise the pipeline.
# NOTE: this rebinds the notebook-level name `model`.
model = DiffusionModel(sequence_length, model_dim, timesteps)

# Random integer sequences with values in [0, sequence_length)
sequence = torch.randint(0, sequence_length, (batch_size, sequence_length))

# Forward process: noise the sequences at step 50 of a 100-step linear schedule
beta_schedule = get_beta_schedule(timesteps)
noised_sequence, noise = forward_diffusion(sequence, 50, beta_schedule)

# Reverse process: 50 denoising steps starting from the noised sequences
reconstructed_sequence = reverse_diffusion(model, noised_sequence, 50)

print("Original Sequence:", sequence)
print("Reconstructed Sequence:", reconstructed_sequence)


In [12]:
from rotary_embedding_torch import RotaryEmbedding

import torch

# Rotary embedding over a 512-wide feature dimension.
rotary = RotaryEmbedding(
    dim=512,
    seq_before_head_dim=False,
    freqs_for='lang',
    interpolate_factor=1,
    cache_if_possible=True,
)

# Random probe tensor of shape (1, 8, 3, 512).
# NOTE(review): presumably (batch, heads, seq, dim) given seq_before_head_dim=False; confirm.
f = torch.randn((1, 8, 3, 512))

# Apply the rotation; the recorded output shows the shape is unchanged.
ff = rotary.rotate_queries_or_keys(f)

print(ff.shape)

torch.Size([1, 8, 3, 512])


In [None]:

from encoder.utils import convert_audio
import torchaudio
import torch
from decoder.pretrained import WavTokenizer

# NOTE: this cell rebinds the notebook-level names `device` and `sr`.
device=torch.device('cpu')

# Placeholder paths: replace the "xxx" parts with real files before running.
config_path = "./configs/xxx.yaml"
model_path = "./xxx.ckpt"
# `audio_path` was used below without ever being defined (NameError); it is
# declared here as a placeholder in the same style as the two paths above.
audio_path = "./xxx.wav"

wavtokenizer = WavTokenizer.from_pretrained0802(config_path, model_path)
wavtokenizer = wavtokenizer.to(device)

# Load the clip and convert it with target arguments 24000 and 1
# (presumably sample rate and channel count; confirm against convert_audio).
wav, sr = torchaudio.load(audio_path)
wav = convert_audio(wav, sr, 24000, 1)
bandwidth_id = torch.tensor([0])
wav=wav.to(device)

# encode_infer returns a pair; only the second element (the discrete codes) is kept.
_,discrete_code= wavtokenizer.encode_infer(wav, bandwidth_id=bandwidth_id)
print(discrete_code)
