In [1]:
import torch
from models.cicada_base import CicadaBaseAutoencoder
from models.cicada_custom import CicadaCustomAutoencoder
from spectro_data import SpectrogramDataset, specToAudio
from torch.utils.data import DataLoader, random_split


# --- Evaluation configuration -------------------------------------------

# Locations of the clean-target and noisy-input spectrogram files.
CLEAN_DATA_PATH = "data/processed/28spk/clean_specs.pt"
NOISY_DATA_PATH = "data/processed/28spk/noisy_specs.pt"

# Checkpoint under evaluation. To evaluate the base model instead, point this
# at "ckpts/cicadence_base_epoch20.pth".
MODEL_CKPT = "ckpts/cicadence_custom_epoch5.pth"

# Samples per batch for every DataLoader built in this notebook.
batch_size = 32

# Seed PyTorch's global RNG so repeated runs give consistent results.
torch.manual_seed(42)

<torch._C.Generator at 0x112a7b030>

In [2]:
# Load the paired noisy/clean spectrograms as one dataset.
data = SpectrogramDataset(NOISY_DATA_PATH, CLEAN_DATA_PATH)

# 80% train / 15% validation; the test split takes whatever is left so that
# every sample ends up in exactly one subset.
n_samples = len(data)
train_size = int(0.8 * n_samples)
val_size = int(0.15 * n_samples)
test_size = n_samples - (train_size + val_size)

# NOTE: random_split is given no generator, so the partition is drawn from
# PyTorch's global RNG. It is only reproducible when this cell runs right
# after the seed has been set; re-running it on its own yields a new split.
split_lengths = [train_size, val_size, test_size]
train_set, val_set, test_set = random_split(data, split_lengths)
print(f"Train: {len(train_set)}, Val: {len(val_set)}, Test: {len(test_set)}")


# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_set, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_set, shuffle=False, batch_size=batch_size)

Train: 9257, Val: 1735, Test: 580


In [3]:
# Take the first batch from the (unshuffled) test loader for evaluation.
test_batches = iter(test_loader)
eval_batch = next(test_batches)

In [4]:
# The batch is a (noisy, clean) pair; the original author describes each
# tensor as (batch_size, 1, freq_bins, time_frames).
noisy_spec, clean_spec = eval_batch

# Sanity check: show the shape of a single sample.
first_noisy = noisy_spec[0]
print(first_noisy.shape)

torch.Size([1, 256, 290])


In [5]:
# Pick one example out of the evaluation batch.
index = 31  # position within the batch; edit to listen to a different sample

# Drop the size-1 dimensions and convert both spectrograms to NumPy arrays.
noisy_np, clean_np = (
    spec[index].squeeze().cpu().numpy() for spec in (noisy_spec, clean_spec)
)

In [6]:
# Clean sample (the reference target). As the cell's last expression,
# whatever specToAudio returns is rendered as the cell output.
clean_audio = specToAudio(clean_np)
clean_audio

In [7]:
# Build the architecture that matches MODEL_CKPT. When evaluating the base
# checkpoint, instantiate CicadaBaseAutoencoder() here instead.
model = CicadaCustomAutoencoder()

# The checkpoint is a plain state_dict, so load it with weights_only=True:
# this restricts unpickling to tensors and basic containers and prevents a
# tampered checkpoint file from executing arbitrary code. map_location='cpu'
# lets the weights load on machines without the device they were saved from.
model.load_state_dict(
    torch.load(MODEL_CKPT, map_location='cpu', weights_only=True)
)

# Switch to evaluation mode. eval() returns the module itself, so the cell
# output is the model's layer summary.
model.eval()

CicadaCustomAutoencoder(
  (encoder): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU()
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(2, 2), stride=(2, 2))
    (1): ReLU()
    (2): Conv2d(32, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(16, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): Sigmoid()
  )
)

In [10]:
# Run the selected noisy spectrogram through the model. Inference needs no
# gradients, so disable autograd for the forward pass instead of recording a
# graph and detaching the result afterwards; the output tensor is already
# graph-free and can be converted to NumPy directly.
with torch.no_grad():
    specProcessed = model(noisy_spec[index])

In [8]:
# Noisy sample, i.e. the spectrogram that is fed to the model. As the cell's
# last expression, whatever specToAudio returns is rendered as the output.
noisy_audio = specToAudio(noisy_np)
noisy_audio

In [11]:
# Processed sample, i.e. the model's output for the selected input.
# Drop the size-1 dimensions and convert to NumPy, as for the other samples.
processed_np = specProcessed.squeeze().cpu().numpy()
specToAudio(processed_np)