In [1]:
import os
import yaml
import torch
from torch import nn
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import torchaudio
from transformers import AutoProcessor, ASTModel
from torchaudio.transforms import MelSpectrogram, MFCC

from acoustic_anomaly_detection.dataset import AudioDataset

# Load the experiment configuration. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open("params.yaml") as params_file:
    params = yaml.safe_load(params_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the processor published with the pretrained AST checkpoint.
# NOTE(review): the name `ast` is also a standard-library module name; a later
# `import ast` in this kernel would rebind it.
ast = AutoProcessor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [2]:
from torch.utils.data import Dataset, DataLoader
import torchaudio
from acoustic_anomaly_detection.utils import get_attributes

class ASTProcessor(torch.nn.Module):
    """Module wrapper around the pretrained AST processor.

    Calling the module runs the processor on one waveform and returns the
    ``input_values`` entry of the processor output.
    """

    def __init__(self):
        super().__init__()
        checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
        self.ast = AutoProcessor.from_pretrained(checkpoint)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the processor on ``x``.

        Args:
            x: Input waveform; dimension 0 is squeezed away when it has size 1.

        Returns:
            The ``"input_values"`` tensor produced by the processor.
        """
        waveform = x.squeeze(0)
        # The sampling rate comes from the notebook-level configuration.
        sampling_rate = params["transform"]["params"]["sr"]
        processed = self.ast(
            waveform,
            sampling_rate=sampling_rate,
            return_tensors="pt",
        )
        return processed["input_values"]

class AudioDataset(Dataset):
    """Dataset that loads audio files and converts each one with ``ASTProcessor``.

    All settings are read from the notebook-level ``params`` mapping.
    NOTE(review): this definition rebinds the ``AudioDataset`` name imported
    from ``acoustic_anomaly_detection.dataset`` in the first cell.
    """

    def __init__(
        self,
        file_list: list,
    ) -> None:
        """Store the file paths and cache the configuration values.

        Args:
            file_list: Paths of the audio files, one per dataset index.
        """
        self.file_list = file_list
        self.seed = params["train"]["seed"]
        self.data_sources = params["data"]["data_sources"]

        transform_cfg = params["transform"]
        self.transform_type = transform_cfg["type"]
        self.segment = transform_cfg["segment"]

        signal_cfg = transform_cfg["params"]
        self.sr = signal_cfg["sr"]
        self.duration = signal_cfg["duration"]
        # Target length along dimension 1 of every waveform (rate * duration).
        self.length = self.sr * self.duration
        self.transform_func = ASTProcessor()

    def __len__(self) -> int:
        """Return the number of files in the dataset."""
        return len(self.file_list)

    def cut(self, signal: torch.Tensor) -> torch.Tensor:
        """Truncate or right-pad ``signal`` so dimension 1 has ``self.length`` entries."""
        n_samples = signal.shape[1]
        if n_samples > self.length:
            return signal[:, : self.length]
        if n_samples < self.length:
            missing = self.length - n_samples
            return torch.nn.functional.pad(signal, (0, missing))
        return signal

    def resample(self, signal: torch.Tensor, sr: int) -> torch.Tensor:
        """Resample ``signal`` from ``sr`` to ``self.sr``; a matching rate is a no-op."""
        if sr == self.sr:
            return signal
        return torchaudio.transforms.Resample(sr, self.sr)(signal)

    def mix_down(self, signal: torch.Tensor) -> torch.Tensor:
        """Average over dimension 0 when it holds more than one entry."""
        if signal.shape[0] <= 1:
            return signal
        return torch.mean(signal, dim=0, keepdim=True)

    def transform(self, signal: torch.Tensor, sr: int) -> torch.Tensor:
        """Apply resample -> mix_down -> cut -> ``self.transform_func`` in that order."""
        resampled = self.resample(signal, sr)
        mono = self.mix_down(resampled)
        fixed_length = self.cut(mono)
        return self.transform_func(fixed_length)

    def __getitem__(self, idx) -> tuple[torch.Tensor, dict[str, str]]:
        """Load file ``idx`` and return ``(transformed signal, attributes)``.

        The attributes are whatever ``get_attributes`` returns for the file's
        base name.
        """
        path = self.file_list[idx]
        waveform, source_sr = torchaudio.load(path)
        attributes = get_attributes(os.path.basename(path))
        return self.transform(waveform, source_sr), attributes

In [3]:
# Load the pretrained AST checkpoint with its default configuration.
# A variant passing max_length=998 and ignore_mismatched_sizes=True is left disabled.
model = ASTModel.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

In [2]:
import lightning.pytorch as pl
from torcheval.metrics.functional import binary_auroc, binary_auprc

from acoustic_anomaly_detection.utils import slide_window, reverse_slide_window

class Model(pl.LightningModule):
    """Base Lightning module for reconstruction-based models.

    Subclasses must override ``forward``. Training and validation minimise
    the mean squared error between the input and its reconstruction; testing
    records one reconstruction error and one binary label per batch and
    reports AUROC / AUPRC at the end of the epoch.
    """

    def __init__(self, model_name: str, input_size: int):
        """
        Args:
            model_name: Prefix used for every logged metric name.
            input_size: Size from which ``self.input_size`` is derived using
                the configured ``window_size`` and ``stride``.
        """
        super().__init__()
        self.model_name = model_name
        window_size = params["transform"]["params"]["window_size"]
        stride = params["transform"]["params"]["stride"]
        # (number of sliding windows) * (window length)
        self.input_size = ((input_size - window_size) // stride + 1) * window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Reconstruct ``x``; must be implemented by subclasses."""
        raise NotImplementedError

    def _reconstruction_step(
        self, batch: tuple[torch.Tensor, dict[str, str]], stage: str
    ) -> torch.Tensor:
        """Compute and log the reconstruction MSE for one batch.

        Shared by ``training_step`` and ``validation_step`` so the two cannot
        drift apart. ``stage`` becomes part of the logged metric name.
        """
        x, _ = batch
        # Merge the first two dimensions of the batch tensor before the forward pass.
        x = nn.Flatten(0, 1)(x)
        x_hat = self(x)
        loss = nn.functional.mse_loss(x_hat, x)
        self.log(
            f"{self.model_name}_{stage}_loss",
            loss,
            on_step=True,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    @staticmethod
    def _is_anomaly(label) -> int:
        """Map a label (or a collated sequence of labels) to 0/1.

        The default DataLoader collate function turns a per-sample string into
        a list of strings, and a list never compares equal to ``"anomaly"``.
        A list/tuple therefore counts as anomalous when any entry is
        ``"anomaly"``; any other value is compared directly, as before.
        """
        if isinstance(label, (list, tuple)):
            return int(any(item == "anomaly" for item in label))
        return int(label == "anomaly")

    def training_step(
        self, batch: tuple[torch.Tensor, dict[str, str]], batch_idx: int
    ) -> torch.Tensor:
        """Return the reconstruction loss, logged as ``<model_name>_train_loss``."""
        return self._reconstruction_step(batch, "train")

    def validation_step(
        self, batch: tuple[torch.Tensor, dict[str, str]], batch_idx: int
    ) -> torch.Tensor:
        """Return the reconstruction loss, logged as ``<model_name>_val_loss``."""
        return self._reconstruction_step(batch, "val")

    def test_step(
        self, batch: tuple[torch.Tensor, dict[str, str]], batch_idx: int
    ) -> None:
        """Record one error score and one binary label for the batch."""
        x, attributes = batch
        x = nn.Flatten(0, 1)(x)
        x_hat = self(x)
        # A single score per batch: mean squared reconstruction error.
        error_score = torch.mean(torch.square(x_hat - x))
        self.error_score.append(error_score.item())
        self.y.append(self._is_anomaly(attributes["label"]))

    def on_test_epoch_start(self) -> None:
        """Reset the per-epoch score and label accumulators."""
        self.error_score = []
        self.y = []

    def on_test_epoch_end(self) -> None:
        """Log AUROC and AUPRC computed from the accumulated scores and labels."""
        error_score = torch.tensor(self.error_score)
        y = torch.tensor(self.y)
        auroc = binary_auroc(error_score, y)
        auprc = binary_auprc(error_score, y)
        self.log(f"{self.model_name}_auroc_epoch", auroc, prog_bar=True, logger=True)
        self.log(f"{self.model_name}_auprc_epoch", auprc, prog_bar=True, logger=True)

    def configure_optimizers(self) -> torch.optim.Optimizer:
        """Return an Adam optimizer using the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=params["train"]["lr"])


class SimpleAE(Model):
    """Small autoencoder: two linear layers in the encoder and two in the decoder.

    Layer widths are read from ``params["model"]["layers"]``.
    """

    def __init__(self, model_name: str, input_size: int) -> None:
        """
        Args:
            model_name: Prefix used for logged metric names.
            input_size: Width of the first encoder input and the last decoder output.
        """
        # Model.__init__ requires input_size; calling it with model_name alone
        # raised TypeError before any layer was built.
        super().__init__(model_name, input_size)
        self.encoder_layers = params["model"]["layers"]["encoder"]
        self.decoder_layers = params["model"]["layers"]["decoder"]
        self.save_hyperparameters()
        # NOTE(review): the layers use the raw `input_size` argument, whereas
        # BaselineAE uses the derived `self.input_size` — confirm which is intended.
        self.encoder = nn.Sequential(
            nn.Linear(input_size, self.encoder_layers[0]),
            nn.ReLU(),
            nn.Linear(self.encoder_layers[0], self.encoder_layers[1]),
        )
        self.decoder = nn.Sequential(
            nn.Linear(self.decoder_layers[0], self.decoder_layers[1]),
            nn.ReLU(),
            nn.Linear(self.decoder_layers[1], input_size),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Window ``x``, encode and decode it, then restore the shape of ``x``."""
        z = slide_window(x)
        # Merge the last two dimensions so each window is one flat vector.
        z = nn.Flatten(-2, -1)(z)
        z = self.encoder(z)
        z = self.decoder(z)
        z = reverse_slide_window(z)
        return z.view(x.shape)


class BaselineAE(Model):
    """
    Baseline AE model
    Source: https://github.com/nttcslab/dcase2023_task2_baseline_ae/blob/main/networks/dcase2023t2_ae/network.py

    The encoder is five Linear -> BatchNorm1d -> ReLU stages; the decoder is
    four such stages followed by a final Linear back to ``self.input_size``.
    """

    def __init__(self, model_name: str, input_size: int) -> None:
        super().__init__(model_name, input_size)
        self.encoder_layers = params["model"]["layers"]["encoder"]
        self.decoder_layers = params["model"]["layers"]["decoder"]
        self.save_hyperparameters()

        def dense_stage(n_in: int, n_out: int) -> list:
            """One Linear -> BatchNorm1d -> ReLU stage, in that order."""
            return [
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out, momentum=0.01, eps=1e-03),
                nn.ReLU(),
            ]

        # Exactly the first five configured widths are used on each side.
        encoder_widths = [self.input_size] + [self.encoder_layers[i] for i in range(5)]
        encoder_modules = []
        for n_in, n_out in zip(encoder_widths[:-1], encoder_widths[1:]):
            encoder_modules.extend(dense_stage(n_in, n_out))
        self.encoder = nn.Sequential(*encoder_modules)

        decoder_widths = [self.decoder_layers[i] for i in range(5)]
        decoder_modules = []
        for n_in, n_out in zip(decoder_widths[:-1], decoder_widths[1:]):
            decoder_modules.extend(dense_stage(n_in, n_out))
        # The output layer has no normalisation or activation.
        decoder_modules.append(nn.Linear(decoder_widths[-1], self.input_size))
        self.decoder = nn.Sequential(*decoder_modules)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Window ``x``, run it through encoder and decoder, and reshape to ``x.shape``."""
        windows = slide_window(x)
        flat_windows = windows.flatten(-2, -1)
        decoded = self.decoder(self.encoder(flat_windows))
        restored = reverse_slide_window(decoded)
        return restored.view(x.shape)

In [8]:
# Random tensor of shape (309, 16, 128); presumably a placeholder for shape experiments.
tensor = torch.rand(309, 16, 128)

In [9]:
# Display the dimensions of `tensor`.
tensor.shape

torch.Size([309, 16, 128])

In [3]:
# Directory holding the training clips for the "bearing" machine type.
audio_dir = os.path.join("data", "raw", "dcase2023t2", "dev", "bearing", "train")

# os.listdir returns entries in arbitrary order; sorting makes dataset indices
# (and therefore dataset[0] below) reproducible across runs and machines.
file_list = [
    os.path.join(audio_dir, file)
    for file in sorted(os.listdir(audio_dir))
]

dataset = AudioDataset(
    file_list=file_list,
)

train_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=8,
    shuffle=True,
    drop_last=True,
)

# Number of elements in one transformed sample, excluding its first dimension.
input_size = dataset[0][0].shape[1:].numel()

In [5]:
# Build the baseline autoencoder; "baseline" becomes the prefix of its logged metric names.
# NOTE(review): this rebinds `model`, which an earlier cell binds to the pretrained ASTModel.
model = BaselineAE("baseline", input_size)

In [7]:
# Fetch only the first batch from the loader and print the shape of its tensor.
for batch in train_loader:
    x, attributes = batch
    # Merging the first two dimensions is left disabled here:
    #x = nn.Flatten(0, 1)(x)
    print(x.shape)
    break

torch.Size([2, 1, 128, 313])


In [None]:
# Display the batch tensor bound by the batch-inspection loop.
x

In [6]:
# Split the batch into sliding windows and display the resulting shape.
z = slide_window(x)
z.shape

torch.Size([309, 2, 128, 5])

In [8]:
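# Scratch (commented out, never executed): the steps that follow slide_window
# in the autoencoder forward pass.
# z = nn.Flatten(-2, -1)(z)
# z = self.encoder(z)
# z = self.decoder(z)
# z = reverse_slide_window(z)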

In [10]:
# Display the shape of the current batch tensor.
# NOTE(review): the saved output differs from the shape printed by the batch loop,
# so the cells appear to have been run against different kernel state — confirm on a fresh run.
x.shape

torch.Size([2, 1024, 128])

In [9]:
# Display the shape of the windowed tensor `z`.
z.shape

torch.Size([124, 2, 1024, 5])

In [17]:
# Collapse the last two dimensions into one, then the first two into one.
# Not idempotent: re-running this cell flattens `z` again.
z = z.flatten(start_dim=-2, end_dim=-1)
z = z.flatten(start_dim=0, end_dim=1)
z.shape

torch.Size([618, 640])

In [None]:
# Prototype of `reverse_slide_window(decoded_tensor) -> torch.Tensor`, kept as a
# flat script so the intermediate shapes can be inspected.
#
# Expects `z` to be the 2-D tensor produced by flattening a
# [windows, batch, feature, window_size] tensor, as the preceding cells do.

decoded_tensor = z

# Infer shapes and parameters
total_windows, flattened_feature_dim = decoded_tensor.shape
window_size = params["transform"]["params"]["window_size"]
feature_dim = flattened_feature_dim // window_size
center_idx = window_size // 2  # 2 for a window size of 5
# NOTE(review): must equal the batch size the loader actually used.
batch_size = params["train"]["batch_size"]
windows = total_windows // batch_size

# The rows were flattened window-major, so restore
# [windows, batch_size, feature_dim, window_size] (not batch-major).
reshaped_tensor = decoded_tensor.view(windows, batch_size, feature_dim, window_size)

# Centre value of every window, rearranged to [batch_size, feature_dim, windows]
center_values = reshaped_tensor[:, :, :, center_idx].permute(1, 2, 0)

# Values left of the centre from the first window and right of the centre from
# the last window, both [batch_size, feature_dim, k]
left_values = reshaped_tensor[0, :, :, :center_idx]
right_values = reshaped_tensor[-1, :, :, center_idx + 1 :]

# All three pieces share [batch_size, feature_dim, *], so join them on the last
# axis. Joining on dim=1 failed because the pieces had mismatched layouts.
reconstructed = torch.cat([left_values, center_values, right_values], dim=2)

# return reconstructed


In [8]:
import torch

# Sample 3D tensor: 2 batches built as (2, 128, 313) and transposed so that the
# sequence axis comes second, giving shape (2, 313, 128).
tensor = torch.arange(80128).view(2, 128, 313).transpose(1, 2)
print("Original Tensor:")
print(f"\nShape: {tensor.shape}")

def slide_window_3d(signal: torch.Tensor, window_size=5, stride=1) -> torch.Tensor:
    """Split a 3-D tensor into overlapping windows along dimension 1.

    Args:
        signal: Tensor of shape (batch, length, features).
        window_size: Number of consecutive positions in each window.
        stride: Step between the starts of consecutive windows.

    Returns:
        Tensor of shape (batch, num_windows, window_size, features), where
        ``num_windows = (length - window_size) // stride + 1``.
    """
    _, length, _ = signal.shape
    num_windows = (length - window_size) // stride + 1
    window_starts = (index * stride for index in range(num_windows))
    return torch.stack(
        [signal[:, start : start + window_size, :] for start in window_starts],
        dim=1,
    )

def reverse_slide_window_3d(windowed_tensor: torch.Tensor) -> torch.Tensor:
    """Rebuild a sequence from its sliding windows.

    The result is the positions before the centre of the first window, the
    centre position of every window, and the positions after the centre of
    the last window, concatenated along dimension 1.

    Args:
        windowed_tensor: Tensor of shape (batch, num_windows, window_size, features).

    Returns:
        Tensor of shape (batch, num_windows + window_size - 1, features).
    """
    # Unpacking also enforces that the input is 4-D.
    _, _, window_size, _ = windowed_tensor.shape
    mid = window_size // 2  # e.g. 2 for a window size of 5

    first_window = windowed_tensor[:, 0]
    last_window = windowed_tensor[:, -1]

    pieces = (
        first_window[:, :mid, :],
        windowed_tensor[:, :, mid, :],
        last_window[:, mid + 1 :, :],
    )
    return torch.cat(pieces, dim=1)

# Apply sliding window
windowed_tensor = slide_window_3d(tensor)
print("\nWindowed Tensor:")
print(f"\nShape: {windowed_tensor.shape}")

# Apply reverse sliding window
reconstructed_tensor = reverse_slide_window_3d(windowed_tensor)
print("\nReconstructed Tensor:")
print(f"\nShape: {reconstructed_tensor.shape}")

# Matching shapes alone do not prove the round trip; compare the values too.
assert torch.equal(reconstructed_tensor, tensor), "round trip altered values"


Original Tensor:

Shape: torch.Size([2, 313, 128])

Windowed Tensor:

Shape: torch.Size([2, 309, 5, 128])

Reconstructed Tensor:

Shape: torch.Size([2, 313, 128])
