In [1]:
import librosa
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from transformers import AutoFeatureExtractor, ASTForAudioClassification, ASTModel

extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch 

class CustomModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.fc1 = torch.nn.Linear(in_features=527, out_features=128)
        self.fc2 = torch.nn.Linear(in_features=128, out_features=1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x):
        x = self.model(**x, return_dict=False)
        x = self.fc1(x[0])
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [3]:
model = CustomModel(model)

In [4]:
# Desired sample rate 16000 Hz
sample_rate = 16000

# Frame length
frame_length = 512

audio_file = "audio_aaico_challenge.wav"

# Read the audio file and resample it to the desired sample rate
audio_data, current_sample_rate = librosa.load(
    audio_file,
    sr=sample_rate,
)
audio_data_int16 = (audio_data * 32767).astype(np.int16)

number_of_frames = len(audio_data_int16) // frame_length

audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate

command_samples = [
    [142000, 160000],
    [340000, 360000],
    [620000, 635000]
]

ground_truth = np.ones(len(audio_data_int16))
for i in range(len(audio_data_int16)):
    if any([i >= e[0] and i <= e[1] for e in command_samples]):
        ground_truth[i] = 0

In [5]:
audio_clips_indexes = pd.read_csv("audio_to_frame_index.csv", index_col=0, header=0)

In [6]:
audio_clips_indexes = np.array(audio_clips_indexes)

In [7]:
import torch
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [8]:
import torchaudio
from torch.utils.data import Dataset, DataLoader

class AudioDataset(Dataset):
    def __init__(self, audio_data_file_list, labels):
        self.audio_data_file_list = audio_data_file_list
        self.labels = labels

    def __len__(self):
        return self.audio_data_file_list.shape[0]

    def __getitem__(self, idx):
        unique_values = np.unique(self.labels[idx*512:(idx+1)*512], return_counts=True)
        label = unique_values[0][np.argmax(unique_values[1])]
        audio_data, sr = librosa.load(self.audio_data_file_list[idx], sr=sample_rate)
        processed_waveform = extractor(audio_data, sampling_rate=16000, return_tensors="pt")
        # Process waveform to the format your model expects
        # e.g., resampling, extracting features like mel-spectrogram
        return processed_waveform, label

# Instantiate the Dataset and DataLoader
train_dataset = AudioDataset(audio_clips_indexes[:, 0], ground_truth)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False)


In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
import torch.nn.functional as F

criterion = torch.nn.BCELoss()

In [11]:
class_weights = torch.Tensor([0.7, 0.3]).to(device)

In [12]:
class_weights

tensor([0.7000, 0.3000], device='cuda:0')

In [13]:
model = model.to(device)
model.train()
num_epochs = 3
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, labels = batch
        inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        logits = model(inputs)
        logits = logits.squeeze(0)
        labels = labels.type(torch.cuda.FloatTensor)
        criterion.weight = torch.Tensor([class_weights[label.squeeze().to(device).int()] for label in labels]).to(device)
        logits = logits.squeeze(-1) if len(logits.shape) > 1 else logits
        loss = criterion(logits, labels)
        print(loss)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}')


tensor(0.0936, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0086, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0028, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0015, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0009, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0006, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0004, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0003, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0002, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0002, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0001, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(0.0001, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(9.8185e-05, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
tensor(8.4662e-05, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>

In [14]:
torch.save(model.state_dict(), 'ast_model_weights.pkl')

# Processing the audio frames

In [15]:
import librosa
import numpy as np
import time
import threading
import queue
import pickle
import torch
from transformers import AutoFeatureExtractor
from math import ceil

extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sample_rate = 16000
audio_file = "audio_aaico_challenge.wav"
audio_data, _ = librosa.load(audio_file, sr=sample_rate)
audio_data_int16 = (audio_data * 32767).astype(np.int16)

frame_length = 512
number_of_frames = len(audio_data_int16) // frame_length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]

results = np.zeros(shape=(3, len(audio_data_int16)), dtype=np.int64)
buffer = queue.Queue()
start_event = threading.Event()

def label_samples(list_samples_id, labels):
    receive_time = time.time_ns()
    results[1, list_samples_id] = labels
    results[2, list_samples_id] = receive_time

def notice_send_samples(list_samples_id):
    send_time = time.time_ns()
    results[0, list_samples_id] = send_time

def emit_data(): 
    time.sleep(.5)
    print('Start emitting')
    start_event.set()
    for i in range(0, number_of_frames):
        list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
        time.sleep(frame_length / sample_rate)  # Simulate real time
        frame = audio_data_int16[list_samples_id]
        buffer.put((i, frame))  # Include frame index
    print('Stop emitting')

def process_data():
    i = 0
    model.eval() 
    start_event.wait()
    print('Start processing')
    while i != number_of_frames:
        frame = buffer.get()
        list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
        notice_send_samples(list_samples_id)
        inputs = extractor(frame[1].astype(np.float32), sampling_rate=sample_rate, return_tensors="pt")
        inputs = {k: v.squeeze(1).to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(inputs).item()
            predictions = np.round(outputs)
            predictions = [predictions]*frame_length
            label_samples(list_samples_id, predictions)
        i += 1
    print('Stop processing')
    with open('results.pkl', 'wb') as file:
        print('Saving results')
        pickle.dump(results, file)

# Start threads for emitting and processing data
threading.Thread(target=emit_data).start()
threading.Thread(target=process_data).start()

Start emitting
Start processing
Stop emitting
Stop processing
Saving results


In [16]:
command_samples = [
    [142000, 160000],
    [340000, 360000],
    [620000, 635000]
]

nb_command_samples = sum([elem[1] - elem[0] for elem in command_samples])
ground_truth = np.ones(len(audio_data_int16))
for i in range(len(audio_data_int16)):
    if any([i >= e[0] and i <= e[1] for e in command_samples]):
        ground_truth[i] = 0
overrun_times_ms = (results[2] - results[0]) / 1e6
labels = results[1]

In [17]:
assert np.all(np.diff(results[2]) >= 0) # Labelling has been done sequentially
assert np.all(overrun_times_ms <= 50) # Processing took less than 50 ms for each sample

AssertionError: 

In [19]:
labels.sum()

964096

In [22]:
slow_sample_labelling_thres = 20
command_ratio = nb_command_samples / len(audio_data_int16)
communication_ratio = 1 - nb_command_samples / len(audio_data_int16)

score = len(audio_data_int16)
for i in range(len(audio_data_int16)):
    if overrun_times_ms[i] >= slow_sample_labelling_thres:
        score -= 1
    else:
        if ground_truth[i] == 0 and labels[i] != 0: # unintentional broadcast
            score -= int(1 / command_ratio)
        if ground_truth[i] == 1 and labels[i] != 1: # lost communication
            score -= int(1 / communication_ratio)
print(f'Score: {score / len(audio_data_int16)}')

Score: 0.0
