# **AAICO February 2024 Voice Processing Challenge**

### Install

In [3]:
! pip install --upgrade pip
! pip install librosa

Collecting pip
  Downloading pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.2
    Uninstalling pip-23.3.2:
      Successfully uninstalled pip-23.3.2
Successfully installed pip-24.0


### Import

In [1]:
import librosa
import numpy as np
import time
from IPython.display import Audio, display
import plotly.express as px
import pickle

### **Audio parameters**

In [3]:
# Target sampling rate in Hz; the audio is resampled to this rate when loaded
sample_rate = 16_000

# Number of samples per frame
frame_length = 512

### **Open audio file**

In [4]:
# Path to the audio file
audio_file = "audio_aaico_challenge.wav"

# Load the file, resampled to `sample_rate`
audio_data, current_sample_rate = librosa.load(
    audio_file,
    sr=sample_rate,
)

# Convert to 16-bit integers. Samples are expected to lie in [-1, 1]; clip first
# so that any overshoot (e.g. introduced by resampling) cannot overflow the cast.
audio_data_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

# Keep only whole frames so the signal length is a multiple of `frame_length`
number_of_frames = len(audio_data_int16) // frame_length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]

# Duration of the trimmed signal, in seconds
audio_duration = len(audio_data_int16) / sample_rate

In [None]:
# Peek at the converted samples (the repr of a long array is abbreviated)
audio_data_int16

array([4, 7, 5, ..., 0, 0, 0], dtype=int16)

### **Play and visualise audio**

In [None]:
# Play the whole recording. Reuse `sample_rate` instead of a literal so the
# playback rate always matches the rate the audio was loaded at.
display(Audio(audio_data_int16, rate=sample_rate))

In [20]:
# Play an excerpt of the float signal `audio_data`.
# `Audio` and `np` come from the notebook's import cell; they are not re-imported here.
# Adjust `start_sample` / `end_sample` (sample indices) to choose the excerpt.
start_sample = 620000
end_sample = 637952

Audio(np.array(audio_data[start_sample:end_sample]), rate=sample_rate)


In [None]:
# Plot the int16 signal
fig = px.scatter(data_frame=audio_data_int16, title="Input audio")
fig.show()

### **Reference constants**

In [4]:
# Sample-index ranges [start, end] of the voice commands in the recording
command_samples = [
    [142000, 160000],
    [340000, 360000],
    [620000, 635000],
]

# Total command length: sum of (end - start) over all ranges
nb_command_samples = sum(end - start for start, end in command_samples)

#### **Ground truth**

We establish the ground truth of the labels as follows:
- Voice command samples (every index inside a `command_samples` range, both bounds included) are labeled as 0.
- Everything that is not a command is labeled as 1.

In [5]:
# Per-sample ground truth: 1 everywhere, 0 inside a command range.
# Both range bounds are inclusive. Comparing against an index array replaces
# a Python loop over every sample.
sample_indices = np.arange(len(audio_data_int16))
ground_truth = np.ones(len(audio_data_int16))
for start, end in command_samples:
    ground_truth[(sample_indices >= start) & (sample_indices <= end)] = 0

In [None]:
# Plot the int16 signal with a red vertical line at both ends of every command range
fig = px.scatter(data_frame=audio_data_int16, title="Input audio")
for start, end in command_samples:
    for boundary in (start, end):
        fig.add_vline(x=boundary, line_color="red")
fig.show()

### **Evaluate results**

Load the result file.

In [6]:
result_path = 'results.pkl'

# Deserialise the pickled results (pickle needs the file opened in binary mode).
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
with open(result_path, mode='rb') as result_file:
    results = pickle.load(result_file)

Calculate the overrun time for every sample, extract the labels.

In [7]:
# Entry 1 of the results holds the labels
labels = results[1]

# Difference between entries 2 and 0, divided by 1e6 to express it in milliseconds
# NOTE(review): presumably both entries are nanosecond timestamps — confirm against the producer
elapsed = results[2] - results[0]
overrun_times_ms = elapsed / 1e6

#### **Assertions**

A solution is valid if:
- Samples have been labelled sequentially
- Each sample has been processed in 50 ms or less

These constraints aim to enforce the real time simulation.

In [8]:
# Labelling has been done sequentially: the values in results[2] never decrease
assert np.all(np.diff(results[2]) >= 0), (
    "Samples were not labelled sequentially: results[2] is not non-decreasing"
)

# Processing took at most 50 ms for each sample
assert np.all(overrun_times_ms <= 50), (
    f"Slowest sample took {np.max(overrun_times_ms):.3f} ms, above the 50 ms limit"
)

AssertionError: 

#### **Scoring**

The score is calculated by penalizing:
- The samples that have been processed in 20 ms or more.
- The samples of commands that have been broadcast.
- The samples of communications that have not been broadcast.

In [9]:
slow_sample_labelling_thres = 20
command_ratio = nb_command_samples / len(audio_data_int16)
communication_ratio = 1 - nb_command_samples / len(audio_data_int16)

# Vectorised scoring: boolean masks replace a Python loop over every sample.
nb_samples = len(audio_data_int16)
overrun_ms = np.asarray(overrun_times_ms)[:nb_samples]
predicted_labels = np.asarray(labels)[:nb_samples]

# A sample labelled too slowly costs 1 point and its label is not checked
too_slow = overrun_ms >= slow_sample_labelling_thres
on_time = ~too_slow

# Command sample not labelled 0 -> unintentional broadcast
unintentional_broadcasts = int(
    np.count_nonzero(on_time & (ground_truth == 0) & (predicted_labels != 0))
)
# Communication sample not labelled 1 -> lost communication
lost_communications = int(
    np.count_nonzero(on_time & (ground_truth == 1) & (predicted_labels != 1))
)

# Each labelling error is weighted by the truncated inverse of its class ratio.
# The penalty is only evaluated when such an error exists, so a ratio of zero
# is never divided by unless the matching class actually occurs.
score = nb_samples - int(np.count_nonzero(too_slow))
if unintentional_broadcasts:
    score -= unintentional_broadcasts * int(1 / command_ratio)
if lost_communications:
    score -= lost_communications * int(1 / communication_ratio)

print(f'Score: {score / len(audio_data_int16)}')

Score: 0.008185906797663303


In [16]:
# Self-contained evaluation: reloads the audio, rebuilds the ground truth,
# loads the results and prints the score together with the error counts.

# Desired sample rate 16000 Hz
sample_rate = 16000

# Frame length (samples per frame)
frame_length = 512

# Path to the audio file
audio_file = "audio_aaico_challenge.wav"

# Read the audio file and resample it to the desired sample rate
audio_data, current_sample_rate = librosa.load(
    audio_file,
    sr=sample_rate,
)

# Convert to 16-bit integers. Samples are expected to lie in [-1, 1]; clip first
# so that any overshoot (e.g. introduced by resampling) cannot overflow the cast.
audio_data_int16 = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

# Keep only whole frames
number_of_frames = len(audio_data_int16) // frame_length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length]
audio_duration = len(audio_data_int16) / sample_rate

# Sample-index ranges [start, end] of the voice commands
command_samples = [
    [142000, 160000],
    [340000, 360000],
    [620000, 635000]
]

nb_command_samples = sum([elem[1] - elem[0] for elem in command_samples])

# Ground truth: 1 everywhere, 0 inside a command range (both bounds inclusive).
# Comparing against an index array replaces a Python loop over every sample.
sample_indices = np.arange(len(audio_data_int16))
ground_truth = np.ones(len(audio_data_int16))
for start, end in command_samples:
    ground_truth[(sample_indices >= start) & (sample_indices <= end)] = 0

result_path = 'results.pkl'

# Open and read the results from the file.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
with open(result_path, 'rb') as file:
    results = pickle.load(file)

# NOTE(review): presumably results[0] and results[2] are nanosecond timestamps — confirm
overrun_times_ms = (results[2] - results[0]) / 1e6
labels = results[1]

slow_sample_labelling_thres = 20
command_ratio = nb_command_samples / len(audio_data_int16)
communication_ratio = 1 - nb_command_samples / len(audio_data_int16)

# Vectorised scoring: boolean masks replace a Python loop over every sample.
nb_samples = len(audio_data_int16)
overrun_ms = np.asarray(overrun_times_ms)[:nb_samples]
predicted_labels = np.asarray(labels)[:nb_samples]

# A sample labelled too slowly costs 1 point and its label is not checked
too_slow = overrun_ms >= slow_sample_labelling_thres
on_time = ~too_slow

# Command sample not labelled 0 -> unintentional broadcast
unintentional_broadcasts = int(
    np.count_nonzero(on_time & (ground_truth == 0) & (predicted_labels != 0))
)
# Communication sample not labelled 1 -> lost communication
lost_communications = int(
    np.count_nonzero(on_time & (ground_truth == 1) & (predicted_labels != 1))
)

# Each labelling error is weighted by the truncated inverse of its class ratio.
# The penalty is only evaluated when such an error exists, so a ratio of zero
# is never divided by unless the matching class actually occurs.
score = nb_samples - int(np.count_nonzero(too_slow))
if unintentional_broadcasts:
    score -= unintentional_broadcasts * int(1 / command_ratio)
if lost_communications:
    score -= lost_communications * int(1 / communication_ratio)

print(f'Score: {score / len(audio_data_int16)}')
print(f'Unintentional broadcasts: {unintentional_broadcasts}')
print(f'Lost communications: {lost_communications}')

Score: 0.49112121614445037
Unintentional broadcasts: 25376
Lost communications: 1536
