In [14]:
import librosa
import numpy as np
import time
import threading
import queue
import pickle

import fastdtw
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [15]:
########### PARAMETERS ###########
# DO NOT MODIFY
# Desired sample rate 16000 Hz
sample_rate = 16000
# Frame length
frame_length = 512

In [16]:
# import audio

audio_data, audio_sr = librosa.load(
    "F:\Code Repos\\AAICO-Voice-Competition\\audio_aaico_challenge.wav",
    sr=sample_rate
)

audio_data_int16 = (audio_data * 32767).astype(np.int16) # converting audio data from 32-bit float to 16-bit int
number_of_frames = len(audio_data_int16) // frame_length # number of frames if audio split by 512 frame length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length] # trimming audio data into multiples of 512 
audio_duration = len(audio_data_int16) / sample_rate # calculating audio duration

  "F:\Code Repos\\AAICO-Voice-Competition\\audio_aaico_challenge.wav",


In [17]:
# import test audios

template_test, template_sr = librosa.load(
    "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\galactic_main_test.wav",
    sr=sample_rate
)

test1_data, test1_sr = librosa.load(
    "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\galactic_sample.wav",
    sr=sample_rate
)

test2_data, test2_sr = librosa.load(
    "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\test1.wav",
    sr=sample_rate
)

  "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\galactic_main_test.wav",
  "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\galactic_sample.wav",
  "F:\Code Repos\\AAICO-Voice-Competition\\audio-files\\test1.wav",


In [23]:
distance, _ = fastdtw.fastdtw(template_test.T, test1_data.T)
distance

1119.4487441063716

In [28]:
########### STREAMING SIMULATION ###########
# DO NOT MODIFY
results = np.zeros(shape=(3, len(audio_data_int16)), dtype=np.int64)
# Detection mask lines are SENT TIME, LABEL, RECEIVE TIME. 
buffer = queue.Queue()
start_event = threading.Event()

def label_samples(list_samples_id, labels):
    """
        Receives the index of samples for a frame 
        and allocates each sameple's label (0, 1) 
        and receive time.
    """
    receive_time = time.time_ns()
    results[1][list_samples_id] = labels
    results[2][list_samples_id] = receive_time

def notice_send_samples(list_samples_id):
    """
        Receives the index of samples for a frame and 
        allocates each sample's send time 
    """
    send_time = time.time_ns()
    results[0][list_samples_id] = send_time

def emit_data():
    """
        Each iteration of the loop generate a list of 
        indices of total length 512. Retreieve the frame
        from audio data using the indicies and inputting 
        into the buffer. 

        Sends the same set of indicies to "notice_send_samples" 
        to record sent time.
    """ 
    time.sleep(.5)
    print('Start emitting')
    start_event.set()
    for i in range(0, number_of_frames):
        list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
        time.sleep(frame_length / sample_rate) # Simulate real time
        frame = audio_data_int16[list_samples_id]
        
        # if DEBUG:
        #     print(list_samples_id)
        #     print(frame)

        buffer.put(frame)
        notice_send_samples(list_samples_id)
    print('Stop emitting')

def process_data():
    """
        Loop runs while the current frame's processed isn't equal to
        total number of frames. 
        
        Retrieves the frame from the buffer and generates the indicies list 
        for the samples. Generates the labels for the samples, sends both 
        parameters to the label_samples function to label which samples are commands
        and which are not.
    """
    i = 0
    start_event.wait()
    print('Start processing')
    distances=[]
    while i != number_of_frames:
        start_time = time.time()
        frame = buffer.get() 
        
        distance, _ = fastdtw.fastdtw(frame, test1_data)
        distances.append(distance)
        
        if distance < 100:
            list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
            labels = [1 for _ in range(len(list_samples_id))]
            label_samples(list_samples_id, labels)
        
        i += 1
        end_time = time.time()
        duration = (end_time - start_time) / 1000 # convert s to ms
        
        if distance < 100:
            print("Processed Frame: {} | Distance: {} | Time Taken: {}".format(i, distance, duration))        
        # print("Processed Frame: {} | Distance: {} | Time Taken: {}".format(i, distance, duration))
        
    min_dist = min(distances)
    print("Minimum distance found: {} at Frame: {}".format(min_dist, distances.index(min_dist)))
    print('Stop processing')
    # Save the list to a file
    with open('results.pkl', 'wb') as file:
        pickle.dump(results, file)


In [29]:
time_measurement = []

thread_process = threading.Thread(target=process_data)
thread_emit = threading.Thread(target=emit_data)

thread_process.start()
thread_emit.start()

Start emitting
Start processing
