In [1]:
import librosa
import numpy as np
import time
import threading
import queue
import pickle

import fastdtw
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
########### PARAMETERS ###########
# DO NOT MODIFY
# Desired sample rate 16000 Hz
sample_rate = 16000
# Frame length
frame_length = 512

# debug flag
DEBUG = True

In [3]:
########### AUDIO FILE ###########
# DO NOT MODIFY
# Path to the audio file
audio_file = "audio_aaico_challenge.wav"

# Read the audio file and resample it to the desired sample rate
audio_data, current_sample_rate = librosa.load(
    audio_file, 
    sr=sample_rate,
)
audio_data_int16 = (audio_data * 32767).astype(np.int16) # converting audio data from 32-bit float to 16-bit int
number_of_frames = len(audio_data_int16) // frame_length # number of frames if audio split by 512 frame length
audio_data_int16 = audio_data_int16[:number_of_frames * frame_length] # trimming audio data into multiples of 512 
audio_duration = len(audio_data_int16) / sample_rate # calculating audio duration

In [4]:
### Loading model
device = "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe("audio_aaico_challenge.wav")
print(result["text"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Engine 3, this is Bravo Team. We've got heavy smoke and flames visible from the Charlie side. We're going for a defensive attack, over. Galactic temperature. This is Bravo Team. Visibility's near zero, lots of heat. We're gonna hit Charlie hard from the exterior, then make entry from a primary search, over. Galactic battery. Galactic Battery. Keep an eye on that roof. Looks sketchy. Engine 3, better charge the hose. Over. Acknowledged. We've got a 2.5 inch line ready for a blitz attack. We'll knock down the bulk of the fire before we go in. Galactic Oxygen. Pass is active and we're staying on the comms. Going in for the attack now. Stay sharp out there. Over. Bravo going in. Over.


In [7]:
########### STREAMING SIMULATION ###########
# DO NOT MODIFY
results = np.zeros(shape=(3, len(audio_data_int16)), dtype=np.int64)
# Detection mask lines are SENT TIME, LABEL, RECEIVE TIME. 
buffer = queue.Queue()
start_event = threading.Event()

def label_samples(list_samples_id, labels):
    """
        Receives the index of samples for a frame 
        and allocates each sameple's label (0, 1) 
        and receive time.
    """
    receive_time = time.time_ns()
    results[1][list_samples_id] = labels
    results[2][list_samples_id] = receive_time

def notice_send_samples(list_samples_id):
    """
        Receives the index of samples for a frame and 
        allocates each sample's send time 
    """
    send_time = time.time_ns()
    results[0][list_samples_id] = send_time

def emit_data():
    """
        Each iteration of the loop generate a list of 
        indices of total length 512. Retreieve the frame
        from audio data using the indicies and inputting 
        into the buffer. 

        Sends the same set of indicies to "notice_send_samples" 
        to record sent time.
    """ 
    time.sleep(.5)
    print('Start emitting')
    start_event.set()
    for i in range(0, number_of_frames):
        list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
        time.sleep(frame_length / sample_rate) # Simulate real time
        frame = audio_data_int16[list_samples_id]
        
        # if DEBUG:
        #     print(list_samples_id)
        #     print(frame)

        buffer.put(frame)
        notice_send_samples(list_samples_id)
    print('Stop emitting')

def process_data():
    """
        Loop runs while the current frame's processed isn't equal to
        total number of frames. 
        
        Retrieves the frame from the buffer and generates the indicies list 
        for the samples. Generates the labels for the samples, sends both 
        parameters to the label_samples function to label which samples are commands
        and which are not.
    """
    i = 0
    start_event.wait()
    print('Start processing')
    while i != number_of_frames:
        start_time = time.time()
        frame = buffer.get() 
        
        result = pipe(frame)
        # list_samples_id = np.arange(i*frame_length, (i+1)*frame_length)
        # labels = [1 for _ in range(len(list_samples_id))]
        # label_samples(list_samples_id, labels)
        
        i += 1
        end_time = time.time()
        duration = (end_time - start_time) / 1000 # convert s to ms
        
        print("Processed Frame: {} | Result: {} | Time Taken: {}".format(i, result["text"], duration))
        
    print('Stop processing')
    # Save the list to a file
    with open('results.pkl', 'wb') as file:
        pickle.dump(results, file)


In [8]:
time_measurement = []

thread_process = threading.Thread(target=process_data)
thread_emit = threading.Thread(target=emit_data)

thread_process.start()
thread_emit.start()

Start emitting
Start processing
Processed Frame: 1 | Result:  Продолжение следует... | Time Taken: 0.011759534120559693
Processed Frame: 2 | Result:  Subtitles by the Amara.org community | Time Taken: 0.012067009449005127
Processed Frame: 3 | Result:  Thank you. | Time Taken: 0.010975581645965576
Processed Frame: 4 | Result:  Thank you. | Time Taken: 0.010955400228500367
Processed Frame: 5 | Result:  Thank you. | Time Taken: 0.011103220462799072
Stop emitting
Processed Frame: 6 | Result:  Thank you. | Time Taken: 0.011428686857223511
Processed Frame: 7 | Result:  Thank you. | Time Taken: 0.01116071367263794
Processed Frame: 8 | Result:  Takk for ating medietekst. | Time Taken: 0.01226189351081848
Processed Frame: 9 | Result:  I'm sorry. | Time Taken: 0.011101377725601196
