In [2]:
import pandas as pd
import librosa
import os
import numpy as np
import torch
from torch.nn import L1Loss, MSELoss
from enum import Enum

from src.utils import get_git_root
import src.datasets as ds

ModuleNotFoundError: No module named 'constants'

In [70]:
# specify paths for labels and dataset
data_paths = os.path.join(get_git_root(os.getcwd()), 'src' ,'Data')
LABELS_KEYPHRASES_CSV_PATH = os.path.join(data_paths, "Keyphrases" , "labels.csv")
TEDLIUM_WAV_PATH = os.path.join(data_paths, "TEDLIUM_release-3", "data", "sph")
MSWC_WAV_PATH = os.path.join(data_paths, "MSWCcc", "en", "clips")

In [63]:
# Read the labels into a dataframe
labels_df = pd.read_csv(LABELS_KEYPHRASES_CSV_PATH)

In [64]:
labels_df[labels_df["TEDLIUM_SET"] == "MarvinMinsky_2003"]

Unnamed: 0,Keyword,TEDLIUM_SampleID,TED_TALK_ID,TEDLIUM_SET,MSWC_AudioID,start_time,end_time,confidence


# Util

In [65]:
def features_extractor(audio): 
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    mfccs_scaled_features = np.mean(mfccs_features.T,axis=0)
    
    return mfccs_scaled_features

In [66]:
def window_split(frame_len, hop_len,data, print_frame = False):
    frames = librosa.util.frame(data, frame_length=frame_len, hop_length=hop_len)
    windowed_frames = np.hanning(frame_len).reshape(-1, 1) * frames
    sum_len=0
    if print_frame:
        # Print frames
        for i, frame in enumerate(frames):
            print("Frame {}: {}".format(i, frame))
            print("Length of frame :{}".format(len(frame)))
            sum_len+=len(frame)
        print("All frames :{}".format(sum_len))
        print("More data than original size: {}".format(len(frames) <= sum_len))
        # Print windowed frames
        for i, frame in enumerate(windowed_frames):
            print("Win Frame {}: {}".format(i, np.round(frame, 3)))
        
    return frames,windowed_frames

In [67]:
def mae_loss(x, y):
    loss = L1Loss()
    return torch.reshape(loss(x,y), shape=(1,1))

def rmse_loss(x,y):
    loss = MSELoss()
    return torch.reshape(torch.sqrt(loss(x,y)), shape=(1,1))

class LossType(Enum):
    MAE = "mae"
    RMSE = "rmse"


def compare_window(keyword, window, loss_type=LossType.MAE):

    if loss_type == LossType.MAE:
        return mae_loss(keyword, window)

    elif loss_type == LossType.RMSE:
        return rmse_loss(keyword, window)

def match_audio(keyword, sliding_windows, loss_type=LossType.MAE):
    results = []
    for i in range(len(sliding_windows)):
        t = sliding_windows[i]
        loss = compare_window(keyword, t, loss_type=loss_type)
        results.append(loss)
    return torch.cat(results, dim=0)

In [68]:
labels_df.head()

Unnamed: 0,Keyword,TEDLIUM_SampleID,TED_TALK_ID,TEDLIUM_SET,MSWC_AudioID,start_time,end_time,confidence
0,because,0,train,911Mothers_2010W,because/common_voice_en_97853.opus,15.03125,15.498812,0.934554
1,the,1,train,911Mothers_2010W,the/common_voice_en_207024__2.opus,16.621125,16.741375,0.836357
2,fact,1,train,911Mothers_2010W,the/common_voice_en_207024__2.opus,16.821563,17.062062,0.998591
3,that,1,train,911Mothers_2010W,the/common_voice_en_207024__2.opus,17.102187,17.2425,0.99976
4,have,1,train,911Mothers_2010W,the/common_voice_en_207024__2.opus,17.523125,17.843813,0.944685


# Create Dataset

In [75]:
# List all the files in the Tedlium wav directory
ted_wav_files = os.listdir(TEDLIUM_WAV_PATH)

ted_data = []

for file in ted_wav_files:
    file_path = os.path.join(TEDLIUM_WAV_PATH, file)
    audio, sample_rate_ted = librosa.load(file_path, res_type='kaiser_fast')
    # Retrieve the keywords belonging to this audio:
    # [:-8] removes the last four characters (here the file extension) from the filename
    file_stem = file[:-4]
    # Locate the rows in the labels dataframe belonging to this recording based on the file stem
    labels_rows = labels_df[labels_df["TEDLIUM_SET"] == file_stem]

    # If the audio file does not have a corresponding row in the data frame, just continue
    if len(labels_rows) < 1:
        continue
        
    print(audio)
    # load the mswc waveforms into audio for each row and save to ted_data (which will later be used for training)
    for index, row in labels_rows.iterrows():
        #retrieve the keyword and file name
        keyword = row["Keyword"]
        keyword_file_path = os.path.join(MSWC_WAV_PATH, row["MSWC_AudioID"])
        keyword_audio, sample_rate_keyword = librosa.load(keyword_file_path, res_type='kaiser_fast') 
        
        # Get the audio length of the keyword so that we can window the data
        keyword_len = len(keyword_audio)
        
        # Convert the keyword audio to normalized mfcc
        mfcc_keyword_audio = features_extractor(keyword_audio)
        
        # Window the tedlium input data 
        frames, window_frames = window_split(keyword_len, 1, audio)
        mfcc_audio = features_extractor(window_frames)
        
        start_time = row["start_time"]
        end_time = row["end_time"]
        # Audio, keyword audio, start_time, end_time
        new_row = [mfcc_audio, mfcc_keyword_audio, start_time, end_time]
        ted_data.append(new_row)
        
    break

[ 2.7778121e-06 -8.6522118e-07 -5.0232247e-06 ... -6.1102759e-07
  1.3981060e-06  0.0000000e+00]


MemoryError: Unable to allocate 3.90 TiB for an array with shape (22050, 24292320) and data type float64

In [8]:
# Convert ted_data to a numpy array
ted_data_np = np.asarray(ted_data)

# Divide samples and labels
X = ted_data_np[:,:2]
y = ted_data_np[:,2:]

'hello'

# Evaluation

In [None]:
# If the model detects the keyword within one millisecond of the actual label, return true and otherwise return false
# This way we can compute the accuracy of the model (as opposed to e.g. the error)

