# New keywords

For this project, although we can already do some keyword spotting based on the Google keywords, we will need to use other keywords that are not present in this dataset, for example words like sit, follow, stand, etc. They could be a big step forward in terms of human-robot interaction, as using other words wouldn't make much sense for the human (although a real dog could arguably understand any word as a command, as long as this word has always been taught to it in the same way).

### Record new keywords

We will need to create a small dataset of keywords that will be used to train the classifier. A deep learning approach will not necessarily be the best candidate here due to the limited amount of data I will be able to gather.

In [1]:
import pyaudio
import time
import wave


In [2]:
def list_devices():
    """Print the index and name of every audio device PyAudio reports.

    The printed index is the value to pass as ``input_device_index`` when
    opening a stream. The PyAudio instance is terminated before returning,
    even if a device query raises, so its handle is not leaked on each call.
    """
    p = pyaudio.PyAudio()
    try:
        for i in range(p.get_device_count()):
            info = p.get_device_info_by_index(i)
            print("Device {} = {}".format(info["index"], info["name"]))
    finally:
        p.terminate()


In [3]:
list_devices()

Device 0 = Microsoft Sound Mapper - Input
Device 1 = Headset Microphone (HyperX Virt
Device 2 = VoiceMeeter Aux Output (VB-Audi
Device 3 = VoiceMeeter Output (VB-Audio Vo
Device 4 = Microphone (2- Aukey-PC-LM1E Au
Device 5 = VoiceMeeter VAIO3 Output (VB-Au
Device 6 = Microsoft Sound Mapper - Output
Device 7 = Headset Earphone (HyperX Virtua
Device 8 = C24F390 (NVIDIA High Definition
Device 9 = Realtek Digital Output (Realtek
Device 10 = VoiceMeeter Aux Input (VB-Audio
Device 11 = VoiceMeeter VAIO3 Input (VB-Aud
Device 12 = Haut-parleurs (Realtek(R) Audio
Device 13 = ROG PG278QR (NVIDIA High Defini
Device 14 = VoiceMeeter Input (VB-Audio Voi
Device 15 = Pilote de capture audio principal
Device 16 = Headset Microphone (HyperX Virtual Surround Sound)
Device 17 = VoiceMeeter Aux Output (VB-Audio VoiceMeeter AUX VAIO)
Device 18 = VoiceMeeter Output (VB-Audio VoiceMeeter VAIO)
Device 19 = Microphone (2- Aukey-PC-LM1E Audio)
Device 20 = VoiceMeeter VAIO3 Output (VB-Audio VoiceMeeter VAIO3)
De

In [4]:
# Records audio from microphone for 1 sec at 16kHz sampling rate
import os  # imported here because the notebook only imports `os` in a later cell

PATH = "custom_kw/"
keyword = f"sit {time.time()}"  # the timestamp keeps successive takes from overwriting each other
CHUNK = 320  # number of audio samples per frame (arbitrary value)
FORMAT = pyaudio.paInt16  # audio format
CHANNELS = 1  # mono audio
RATE = 16000  # sampling rate in Hz
RECORD_SECONDS = 1  # duration of each recording in seconds
FILE_NAME = PATH+f"{keyword}.wav"

# Create the output folder up front: otherwise wave.open() fails only after
# the take has already been recorded, and the recording is lost.
os.makedirs(PATH, exist_ok=True)

frames = []

print("recording starts in 3 seconds")
time.sleep(1)
print("2")
time.sleep(1)
print("1")
time.sleep(1)

p = pyaudio.PyAudio()
try:
    # Query the sample width while the PyAudio instance is still alive.
    sample_width = p.get_sample_size(FORMAT)
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=1)
    try:
        print("recording")
        # RATE / CHUNK frames make up one second of audio.
        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
        print("finished recording")
    finally:
        # Release the input device even if a read fails mid-recording.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

# The context manager closes the file even if writing the frames fails.
with wave.open(FILE_NAME, 'wb') as waveFile:
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(sample_width)
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))


recording starts in 3 seconds
2
1
recording
finished recording


### Create dataset from current audio folder

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [5]:
import os
import tqdm
from pathlib import Path
import pandas as pd
import torchaudio
import librosa
import IPython.display as ipd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric

PATH_TO_AUDIO = "google_speech_recognition_v2"

  from .autonotebook import tqdm as notebook_tqdm


source: https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb#scrollTo=-gh7fQ1XEpC7

In [6]:
# Index every .wav file under PATH_TO_AUDIO. The folder that directly
# contains a file is used as its keyword label.
data = []

for subdir, dirs, files in os.walk(PATH_TO_AUDIO):
    # Take the last path component through os.path rather than splitting on
    # "\\": the manual split only works with Windows separators and would
    # return the whole path as the label on other platforms.
    label = os.path.basename(os.path.normpath(subdir))
    for file in files:
        if file.endswith(".wav"):
            data.append({
                "name": file.split(".")[0],
                "path": os.path.join(subdir, file),
                "keyword": label,
            })

In [7]:
# Wrap the scanned file records in a DataFrame (one row per clip, with the
# "name", "path" and "keyword" keys of each record as columns) and list the
# distinct keyword labels that were found.
df = pd.DataFrame(data)
print("Labels: ", df["keyword"].unique())
print()
# Uncomment to display the number of clips per keyword:
#df.groupby("keyword").count()[["path"]]

Labels:  ['backward' 'bed' 'bird' 'cat' 'dog' 'down' 'eight' 'five' 'follow'
 'forward' 'four' 'go' 'happy' 'house' 'learn' 'left' 'marvin' 'nine' 'no'
 'off' 'on' 'one' 'right' 'seven' 'sheila' 'six' 'stop' 'three' 'tree'
 'two' 'up' 'visual' 'wow' 'yes' 'zero' '_background_noise_']



In [8]:
# Restrict the dataset to the keywords we actually want to recognise.
desired_keywords = ["follow", "go", "happy", "marvin", "stop", "down"]
df = df.loc[df["keyword"].isin(desired_keywords)]
print("Labels: ", df["keyword"].unique())
print()
# Number of clips per keyword; kept as the last expression so the notebook
# renders the resulting table.
df.groupby("keyword")[["path"]].count()

Labels:  ['down' 'follow' 'go' 'happy' 'marvin' 'stop']



Unnamed: 0_level_0,path
keyword,Unnamed: 1_level_1
down,3917
follow,1579
go,3880
happy,2054
marvin,2100
stop,3872


In [9]:
# Pick one random clip, print where it sits in the DataFrame and its label,
# then play it back as a quick sanity check of the data.
idx = np.random.randint(0, len(df))
sample = df.iloc[idx]
path = sample["path"]
label = sample["keyword"]


print(f"ID Location: {idx}")
print(f"      Label: {label}")
print()

speech, sr = torchaudio.load(path)
speech = speech[0].numpy()  # keep the first channel only
# Play back at the rate the file was loaded with instead of a hard-coded
# 16000, so a clip stored at another rate is not sped up or slowed down.
# Kept as the last expression so the notebook renders the audio widget.
ipd.Audio(data=speech, autoplay=True, rate=sr)

ID Location: 8959
      Label: go



In [10]:
# create csv files to be used to load data
save_path = "gsr_v2_cleaned"
# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs(save_path, exist_ok=True)

# Stratify on the keyword so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["keyword"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tab-separated files, read back later with delimiter="\t".
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

(13921, 3)
(3481, 3)


In [11]:
# Read the two tab-separated split files back through the `datasets` CSV
# loader. The held-out test file is exposed under the "validation" split.
data_files = {
    "train": f"{save_path}/train.csv",
    "validation": f"{save_path}/test.csv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
train_dataset, eval_dataset = dataset["train"], dataset["validation"]

print(train_dataset)
print(eval_dataset)

Downloading and preparing dataset csv/default to C:/Users/eliot/.cache/huggingface/datasets/csv/default-a369859cf17b7845/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 2006.84it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 167.12it/s]
                                                             

Dataset csv downloaded and prepared to C:/Users/eliot/.cache/huggingface/datasets/csv/default-a369859cf17b7845/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 111.41it/s]

Dataset({
    features: ['name', 'path', 'keyword'],
    num_rows: 13921
})
Dataset({
    features: ['name', 'path', 'keyword'],
    num_rows: 3481
})





In [12]:
# We need to specify the input and output column
input_column = "path"
output_column = "keyword"

In [13]:
# Collect the distinct keyword labels present in the training split.
# Sorting keeps the label -> id assignment deterministic across runs.
label_list = sorted(train_dataset.unique(output_column))
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

A classification problem with 6 classes: ['down', 'follow', 'go', 'happy', 'marvin', 'stop']


In [14]:
from transformers import AutoConfig, Wav2Vec2Processor

In [15]:
model_name_or_path = "facebook/wav2vec2-base"

In [16]:
# Configuration for the checkpoint with one output class per entry of
# label_list. The id maps built here use integer ids (the enumerate index).
# NOTE(review): no later cell visible here passes `config` to the model —
# confirm whether it is still needed.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_ks",
)



In [17]:
# Load the processor for the checkpoint and read the sampling rate of its
# feature extractor; audio files are resampled to this rate when loaded.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path,)
target_sampling_rate = processor.feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

The target sampling rate: 16000


In [18]:
def speech_file_to_array(path):
    """Load an audio file as a 1-D waveform at ``target_sampling_rate``.

    Args:
        path: Location of an audio file readable by ``torchaudio.load``.

    Returns:
        numpy.ndarray: The waveform as a one-dimensional array of samples.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # Only build a resampler when the file is not already at the target rate.
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
        speech_array = resampler(speech_array)
    # The loaded tensor is (channels, samples). Averaging over the channel
    # axis always yields a 1-D signal; a bare squeeze() left multi-channel
    # files 2-D. For single-channel files the samples are unchanged.
    return speech_array.mean(dim=0).numpy()

In [34]:
def label_to_id(label, label_list):
    """Map a label to its position in ``label_list``.

    Args:
        label: The label to look up.
        label_list: Ordered collection of known labels.

    Returns:
        The index of ``label`` in ``label_list``, ``-1`` when the label is
        not in a non-empty list, or ``label`` itself unchanged when
        ``label_list`` is empty.
    """
    # No known labels: pass the value through untouched.
    if len(label_list) == 0:
        return label
    try:
        return label_list.index(label)
    except ValueError:
        return -1

def preprocess_function(examples):
    """Turn a batch of dataset rows into model inputs plus integer labels.

    Args:
        examples: A batch of rows, indexable by column name, containing at
            least the ``input_column`` (audio paths) and ``output_column``
            (keyword labels) columns.

    Returns:
        The processor output for the loaded audio, with an added ``"labels"``
        entry holding the label id of each row.
    """
    speech_list = [speech_file_to_array(path) for path in examples[input_column]]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    # No debug print of `result` here: it would dump every sample of every
    # clip in the batch to stdout on each call.
    result = processor(speech_list, sampling_rate=target_sampling_rate, padding=True)
    result["labels"] = target_list

    return result

In [20]:
# Apply the preprocessing to both splits, 100 rows per call, in a single
# process. The mapped datasets replace the raw ones under the same names.
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=1
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=100,
    batched=True,
    num_proc=1
)

                                                                   

In [21]:
train_dataset

Dataset({
    features: ['name', 'path', 'keyword', 'input_values', 'labels'],
    num_rows: 13921
})

In [31]:
# Sanity-check the first preprocessed training example: number of input
# samples, and the label id next to the keyword it was derived from.
idx = 0
print(f"Training input_values: {len(train_dataset[idx]['input_values'])}")
# Left disabled: the mapped dataset's feature list shows no attention_mask column.
# print(f"Training attention_mask: {train_dataset[idx]['attention_mask']}")
print(f"Training labels: {train_dataset[idx]['labels']} - {train_dataset[idx]['keyword']}")

Training input_values: 16000
Training labels: 0 - down


In [23]:
# Label <-> id lookup tables handed to the model; ids are stored as strings.
label2id = {label: str(i) for i, label in enumerate(label_list)}
id2label = {str(i): label for i, label in enumerate(label_list)}

In [24]:
id2label[str(2)]

'go'

In [25]:
import evaluate
accuracy = evaluate.load("accuracy")

In [26]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids.
    """
    # The predicted class is the highest-scoring column of each row.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

In [27]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Build the audio classifier with one output per keyword. The checkpoint is
# taken from model_name_or_path, the same variable the config and processor
# cells use, so they cannot drift apart.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_name_or_path, num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.codevectors', 'project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_hid.weight', 'quantizer.weight_proj.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'classifier.weight', 'classifi

In [37]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint at the end of training.
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    # 4 accumulation steps x 32 samples = 128 samples per optimizer step per device
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    # presumably the key produced by compute_metrics — TODO confirm
    metric_for_best_model="accuracy",
    # NOTE(review): needs a Hugging Face login token. The recorded run of this
    # cell failed with "OSError: Token is required", and the notebook_login()
    # cell appears not to have been executed — log in first or disable this.
    push_to_hub=True,
)

# The preprocessed splits, the processor and the accuracy callback defined in
# the earlier cells are wired into the Trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

trainer.train()

OSError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.