In [73]:
import os
import glob
import numpy as np
from IPython.display import Audio, display

import torch

import torchaudio
import torchaudio.transforms as T

from transformers import ASTFeatureExtractor
from transformers import AutoConfig, AutoModelForAudioClassification, TrainingArguments, Trainer

# NOTE: this import rebinds `Audio` to datasets.Audio, shadowing the
# IPython.display.Audio imported above; keep it last so `Audio(...)` below
# refers to the datasets feature type.
from datasets import Audio, Dataset, DatasetDict, ClassLabel, load_metric

In [None]:
# Download and Unzip VC-PRG-1_5.zip
!wget http://cmp.felk.cvut.cz/data/audio_vc/audio/VC-PRG-1_5.zip
!unzip VC-PRG-1_5.zip
!rm VC-PRG-1_5.zip

In [None]:
# Download and Unzip VC-PRG-6.zip
!wget http://cmp.felk.cvut.cz/data/audio_vc/audio/VC-PRG-6.zip
!unzip VC-PRG-6.zip
!rm VC-PRG-6.zip

In [64]:
# Feature extractor for the AST model, created with the library's default
# settings (no arguments are passed). It is used by the preprocessing function
# and handed to the Trainer further down.
feature_extractor = ASTFeatureExtractor()



In [65]:
# Size of the label space: class ids 0 .. NUM_CLASSES - 1.
# NOTE(review): the labels built in the loading cell are line counts read from
# the annotation files; a count >= NUM_CLASSES would fall outside this label
# space -- confirm 15 is a safe upper bound for the data.
NUM_CLASSES = 15

# Hugging Face model configs expect `id2label` to map integer class ids to label
# names and `label2id` to map label names back to integer class ids.
id2label = {i: str(i) for i in range(NUM_CLASSES)}
label2id = {name: class_id for class_id, name in id2label.items()}

print(id2label)
print(label2id)

{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12', 13: '13', 14: '14'}
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, '11': 11, '12': 12, '13': 13, '14': 14}


In [66]:
def read_count_label(label_file):
    """Return the class label encoded by one annotation (.txt) file.

    The label is the number of non-blank lines in the file, or 0 when any line
    is the sentinel ``-1``. Lines are stripped before the comparison so the
    sentinel is recognised whether or not it is followed by a newline.

    Args:
        label_file: Path of the annotation file to read.

    Returns:
        The integer label for that file.
    """
    with open(label_file, "r") as f:
        entries = [line.strip() for line in f]
    # Ignore blank lines so a trailing empty line does not inflate the count.
    entries = [entry for entry in entries if entry]
    if "-1" in entries:
        return 0
    return len(entries)


def load_split(data_dir):
    """Collect the recordings and labels stored in one dataset directory.

    Recordings (``*.wav``) and annotation files (``*.txt``) are each listed in
    sorted order and paired by position.

    Args:
        data_dir: Directory containing the .wav and .txt files.

    Returns:
        A ``(wav_paths, labels)`` tuple of equally long lists.

    Raises:
        ValueError: If the number of .wav and .txt files differs, since the
            positional pairing would then be wrong.
    """
    wav_paths = sorted(glob.glob(os.path.join(data_dir, "*.wav")))
    annotation_paths = sorted(glob.glob(os.path.join(data_dir, "*.txt")))
    if len(wav_paths) != len(annotation_paths):
        raise ValueError(
            f"{data_dir}: found {len(wav_paths)} .wav files but "
            f"{len(annotation_paths)} .txt label files"
        )
    return wav_paths, [read_count_label(path) for path in annotation_paths]


train_path = "../VC-PRG-1_5/"
train_X, train_y = load_split(train_path)

test_path = "../VC-PRG-6/"
test_X, test_y = load_split(test_path)

In [67]:
def _as_audio_dataset(wav_paths, labels):
    """Build a Dataset with 'file', 'audio' and 'label' columns.

    'file' holds the base name of each path, 'audio' holds the path itself and
    is cast to ``Audio(sampling_rate=16000)``, and 'label' holds the labels.
    """
    columns = {
        'file': list(map(os.path.basename, wav_paths)),
        'audio': wav_paths,
        'label': labels,
    }
    return Dataset.from_dict(columns).cast_column("audio", Audio(sampling_rate=16000))


train_audio = _as_audio_dataset(train_X, train_y)
test_audio = _as_audio_dataset(test_X, test_y)

# Bundle both splits under the "train" / "test" keys used by the later cells.
dataset = DatasetDict({"train": train_audio, "test": test_audio})
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 250
    })
    test: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 172
    })
})

In [68]:
def preprocess_function(examples):
    """Convert a batch of decoded audio examples into model inputs.

    Reads the "array" entry of every item in ``examples["audio"]`` and passes
    the collected waveforms to the module-level ``feature_extractor``. Whatever
    the extractor returns is returned unchanged.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    # NOTE(review): `max_length` and `truncation` look like arguments meant for
    # a raw-waveform extractor (20 s at 16 kHz). Confirm ASTFeatureExtractor's
    # __call__ honours them; if not, the clip length is governed by the
    # extractor's own construction-time settings instead.
    return feature_extractor(
        waveforms,
        sampling_rate=16000,
        max_length=16000 * 20,
        truncation=True,
    )

In [69]:
# Apply the preprocessing function to every split in batches. The raw "audio"
# and "file" columns are dropped from the result.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["audio", "file"],
)
encoded_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 250
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 172
    })
})

In [70]:
# Checkpoint that supplies both the model configuration and the pretrained weights.
MODEL_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"

num_labels = len(id2label)

# Start from the checkpoint's configuration and override the label space.
config = AutoConfig.from_pretrained(
    MODEL_CHECKPOINT,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

# Load the pretrained weights so that training is an actual fine-tune:
# `from_config(config)` would only build the architecture with randomly
# initialised weights. `ignore_mismatched_sizes=True` allows the checkpoint's
# classification head, whose output size differs from `num_labels`, to be
# replaced by a freshly initialised head instead of raising a size-mismatch error.
model = AutoModelForAudioClassification.from_pretrained(
    MODEL_CHECKPOINT,
    config=config,
    ignore_mismatched_sizes=True,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_hid.weight', 'project_q.weight', 'project_hid.bias', 'quantizer.weight_proj.bias', 'project_q.bias', 'quantizer.codevectors', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['projector.weight', 'projector.bias', 'classifier.

In [71]:
model_name = "ast"

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-ks",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Batching: 32 samples per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best "accuracy".
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Reporting.
    logging_steps=10,
    push_to_hub=False,
)

  return torch._C._cuda_getDeviceCount() > 0


In [74]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is computed directly with numpy, so nothing has to be loaded or
    downloaded each time the Trainer evaluates.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores with the
            class axis at index 1, or a tuple whose first item is that array)
            and ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": <float>}`` -- the fraction of examples whose highest
        scoring class equals the reference label.
    """
    scores = eval_pred.predictions
    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    predictions = np.argmax(scores, axis=1)
    references = np.asarray(eval_pred.label_ids)
    return {"accuracy": float(np.mean(predictions == references))}

In [75]:
# Wire the model, the training arguments, both encoded splits and the metric
# function into a single Trainer. The feature extractor is passed through the
# `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [76]:
# Run the training loop with the arguments configured on `trainer`.
trainer.train()

***** Running training *****
  Num examples = 250
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 10
  Number of trainable parameters = 94572431


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Evaluate the trained model; the evaluation split is the `eval_dataset` that was
# passed when `trainer` was constructed.
trainer.evaluate()