<a href="https://colab.research.google.com/github/ZahraDehghani99/Audio_classification/blob/main/Audio_classification_on_Intent_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://huggingface.co/docs/transformers/tasks/audio_classification#audio-classification

In [1]:
!pip install -q transformers datasets evaluate

In [2]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")



In [4]:
minds

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 563
})

In [5]:
minds = minds.train_test_split(test_size=0.2)

In [6]:
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [7]:
minds['train'].features

{'path': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=8000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None),
 'english_transcription': Value(dtype='string', id=None),
 'intent_class': ClassLabel(names=['abroad', 'address', 'app_error', 'atm_limit', 'balance', 'business_loan', 'card_issues', 'cash_deposit', 'direct_debit', 'freeze', 'high_value_payment', 'joint_account', 'latest_transactions', 'pay_bill'], id=None),
 'lang_id': ClassLabel(names=['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN'], id=None)}

In [8]:
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])

In [9]:
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 113
    })
})

In [10]:
minds["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~CARD_ISSUES/602ba65cbb1e6d0fbce9209a.wav',
  'array': array([ 0.        ,  0.        ,  0.00024414, ..., -0.00024414,
          0.        ,  0.        ], dtype=float32),
  'sampling_rate': 8000},
 'intent_class': 6}

In [11]:
labels = minds["train"].features["intent_class"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [12]:
id2label[str(2)]

'app_error'

In [13]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



In [14]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [15]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~CARD_ISSUES/602ba65cbb1e6d0fbce9209a.wav',
  'array': array([-2.6319238e-05, -5.4239896e-05,  2.7109292e-05, ...,
          4.3437420e-05,  2.6976457e-05, -1.8893874e-05], dtype=float32),
  'sampling_rate': 16000},
 'intent_class': 6}

In [16]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

In [17]:
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 113
    })
})

In [18]:
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
encoded_minds

DatasetDict({
    train: Dataset({
        features: ['intent_class', 'input_values'],
        num_rows: 450
    })
    test: Dataset({
        features: ['intent_class', 'input_values'],
        num_rows: 113
    })
})

In [20]:
encoded_minds['train'][0]

{'intent_class': 6,
 'input_values': [0.0006347077433019876,
  0.00038001517532393336,
  0.001122083282098174,
  0.0023715277202427387,
  0.0028567605186253786,
  0.002144683850929141,
  0.001107511343434453,
  0.0006449008360505104,
  0.0006647933623753488,
  0.0007330999942496419,
  0.0010521183721721172,
  0.0019936594180762768,
  0.0029661410953849554,
  0.00271259224973619,
  0.000961493409704417,
  -0.0009227991686202586,
  -0.001384628820233047,
  -0.00037016160786151886,
  0.0008499175310134888,
  0.001252811518497765,
  0.0009571618866175413,
  0.0006645022658631206,
  0.0007374011911451817,
  0.000986007391475141,
  0.001062094117514789,
  0.0008643547189421952,
  0.0006450734217651188,
  0.0007489360286854208,
  0.001137445797212422,
  0.0012479724828153849,
  0.0005901537952013314,
  -0.0005050374893471599,
  -0.0010574492625892162,
  -0.0005172313540242612,
  0.000581816304475069,
  0.0012564711505547166,
  0.00115439691580832,
  0.0007492142030969262,
  0.0006189707200974

In [None]:
len(encoded_minds['train'][0]['input_values'])

16000

In [21]:
encoded_minds = encoded_minds.rename_column("intent_class", "label")

In [22]:
encoded_minds

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 450
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 113
    })
})

In [23]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [24]:
import numpy as np


def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [25]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.bias', 'project_hid.bias', 'quantizer.codevectors', 'project_hid.weight', 'project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'projector

In [None]:
model

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), strid

In [None]:
training_args = TrainingArguments(
    output_dir="my_awesome_mind_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

Cloning https://huggingface.co/Zahra99/my_awesome_mind_model into local empty directory.


In [None]:
trainer.train()

***** Running training *****
  Num examples = 450
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 4
  Total optimization steps = 30
  Number of trainable parameters = 94572174


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,2.641682,0.088496
1,No log,2.649028,0.035398
2,No log,2.652694,0.061947
3,3.227900,2.657254,0.053097
4,3.227900,2.656207,0.079646
5,3.227900,2.665526,0.061947
6,3.218800,2.663842,0.070796
7,3.218800,2.667537,0.061947
8,3.218800,2.668505,0.044248
9,3.214000,2.668246,0.053097


***** Running Evaluation *****
  Num examples = 113
  Batch size = 32
Saving model checkpoint to my_awesome_mind_model/checkpoint-3
Configuration saved in my_awesome_mind_model/checkpoint-3/config.json
Model weights saved in my_awesome_mind_model/checkpoint-3/pytorch_model.bin
Feature extractor saved in my_awesome_mind_model/checkpoint-3/preprocessor_config.json
Feature extractor saved in my_awesome_mind_model/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 113
  Batch size = 32
Saving model checkpoint to my_awesome_mind_model/checkpoint-6
Configuration saved in my_awesome_mind_model/checkpoint-6/config.json
Model weights saved in my_awesome_mind_model/checkpoint-6/pytorch_model.bin
Feature extractor saved in my_awesome_mind_model/checkpoint-6/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 113
  Batch size = 32
Saving model checkpoint to my_awesome_mind_model/checkpoint-9
Configuration saved in my_awesome_mind_model/checkpoint-9/config

TrainOutput(global_step=30, training_loss=3.220238367716471, metrics={'train_runtime': 293.3464, 'train_samples_per_second': 15.34, 'train_steps_per_second': 0.102, 'total_flos': 4.0255969873536e+16, 'train_loss': 3.220238367716471, 'epoch': 9.8})

In [27]:
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]



In [29]:
dataset

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 563
})

In [None]:
from transformers import pipeline

classifier = pipeline("audio-classification", model="Zahra99/my_awesome_mind_model")
classifier(audio_file)

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Zahra99--my_awesome_mind_model/snapshots/672f9d8f7b4cc5a6b576791395d68e67790f530c/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "Zahra99/my_awesome_mind_model",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/378M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--Zahra99--my_awesome_mind_model/snapshots/672f9d8f7b4cc5a6b576791395d68e67790f530c/pytorch_model.bin
All model checkpoint weights were used when initializing Wav2Vec2ForSequenceClassification.

All the weights of Wav2Vec2ForSequenceClassification were initialized from the model checkpoint at Zahra99/my_awesome_mind_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Wav2Vec2ForSequenceClassification for predictions without further training.


Downloading (…)rocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--Zahra99--my_awesome_mind_model/snapshots/672f9d8f7b4cc5a6b576791395d68e67790f530c/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}



[{'score': 0.08433972299098969, 'label': 'cash_deposit'},
 {'score': 0.08129999786615372, 'label': 'app_error'},
 {'score': 0.07708557695150375, 'label': 'card_issues'},
 {'score': 0.07627283036708832, 'label': 'freeze'},
 {'score': 0.07423842698335648, 'label': 'balance'}]

You can also manually replicate the results of the pipeline if you’d like:

In [30]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("Zahra99/my_awesome_mind_model")
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

In [31]:
inputs

{'input_values': tensor([[ 0.0006,  0.0027,  0.0026,  ...,  0.0007,  0.0001, -0.0003]])}

In [33]:
import torch
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model")
with torch.no_grad():
    logits = model(**inputs).logits

In [34]:
logits

tensor([[-0.0564, -0.2195,  0.1142, -0.0431,  0.0569,  0.0474, -0.0533,  0.3383,
         -0.1348, -0.0647, -0.0299,  0.0953, -0.2141,  0.0689]])

In [35]:
predicted_class_ids = torch.argmax(logits).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

'cash_deposit'