In [1]:
# Restart the kernel before running the cells below, so that no state from a
# previous session leaks into this run.

# When using JupyterLab, uncomment the two lines below instead:
# import os
# os._exit(00)

# When using the classic Jupyter Notebook: the HTML object must stay the last
# expression of the cell so that it is displayed; the embedded script asks the
# notebook front end to restart the kernel.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# transformers finetune
- 预训练模型：wav2vec 2.0（`facebook/wav2vec2-base`）
- 数据集 TAL-SER


## 加载数据集

In [2]:
import os

from datasets import load_dataset, Audio, Features, Value, ClassLabel, load_metric

# Location of the TAL-SER index CSV. Override with the TALSER_CSV environment
# variable instead of editing the notebook when the data lives elsewhere.
TALSER_CSV = os.environ.get(
    'TALSER_CSV',
    '/mnt/pci-0000:00:1f.2-ata-1-part1/ZLQ/AI/data/TAL-SER/talser_data.csv',
)
# Sampling rate the audio is decoded at; the previews later in the notebook
# report 16000 Hz, and the wav2vec2 feature extractor is configured for it too.
TARGET_SAMPLING_RATE = 16_000
# Fixed seed so the train/test split (and every number computed from it) is
# the same after a kernel restart.
SPLIT_SEED = 42

ds = load_dataset('csv', data_files=TALSER_CSV, delimiter=';', split='train')
# Decode the "audio" column with the Audio feature. Passing sampling_rate makes
# the library resample any clip that is not already at the target rate, instead
# of silently handing the model audio at a different rate.
ds = ds.cast_column("audio", Audio(sampling_rate=TARGET_SAMPLING_RATE))
ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
# Drop metadata columns that are not used anywhere below.
ds = ds.remove_columns(["sex", "id", "speaker"])
ds

2022-08-03 09:49:04.179098: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Using custom data configuration default-b28597fbba130594
Reusing dataset csv (/home/zlq/.cache/huggingface/datasets/csv/default-b28597fbba130594/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


DatasetDict({
    train: Dataset({
        features: ['P', 'A', 'audio', 'emotion', 'PA'],
        num_rows: 3350
    })
    test: Dataset({
        features: ['P', 'A', 'audio', 'emotion', 'PA'],
        num_rows: 838
    })
})

In [3]:
# Accuracy metric object; its compute() takes `predictions` and `references`.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases -- confirm the pinned version still provides it.
metric = load_metric("accuracy")
metric

Metric(name: "accuracy", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = datasets.load_metric("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
        {'accuracy': 0.5}

   

In [4]:
# The four target classes, in label-id order:
# positive/high arousal, positive/low arousal, negative/high arousal,
# negative/low arousal.
class_names = ["积极高唤醒", "积极低唤醒", "消极高唤醒", "消极低唤醒"]

# One ClassLabel definition shared by the Features mapping and the column cast.
emotion_label = ClassLabel(names=class_names)
emotion_features = Features({'emotion': emotion_label})

# Turn the "emotion" column into a ClassLabel column in both splits.
ds = ds.cast_column("emotion", emotion_label)
ds

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['P', 'A', 'audio', 'emotion', 'PA'],
        num_rows: 3350
    })
    test: Dataset({
        features: ['P', 'A', 'audio', 'emotion', 'PA'],
        num_rows: 838
    })
})

In [5]:
# Inspect the ClassLabel feature of the "emotion" column after the cast.
# NOTE: `labels` is rebound in the next cell to the plain list of class names.
labels = ds["train"].features["emotion"]
labels

ClassLabel(num_classes=4, names=['积极高唤醒', '积极低唤醒', '消极高唤醒', '消极低唤醒'], id=None)

In [6]:
# Class names in label-id order, taken from the ClassLabel feature.
labels = ds["train"].features["emotion"].names

# Both mappings use the *string* form of the integer id:
#   label2id: class name -> "0", "1", ...
#   id2label: "0", "1", ... -> class name
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Spot-check one entry.
id2label[str(2)]

'消极高唤醒'

### 随机查看五条音频

In [7]:
import random
# Aliased on import: a bare `Audio` here would rebind the `datasets.Audio`
# feature class that the data-loading cell imports under the same name.
from IPython.display import Audio as AudioPlayer, display

# Number of randomly chosen training clips to listen to.
N_PREVIEWS = 5

train_split = ds["train"]
for _ in range(N_PREVIEWS):
    # randrange(n) draws uniformly from 0..n-1, same range as randint(0, n-1).
    rand_idx = random.randrange(len(train_split))
    example = train_split[rand_idx]
    audio = example["audio"]

    # id2label is keyed by the string form of the integer class id.
    print(f'Label: {id2label[str(example["emotion"])]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    display(AudioPlayer(audio["array"], rate=audio["sampling_rate"]))
    print()

Label: 积极高唤醒
Shape: (160000,), sampling rate: 16000



Label: 消极高唤醒
Shape: (160000,), sampling rate: 16000



Label: 消极低唤醒
Shape: (160000,), sampling rate: 16000



Label: 积极高唤醒
Shape: (160000,), sampling rate: 16000



Label: 消极低唤醒
Shape: (160000,), sampling rate: 16000





## 预处理

In [8]:
from transformers import AutoFeatureExtractor

# Load the feature extractor that ships with the pretrained checkpoint and ask
# it to return attention masks alongside the input values.
checkpoint_name = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    checkpoint_name,
    return_attention_mask=True,
)
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

preprocess_function
- 调用要加载的音频列，并在必要时重新采样音频文件。
- 检查音频文件的采样率是否与模型预训练所用音频数据的采样率一致。您可以在 [Wav2Vec2 模型卡](https://huggingface.co/facebook/wav2vec2-base) 上找到这些信息。
   
- 设置一个最大输入长度（`max_duration` 秒）；由于使用了 `truncation=True`，超过该长度的输入会被截断。

In [9]:
# Longest clip length, in seconds, kept by the feature extractor; it is read
# each time preprocess_function is called.
max_duration = 10


def preprocess_function(examples):
    """Run the feature extractor over a batch of audio examples.

    Args:
        examples: Batch mapping whose "audio" entry is a sequence of items,
            each exposing its waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch, called with its
        own sampling rate, ``truncation=True`` and a ``max_length`` of
        ``max_duration`` seconds expressed in samples.
    """
    sample_rate = feature_extractor.sampling_rate
    max_samples = int(sample_rate * max_duration)

    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=sample_rate,
        max_length=max_samples,
        truncation=True,
    )

使用 Datasets 的 `map` 函数对整个数据集应用预处理函数。设置 `batched=True` 可以一次处理数据集中的多个样本，从而加速 `map`。移除不需要的列，并将 `emotion` 列重命名为 `label`，因为这正是模型所期望的列名:

In [10]:
# Apply the preprocessing to both splits in batches, dropping the raw audio and
# the unused annotation columns, then rename "emotion" to "label".
columns_to_drop = ["audio", "P", "A", "PA"]
encoded_dataset = ds.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
).rename_column("emotion", "label")
encoded_dataset



  0%|          | 0/4 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 3350
    })
    test: Dataset({
        features: ['label', 'input_values', 'attention_mask'],
        num_rows: 838
    })
})

## 训练

In [11]:
# Load Wav2Vec2 through AutoModelForAudioClassification, passing the number of
# classes together with the label <-> id mappings built above.
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'projector.weight', 'projecto

In [12]:
import numpy as np

# def compute_metrics(eval_pred):
#     """Computes accuracy on a batch of predictions"""
#     predictions = np.argmax(eval_pred.predictions, axis=1)
#     return metric.compute(predictions=predictions, references=eval_pred.label_ids)

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        p: Pair-like object that unpacks into ``(predictions, label_ids)``.
            The predictions carry one score per class on the last axis; the
            label ids are the integer reference classes.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    # Imported locally: no other cell of this notebook imports scikit-learn, so
    # without this line the function fails with a NameError on first use.
    from sklearn.metrics import precision_recall_fscore_support

    scores, labels = p
    preds = np.argmax(scores, axis=-1)
    # asarray lets plain lists/tuples through as well as numpy arrays.
    labels = np.asarray(labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels.flatten(), preds.flatten(), average='weighted', zero_division=0
    )
    return {
        # Use the unpacked labels rather than p.label_ids so that a plain
        # (predictions, labels) tuple works too.
        'accuracy': (preds == labels).mean(),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

import os

# Training hyper-parameters.
# SECURITY: the Hub token is read from the HF_TOKEN environment variable and
# must never be written into the notebook. The token that used to be hard-coded
# here has been exposed and should be revoked on the Hugging Face Hub.
# When HF_TOKEN is unset, None is passed and the library's default applies
# (NOTE(review): expected to be the cached `huggingface-cli login` token --
# confirm for the installed transformers version).
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-ch-emotion-edu",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=50,
    # Options tried earlier and currently disabled:
    # prediction_loss_only=False,
    # warmup_ratio=0.1,
    # max_grad_norm=1,  # gradient clipping: caps the gradient norm so that very
    #                   # large gradients cannot destabilise the weights
    # lr_scheduler_type='linear',
    # logging_strategy='steps',
    # logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    hub_token=os.environ.get("HF_TOKEN"),
    push_to_hub=True,
)

# Hand the training arguments to the Trainer together with the model, the two
# dataset splits, the feature extractor and the metric function, then train.
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": encoded_dataset["train"],
    "eval_dataset": encoded_dataset["test"],
    "tokenizer": feature_extractor,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_config)
trainer.train()

/mnt/pci-0000:00:1f.2-ata-1-part1/ZLQ/AI/SER-transformers/./wav2vec2-finetuned-ch-emotion-edu is already a clone of https://huggingface.co/cotcode/wav2vec2-finetuned-ch-emotion-edu. Make sure you pull the latest changes with `repo.git_pull()`.
***** Running training *****
  Num examples = 3350
  Num Epochs = 50
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5250


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run an evaluation pass with the Trainer and display the returned metrics.
trainer.evaluate()

In [None]:
import numpy as np

# Reuse the already preprocessed test split. Unlike ds["test"], it carries the
# "label" column (renamed from "emotion"), so the references can be read from
# it directly and the audio does not have to be preprocessed a second time.
dataset_test_encoded = encoded_dataset["test"]
# Use the model to get predictions
test_predictions = trainer.predict(dataset_test_encoded)
# For each example, pick the class with the highest score
test_predictions_argmax = np.argmax(test_predictions.predictions, axis=1)
# Retrieve reference labels from the same split
test_references = np.array(dataset_test_encoded["label"])
# Compute accuracy
metric.compute(predictions=test_predictions_argmax, references=test_references)

In [None]:
# Upload the trained model to the Hugging Face Hub (the training arguments
# were created with push_to_hub=True).
trainer.push_to_hub()