# DataLoader

In [4]:
from datasets import load_dataset, Audio, Features, Value, ClassLabel, load_metric

ds = load_dataset('csv', data_files='/mnt/pci-0000:00:1f.2-ata-1-part1/ZLQ/AI/data/TAL-SER/talser_data.csv',delimiter=';',split='train')
ds = ds.cast_column("audio", Audio())
ds = ds.train_test_split(test_size=0.2, shuffle=True)
ds = ds.remove_columns(["sex", "id", "speaker",'PA'])
class_names = ["积极高唤醒", "积极低唤醒", "消极高唤醒", "消极低唤醒"]
emotion_features = Features({'emotion': ClassLabel(names=class_names)})
ds = ds.cast_column("emotion",  ClassLabel(names=class_names))
ds=ds.rename_column("P", "LABEL_0")
ds=ds.rename_column("A", "LABEL_1")

ds

Using custom data configuration default-b28597fbba130594
Reusing dataset csv (/home/zlq/.cache/huggingface/datasets/csv/default-b28597fbba130594/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['LABEL_0', 'LABEL_1', 'audio', 'emotion'],
        num_rows: 3350
    })
    test: Dataset({
        features: ['LABEL_0', 'LABEL_1', 'audio', 'emotion'],
        num_rows: 838
    })
})

In [3]:
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):

        super().__init__()
        print(config.num_labels)
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):
        
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits



# load model from hub
device = 'cuda'
model_name = 'facebook/wav2vec2-base'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model_ft = EmotionModel.from_pretrained(model_name)

# 法1：冻结卷积层
for param in model_ft.parameters():
	param.requires_grad = False
print("conv1.weights[0, 0, ...]".format(model_ft.conv1.weight[0, 0, ...]))

# 3/3 替换fc层(以适应新任务)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(in_features=num_ftrs, out_features=2)


# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)


# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss(ignore_index=-1).cuda(args.gpu)

optimizer = torch.optim.SGD(model.parameters(), args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)

"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model_ft(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y

2


Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing EmotionModel: ['project_q.weight', 'quantizer.codevectors', 'project_hid.weight', 'project_q.bias', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_hid.bias']
- This IS expected if you are initializing EmotionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EmotionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EmotionModel were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAI

AttributeError: 'EmotionModel' object has no attribute 'conv1'

In [None]:
process_func(signal, sampling_rate)
#  Arousa    dominance valence
# [[0.5460759 0.6062269 0.4043165]]

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
process_func(signal, sampling_rate, embeddings=True)
# Pooled hidden states of last transformer layer
# [[-0.00752167  0.0065819  -0.00746339 ...  0.00663631  0.00848747
#   0.00599209]]

In [27]:
# ## 预处理
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base",return_attention_mask=True)
feature_extractor

# preprocess_function
# - 调用要加载的音频列，并在必要时重新采样音频文件。
# - 检查音频文件的采样率是否与模型预先训练的音频数据的采样率相匹配。您可以在 Wav2Vec2模型卡上找到这些信息。![the Wav2Vec2 model card](https://huggingface.co/docs/transformers/tasks/(https://huggingface.co/facebook/wav2vec2-base).
# - 设置一个最大输入长度，这样较长的输入不会被截断

metric = load_metric("accuracy")
metric
max_duration = 10
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = processor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=int(feature_extractor.sampling_rate * max_duration), truncation=True
    )
    return inputs
    
# 使用 Datasets map函数对整个数据集应用预处理函数。您可以通过设置 batching = True 来加速 map 函数，以便一次处理数据集的多个元素。移除您不需要的列，并重命名 intent_ class 以标记，因为这正是模型所期望的:
encoded_dataset = ds.map(preprocess_function, remove_columns=["audio","emotion","PA"], batched=True)
encoded_dataset

## 训练
# 使用 AutoModelForAudio 分类加载 wave 2Vec2。指定标签的数量，并将标签数量和标签类之间的映射传递给模型:
# from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# num_labels = len(id2label)
# model = AutoModelForAudioClassification.from_pretrained(
#     "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
# )




ValueError: Column to remove ['P', 'A', 'PA'] not in the dataset. Current columns in the dataset: ['LABEL_0', 'LABEL_1', 'audio', 'emotion']

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)


# 在 TrainingArguments 中定义训练超参数。
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned-ch-emotion-edu",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=5,
    # prediction_loss_only =False,
    warmup_ratio=0.1,
    max_grad_norm = 1,# 梯度裁剪功能，控制梯度的最大值，避免过大的梯度给权重带来过大的变化从而使得模型变得不稳定。
    lr_scheduler_type = 'linear',
    logging_strategy = 'steps',
    logging_steps=10,
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    hub_token = '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',
    push_to_hub=True,
)

# 将训练参数与模型、数据集和特征提取器一起传递给 Trainer。
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()