## Cодержание:
* [Импорт библиотек](#first)
* [Загрузка и изучение данных](#second)
* [Базовый анализ данных](#third)
* [Предобработка данных](#fourth)
* [Обучение модели](#fifth)
* [Тестирование модели](#sixth)
* [Выводы](#seventh)

## Импорт библиотек <a class="anchor" id="first"></a>

In [277]:
# pip install noisereduce
# pip install librosa

In [9]:
import tarfile

import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sys

import librosa
import librosa.display

from IPython.display import Audio
import noisereduce as nr

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder


import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)


RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Функция обучения

In [None]:
def train(model, optimizer, criterion, n_epochs, train_loader, test_loader):

  loss_train = []
  accuracy_train = []

  for epoch in range(n_epochs):
    model.train()
    for input, target in tqdm(train_loader, desc=f"Training epoch {epoch + 1}/{n_epochs}"):
        input, target = input.to(device), target.to(device)

        output = model(inputs)

        loss = criterion(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
      for input, target in tqdm(test_loader, desc=f"Testing epoch {epoch + 1}/{n_epochs}"):
        input, target = input.to(device), target.to(device)
        output = model(inputs)
        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
    accuracy_train.append(test_accuracy)
    print('Epoch [{}/{}], Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(epoch + 1, n_epochs, loss.item(), test_accuracy * 100))
    loss_train.append(loss.item())

## Загрузка и изучение данных <a class="anchor" id="second"></a>

In [298]:
# !wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/dusha/crowd.tar

Read files

In [299]:
def load_data(path, columns_to_drop=['hash_id', 'source_id']):
    df = pd.read_json(path, lines=True).drop(columns=columns_to_drop)
    df['audio_path'] = df['audio_path'].apply(lambda x: x.split('/')[1])
    df = df[df['annotator_emo'] != 'other']
    #df = df[df['duration'] <= 5.0]
    return df

In [300]:
tar_file_path = 'crowd.tar'

with tarfile.open(tar_file_path, 'r') as tar:
    tar.extractall()

print(f'Tar {tar_file_path} extracted')

Tar crowd.tar extracted


In [301]:
tar_file_path = 'podcast.tar'

with tarfile.open(tar_file_path, 'r') as tar:
    tar.extractall()

print(f'Tar {tar_file_path} extracted')

Tar podcast.tar extracted


In [302]:
crowd_train = load_data('crowd_train/raw_crowd_train.jsonl')
crowd_test = load_data('crowd_test/raw_crowd_test.jsonl')
podcast_train = load_data('podcast_train/raw_podcast_train.jsonl')
podcast_test = load_data('podcast_test/raw_podcast_test.jsonl')

## Подготовка данных для обучения модели

Предобработка делаем через torchaudio так как она быстрее и работает на gpu

In [337]:
class DushaDataset(Dataset):

    def __init__(self, df, audio_dir, transformation,
                target_sample_rate, num_samples, device):

        self.device = device

        self.df = df
        self.audio_dir = audio_dir
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples



    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)

        signal, sr = torchaudio.load(self.audio_dir + audio_sample_path)
        signal = signal.to(self.device)
        # signal (num_channels, samples) -> (2, 16000) -> (1, 16000)
        signal = self._mix_down_if_necessary(signal) # if have diffrent chanels
        signal = self._resample_if_necessary(signal,sr) # if diffrent freq
        signal = self._cut_if_neccessary(signal)
        signal = self._right_pad_if_neccessary(signal)
        signal = self.transformation(signal)
        return signal , label

    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim = 0, keepdim = True)
        return signal

    def _cut_if_neccessary(self, signal):
        # signal -> Tensor -> (1, num_samples)
        if signal.shape[1] > self.num_samples:
            signal = signal[:,:self.num_samples]
        return signal

    def _right_pad_if_neccessary(self, signal):
        lenght_signal = signal.shape[1]
        if lenght_signal < self.num_samples:
            num_miising_samples = self.num_samples = lenght_signal
            last_dim_padding = (0, num_miising_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal


    def _get_audio_sample_path(self, index):
        path = self.df.iloc[index,0]
        return path

    def _get_audio_sample_label(self, index):
        label = self.df.iloc[index,2]
        return label

Распределние фурье, мелспектограмма те частоты которые понимает человек, потому что фурье может быть на частотах которые человек не понимает

In [338]:
NUM_SAMPLES = 80000 # 5 sec (5 * sample_rate)

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate = SAMPLE_RATE,
    n_fft = 1024,
    hop_length = 512,
    n_mels = 64)
audio_dir_train = "/Users/kirillanpilov/NLP_FU/Lab6/crowd_train/wavs/"
audio_dir_test = "/Users/kirillanpilov/NLP_FU/Lab6/crowd_test/wavs/"
dusha_dataset_train = DushaDataset(crowd_train, audio_dir_train, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)
dusha_dataset_test = DushaDataset(crowd_test, audio_dir_test, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)

In [339]:
signal, label = dusha_dataset_train[0]

In [340]:
signal.shape # (num_channels, n_mels, time_frames)

torch.Size([1, 64, 154])

In [341]:
label

0

Whisper

In [5]:
!pip install accelerate



In [19]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

def transcribe_uploaded_audio(change):
    # Получение содержимого загруженного файла
    file_contents = file_upload.data[-1]['content']
    audio_data, sample_rate = torchaudio.load(io.BytesIO(file_contents), normalize=True)

    # Воспроизведение аудио
    display(Audio(audio_data.numpy(), rate=sample_rate))

    # Распознавание речи
    result = pipe(audio_data.squeeze().numpy())
    transcription_text = result["text"]

    # Вывод результатов
    print("Transcription result:", transcription_text)

# Создание виджета для загрузки файла
file_upload = widgets.FileUpload(accept='.wav', description="Upload Audio File")
file_upload.observe(transcribe_uploaded_audio, names='data')

# Отображение виджета
display(file_upload)


FileUpload(value=(), accept='.wav', description='Upload Audio File')

## 5. Примените ruBERT для анализа тональности текста. Если не хватает вычислительных ресурсов для работы с датасетом Dusha, то можно использовать датасет, в котором объединены датасеты SAVEE и TESS)

Предобученная модель 

In [None]:
def load_model_hubert():
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        "facebook/hubert-large-ls960-ft")
    model = HubertForSequenceClassification.from_pretrained(
        "xbgoose/hubert-speech-emotion-recognition-russian-dusha-finetuned")
    return model, feature_extractor

In [None]:
model_hubert, processor_hubert = load_model_hubert()

In [None]:
num2emotion = {0: 'neutral', 1: 'angry', 2: 'positive', 3: 'sad', 4: 'other'}
    inputs = processor_hubert(
        audio,
        sampling_rate=processor_hubert.sampling_rate,
        return_tensors="pt",
        padding=True,
        max_length=16000 * 10,
        truncation=True
    )
    logits = model_hubert(inputs['input_values'][0]).logits
    predictions = torch.argmax(logits, dim=-1)
    predicted_emotion = num2emotion[predictions.numpy()[0]]
    return predicted_emotion