# Installs

In [None]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa

!pip install transformers[torch]

# Import

In [1]:
import re
import io
import os
from os import walk

from datetime import datetime

import numpy as np
import pandas as pd

import json

import math

# from sklearn.pipeline import make_pipeline, Pipeline
# from sklearn.model_selection import StratifiedKFold
# from sklearn.utils import shuffle

from itertools import islice

In [2]:
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

import torchaudio
from torchaudio.transforms import Resample

import librosa

from transformers import pipeline
from transformers import AutoConfig, Wav2Vec2Processor
from transformers.file_utils import ModelOutput

from dataclasses import dataclass
from typing import Optional, Tuple

# Data

In [26]:
# Load the per-segment table that drives the extraction loop further down; that
# loop reads the ID, Task, Path, Start and End columns of each row.
# NOTE(review): `folderAPath` is not defined in any cell visible here -- it has to
# be set before this cell for a fresh-kernel run to work.
data = pd.read_csv(folderAPath+'Attachment Audio 16hz Data/attachment_audio_v5.csv', header=0)

# Model

In [7]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Container returned by Wav2Vec2ForSpeechClassification.forward when return_dict is set.

    Every field defaults to None; forward fills them from the classifier head and
    from the wrapped Wav2Vec2 encoder's output.
    """

    # Loss value; forward leaves it as None when no labels are passed.
    loss: Optional[torch.FloatTensor] = None
    # Output of the classification head for the pooled encoder features.
    logits: torch.FloatTensor = None
    # Copied from the encoder output's `hidden_states` attribute.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Copied from the encoder output's `attentions` attribute.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head for pooled wav2vec features.

    Pipeline: dropout -> linear (hidden_size -> hidden_size) -> tanh -> dropout
    -> linear (hidden_size -> num_labels). The same dropout module is applied at
    both points, with probability `config.final_dropout`.
    """

    def __init__(self, config):
        """Create the layers from `config.hidden_size`, `config.final_dropout` and `config.num_labels`."""
        super().__init__()
        hidden_dim = config.hidden_size
        self.dense = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(hidden_dim, config.num_labels)

    def forward(self, features, **kwargs):
        """Map `features` to per-label logits; extra keyword arguments are accepted and ignored."""
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 encoder whose output is pooled over dim 1 and fed to a classification head.

    The pooling mode comes from `config.pooling_mode` and the number of output
    logits from `config.num_labels`.
    """

    def __init__(self, config):
        """Build the encoder and the head from `config`, then initialise the weights."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the encoder's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Reduce `hidden_states` along dim 1 with the requested pooling mode.

        `mode` is one of "mean", "sum" or "max"; anything else raises Exception.
        """
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        if mode == "sum":
            return torch.sum(hidden_states, dim=1)
        if mode == "max":
            # torch.max along a dim yields (values, indices); only the values are kept.
            return torch.max(hidden_states, dim=1).values
        raise Exception(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Encode the input, pool it, classify it and optionally compute a loss.

        When `labels` is given and `config.problem_type` is unset, the problem
        type is inferred from `num_labels` and the label dtype and stored back on
        the config. Returns a SpeechClassifierOutput when `return_dict` is truthy,
        otherwise a tuple starting with the loss (if computed) and the logits.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # The first encoder output is pooled over dim 1 before classification.
        pooled = self.merged_strategy(encoder_outputs[0], mode=self.pooling_mode)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once and remember it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                loss = MSELoss()(logits.view(-1, self.num_labels), labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return SpeechClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: logits followed by everything after the first two encoder outputs.
        tuple_output = (logits,) + encoder_outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output


# Extraction

## Model

In [8]:
# Load the config, processor and fine-tuned classifier from one local checkpoint directory.
# NOTE(review): `folderEPath` is not defined in any cell visible here -- it has to
# be set earlier for a fresh-kernel run to work.
model_name_or_path = folderEPath+'wav2vec2-xlsr-speech-emotion-recognition/r3-checkpoint-720'
# NOTE(review): `config` is not referenced by the visible cells below (they read `model.config`).
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# output_hidden_states=True is needed because get_logits_and_last_hidden_layer reads outputs.hidden_states.
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path, output_hidden_states=True)#, output_attentions=True)#.to(device)

Some weights of the model checkpoint at wav2vec2-xlsr-speech-emotion-recognition/r3-checkpoint-720 were not used when initializing Wav2Vec2ForSpeechClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at wav2vec2-xlsr-speech-emotion-recognition/r3-checkpoint-720 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.

In [9]:
# Run the classifier on one prepared audio input and return both the class logits
# and the last entry of the model's hidden states.
def get_logits_and_last_hidden_layer(input_values, attention_mask):
    """Forward `input_values` through the module-level `model`.

    Returns a `(logits, last_hidden_layer)` pair, where `last_hidden_layer` is
    `outputs.hidden_states[-1]`.
    """
    model_output = model(input_values, attention_mask=attention_mask)
    return model_output.logits, model_output.hidden_states[-1]

In [10]:
def logits2LabelsDF(logits):
    """Turn classifier logits into a one-row DataFrame of softmax scores.

    Softmax is taken over dim 1 and only the first row of the result is used.
    Each column is named after `model.config.id2label[i]` for class index `i`.

    The tensor is moved to the CPU before the NumPy conversion so the function
    also works when the logits live on another device.
    """
    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    # Build the frame in one step instead of inserting one column at a time.
    return pd.DataFrame(
        {model.config.id2label[i]: [score] for i, score in enumerate(scores)}
    )

In [11]:
def tensor2JSON(last_hidden_layer):
    """Serialise a tensor to a JSON string of nested lists.

    The tensor is detached from the autograd graph and moved to the CPU first,
    because the NumPy conversion only accepts CPU tensors.
    """
    # Tensor -> NumPy array -> nested Python lists -> JSON text.
    last_hidden_layer_list = last_hidden_layer.detach().cpu().numpy().tolist()
    return json.dumps(last_hidden_layer_list)

In [12]:
def tensor2Numpy(last_hidden_layer):
    """Return the tensor's contents as a NumPy array.

    The tensor is detached from the autograd graph and moved to the CPU first,
    because the NumPy conversion only accepts CPU tensors.
    """
    return last_hidden_layer.detach().cpu().numpy()

In [13]:
def poolHiddenStates(hidden_states, mode="mean"):
    """Reduce `hidden_states` along dim 1.

    `mode` selects the reduction: "mean", "sum" or "max". Any other value raises
    Exception.
    """
    if mode == "mean":
        return torch.mean(hidden_states, dim=1)
    if mode == "sum":
        return torch.sum(hidden_states, dim=1)
    if mode == "max":
        # torch.max along a dim yields (values, indices); only the values are kept.
        return torch.max(hidden_states, dim=1).values
    raise Exception(
        "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
 

In [14]:
def poolOutput(hiddenOutputs):
    """Concatenate a sequence of tensors along dim 1 and return the result."""
    return torch.cat(hiddenOutputs, dim=1)

In [15]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file and resample it to `sampling_rate`.

    Args:
        path: Location of the audio file, passed to `torchaudio.load`.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        The resampled audio as a NumPy array with size-1 dimensions squeezed out.
    """
    speech_array, source_rate = torchaudio.load(path)
    # Pass the target rate explicitly: previously only the source rate was given,
    # so `sampling_rate` was ignored and the transform's default target applied.
    resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=sampling_rate)
    return resampler(speech_array).squeeze().numpy()

## Extraction

In [45]:
# Use pre-set durations
#
# Walk the segment table row by row. For every segment long enough to process:
#   * run the classifier and append the emotion scores to a CSV,
#   * save the segment's last hidden layer (Unmerged) and its mean-pooled form (Pooled).
# Consecutive rows sharing the same (ID, Task) form one group; when a group ends,
# its hidden layers are concatenated along dim 1 and saved under Merged.

sampling_rate = 16000

threshold = 0.1 # seconds (i.e. 100 ms); shorter segments are skipped

sInd = 0
currID, currTask, segment = data.iloc[0].ID, data.iloc[0].Task, 0
subStoryHiddenOutputs = []

# The CSV logs are appended to directly under this directory.
os.makedirs('Data/Audio', exist_ok=True)

# NOTE(review): np.savez_compressed is documented to append '.npz' when the file
# name does not already end with it, so the '*.npy' names used below presumably
# land on disk as '*.npy.npz' -- confirm before changing any reader.


def _save_merged_hidden_states(task, recording_id, hidden_outputs):
    """Concatenate one group's hidden layers and save them; do nothing for an empty group."""
    if len(hidden_outputs) > 0:
        os.makedirs('Data/Audio/Merged/{}'.format(task), exist_ok=True)
        pooled_h_output = poolOutput(hidden_outputs)
        np.savez_compressed('Data/Audio/Merged/{}/{}.npy'.format(task, recording_id), tensor2Numpy(pooled_h_output))


for index, row in tqdm(islice(data.iterrows(),sInd,None), desc=" row", total=data[sInd:].shape[0], leave=True, position=0):
    childDF = row.copy().to_frame().T.reset_index(drop=True)
    auPath = folderAPath+childDF.Path.values[0]

    if (childDF.ID[0]==currID and childDF.Task[0]==currTask):
        segment += 1
    else:
        # A new (ID, Task) group starts: flush the previous group first.
        _save_merged_hidden_states(currTask, currID, subStoryHiddenOutputs)

        segment = 1
        subStoryHiddenOutputs = []
        currID, currTask = childDF.ID[0], childDF.Task[0]

    # If the length of the audio is less than what the model can process, skip and record the information.
    if (childDF.End[0] - childDF.Start[0]) < threshold:
        skippedDF = childDF.copy()
        skippedDF['skipped_duration'] = childDF.End[0] - childDF.Start[0]

        # Write the header only when the file is still empty.
        with open('Data/Audio/attachment_emotions_audio_skipped_segments_v1.csv', 'a') as f:
            skippedDF.to_csv(f, mode='a', index=False, header=f.tell()==0)

    else:
        speech, _ = librosa.load(auPath, offset=childDF.Start[0], duration=childDF.End[0] - childDF.Start[0], sr=sampling_rate, mono=True)
        features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

        input_values = features.input_values#.to(device)
        attention_mask = features.attention_mask#.to(device)

        with torch.no_grad():
            storyResults, h_output = get_logits_and_last_hidden_layer(input_values, attention_mask)
            storyResultsDF = logits2LabelsDF(storyResults)

        subStoryHiddenOutputs.append(h_output)

        segDF = pd.concat([childDF, storyResultsDF], axis=1)
        segDF['time'] = childDF.Start[0]
        segDF['segment'] = segment

        os.makedirs('Data/Audio/Unmerged/{}'.format(row.Task), exist_ok=True)
        np.savez_compressed('Data/Audio/Unmerged/{}/{}_{}.npy'.format(row.Task, row.ID, segment), tensor2Numpy(h_output))

        pooled_mean_output = poolHiddenStates(h_output) # the same way as the model that produced the labels
        os.makedirs('Data/Audio/Pooled/{}'.format(row.Task), exist_ok=True)
        np.savez_compressed('Data/Audio/Pooled/{}/{}_{}.npy'.format(row.Task, row.ID, segment), tensor2Numpy(pooled_mean_output))

        # Write the header only when the file is still empty.
        with open('Data/Audio/attachment_emotions_audio_v1.csv', 'a') as f:
            segDF.to_csv(f, mode='a', index=False, header=f.tell()==0)

# The loop only flushes a group when the next one begins, so the final group
# has to be written here or its merged output is lost.
_save_merged_hidden_states(currTask, currID, subStoryHiddenOutputs)


 row:   0%|          | 0/1169 [00:00<?, ?it/s]