In [47]:
%%capture 
!pip install torch torchaudio transformers

python(26979) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [15]:
import sys
import os
import pickle

# Put the parent of the current working directory on the import path so the
# `src` package imported on the next line can be found.
sys.path.append(f"{os.getcwd()}/../")
from src.dataload import get_annotated_data

# Root folder of the MSP-Conversation corpus. The MSP_PATH environment variable
# overrides the machine-specific default so the notebook can run elsewhere.
MSP_PATH = os.environ.get("MSP_PATH", '/Users/beltre.wilton/Downloads/SER-Datasets/MSP-Conversation-1.1')
# Pickle holding the annotation frames consumed by MSPDataset.
WADF_PATH = "../data/wadf.pkl"
df_annotations, df_reduced = get_annotated_data()

from typing import List
import pandas as pd
import numpy as np
from pathlib import Path
import math
import torch
import torchaudio
from torch.utils.data import Dataset
from IPython.display import display, Audio

# Annotation frames keyed by "<PC_Num>_<Part_Num>_<Emotion>". Loading is
# best-effort: on any failure the notebook continues with an empty dict, and the
# message says so explicitly so later KeyErrors are easy to trace back.
wadf = {}
try:
    with open(WADF_PATH, "rb") as pkl:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        wadf = pickle.load(pkl)
except Exception as ex:
    print(f"Could not load the wadf pickle ({type(ex).__name__}: {ex}); continuing with an empty dict.")


class MSPDataset(Dataset):
    """In-memory dataset pairing fixed-length audio chunks with annotation traces.

    For every conversation part of the requested split, the audio segment of the
    part is cut into ``chunk_size``-second pieces and the annotation frame stored
    in ``wadf`` under ``"<PC_Num>_<Part_Num>_<emotion>"`` is cut on the same time
    grid. Item ``i`` is ``(waveform_chunk, labels)`` where ``labels`` is a
    float32 tensor built from the ``Annotation`` values of that chunk.

    Args:
        msp_path: Corpus root; wav files are read from ``<msp_path>/Audio``.
        wadf: Mapping from part key to a DataFrame with ``Time`` and
            ``Annotation`` columns.
        chunk_size: Chunk length in seconds.
        df_reference: Frame with ``Type``, ``PC_Num``, ``Part_Num``,
            ``start_time`` and ``end_time`` columns.
        split: Value of ``df_reference.Type`` to keep.
        filter_emotion_by: Emotion suffix used to build the ``wadf`` keys.
        verbose: Print progress information while preparing the data.
        device: Stored on the instance; not used during preparation.

    Raises:
        ValueError: If no row of ``df_reference`` matches ``split``.
        KeyError: If ``wadf`` lacks a key required by the split.
    """

    def __init__(self, msp_path: str, wadf: dict, chunk_size: int, df_reference: pd.DataFrame, split: str,
                 filter_emotion_by: str = 'Arousal', verbose: bool = False, device: str = "mps", ):
        self.msp_path = msp_path
        self.wadf = wadf
        self.chunk_size = chunk_size
        self.df_reference = df_reference[df_reference.Type == split]
        self.split = split
        self.filter_emotion_by = filter_emotion_by
        self.device = device
        self.verbose = verbose
        self.SAMPLE_RATE = 16_000
        # Fraction of the average points-per-chunk a part must reach in its
        # sparsest chunk to be kept (see __prepare_datapoints).
        self.TEMPERATURE_DATAPOINT = .5
        self.__filter_df()
        self.input_features = self.__prepare_inputs()
        # The raw frames are only needed while building input_features.
        del self.wadf
        del self.df_reference

    def __filter_df(self):
        """Restrict ``self.wadf`` to the parts of the selected split and emotion."""
        keys = [f"{pc_num}_{part_num}_{self.filter_emotion_by}" for pc_num, part_num in
                zip(self.df_reference['PC_Num'], self.df_reference['Part_Num'])]
        missing = [key for key in keys if key not in self.wadf]
        if missing:
            # Same exception type as the bare dict lookup, but with context.
            raise KeyError(f"wadf has no annotations for {len(missing)} key(s) required by split "
                           f"{self.split!r}, e.g. {missing[:3]}")
        self.wadf = {key: self.wadf[key] for key in keys}

    def __get_wave_segment(self, waveform, key: str):
        """Slice the part identified by ``key`` out of a full 1-D conversation waveform.

        Returns:
            ``(segment, start_time, end_time)`` with the times in seconds as
            stored in ``df_reference``.
        """
        key = key.split('_')
        pc_num = int(key[0])
        part_num = int(key[1])
        ref = self.df_reference[(self.df_reference['PC_Num'] == pc_num) & (self.df_reference['Part_Num'] == part_num)]
        start_time = math.ceil(ref.start_time.iloc[0] * self.SAMPLE_RATE)  # ceil workaround
        end_time = math.ceil(ref.end_time.iloc[0] * self.SAMPLE_RATE)  # ceil workaround
        waveform = waveform[start_time:end_time]
        return waveform, ref.start_time.iloc[0], ref.end_time.iloc[0]

    def __check_for_resample(self, waveform, sample_rate: int, aud: str):
        """Bring a ``[channels, samples]`` waveform to ``SAMPLE_RATE`` and a single channel."""
        # Resample when the file is not already at the target rate.
        if sample_rate != self.SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=self.SAMPLE_RATE)
            if self.verbose:
                print(f'\t{aud} Resampleando!')
        # Down-mix multi-channel audio to mono by averaging the channels.
        if waveform.size(0) > 1:
            waveform = torch.mean(waveform, dim=0).unsqueeze(0)
            if self.verbose:
                print(f'\t{aud} Bajando a mono!')
        return waveform

    def __prepare_audio_inputs(self, chunked_data_points: dict):
        """Load the audio of every kept part and cut it into ``chunk_size``-second chunks.

        Returns:
            Dict mapping each key of ``chunked_data_points`` to its list of 1-D
            waveform chunks (the last chunk may be shorter).
        """
        audio_path = Path(f'{self.msp_path}/Audio')
        waves = {}
        if self.verbose:
            print(f'Inicio de lectura de audios wav y segmentacion cada {self.chunk_size} segundos...')
        # Several parts usually come from the same conversation file; keep the
        # most recently decoded file so it is not re-read and re-resampled.
        loaded_name, loaded_wave = None, None
        for idx, key in enumerate(chunked_data_points.keys()):
            aud = f"MSP-Conversation_{key.split('_')[0].rjust(4, '0')}.wav"
            if aud != loaded_name:
                aud_path = audio_path / aud
                waveform, sample_rate = torchaudio.load(str(aud_path), normalize=True)
                waveform = self.__check_for_resample(waveform, sample_rate, aud)
                loaded_wave = waveform.squeeze()  # [c, s] => [s]
                loaded_name = aud
            waveform, start_time, end_time = self.__get_wave_segment(loaded_wave, key)
            upto = math.ceil((waveform.size(0) / self.SAMPLE_RATE))
            chunked_parts = []
            for i in range(0, upto, self.chunk_size):
                start = i * self.SAMPLE_RATE
                end = (i + self.chunk_size) * self.SAMPLE_RATE
                chunked_parts.append(waveform[start:end])
            waves[key] = chunked_parts

            if self.verbose:
                print(
                    f"  ({idx + 1}) {aud}; {key}[{round(start_time, 2)}:{round(end_time, 2)}] {len(chunked_parts)} segmentos")

        return waves

    def __chunk_datapoints(self, key: str, ):
        """Cut the annotation trace of ``key`` into ``chunk_size``-second windows.

        Windows are half-open, ``[start, start + chunk_size)``, so an annotation
        lying exactly on a boundary is assigned to exactly one chunk. Points at
        or after the end of the last full-grid window form an extra chunk when
        there is more than one of them.

        Returns:
            ``([key, min, mean, max], data_points)`` where the statistics are
            over the number of annotations per chunk (all ``0.0`` when there is
            no chunk) and ``data_points`` is the list of per-chunk annotation
            arrays with NaNs replaced by zero.
        """
        wak = self.wadf[key]
        if len(wak) == 0:
            return [key, 0.0, 0.0, 0.0], []

        counts = []
        data_points = []
        last_time = wak.iloc[-1]['Time']
        tail_start = 0  # stays 0 when the trace is shorter than one second
        for start in range(0, int(last_time), self.chunk_size):
            window = wak[(wak['Time'] >= start) & (wak['Time'] < start + self.chunk_size)]
            counts.append(len(window))
            data_points.append(np.nan_to_num(window['Annotation'].to_numpy()))
            tail_start = start + self.chunk_size

        tail = wak[wak['Time'] >= tail_start]
        if len(tail) > 1:
            counts.append(len(tail))
            data_points.append(np.nan_to_num(tail['Annotation'].to_numpy()))

        if not counts:
            return [key, 0.0, 0.0, 0.0], data_points
        counts = np.asarray(counts, dtype=float)
        return [key, counts.min(), counts.mean(), counts.max()], data_points

    # Cut every part into `chunk_size`-second annotation windows.
    def __prepare_datapoints(self):
        """Chunk all annotation traces and drop parts that are too sparsely annotated.

        A part is kept when its sparsest chunk holds at least
        ``int(mean_points_per_chunk * TEMPERATURE_DATAPOINT)`` annotations.
        """
        data = []
        chunked_data_points = {}
        for key in self.wadf.keys():
            a, data_points = self.__chunk_datapoints(key)
            data.append(a)
            chunked_data_points[key] = data_points
        wa_data_points = pd.DataFrame(data, columns=['Key', 'Min', 'Mean', 'Max'])

        mu = wa_data_points.Mean.mean() * self.TEMPERATURE_DATAPOINT
        wa_data_points = wa_data_points[(wa_data_points.Min >= int(mu))]
        wa_data_points = wa_data_points[wa_data_points['Key'].str.contains(self.filter_emotion_by)]
        if self.verbose:
            print(f'Cantidad de partes antes de filtro: {len(chunked_data_points)}')
        chunked_data_points = {key: chunked_data_points[key] for key in wa_data_points['Key']}
        if self.verbose:
            print(f'Cantidad de partes despues de filtro: {len(chunked_data_points)}\n')

        del data
        del wa_data_points

        if self.verbose:
            print('Segmentos de Datapoints completado.\n')

        return chunked_data_points

    def __len__(self):
        """Number of (chunk, labels) pairs."""
        return len(self.input_features['inputs'])

    def __getitem__(self, idx):
        """Return ``(waveform_chunk, labels)`` with ``labels`` as a float32 tensor."""
        x = self.input_features['inputs'][idx]
        label = self.input_features['labels'][idx]
        return x, torch.tensor(label, dtype=torch.float32)  # x is also float32 by default

    def __prepare_inputs(self, ):
        """Build the flat ``{'inputs': [...], 'labels': [...]}`` store behind the dataset."""
        if len(self.df_reference) == 0:
            raise ValueError(f'Parece que el split: {self.split} es incorrecto, intenta con Train, Test o Development')

        chunked_data_points = self.__prepare_datapoints()
        waves = self.__prepare_audio_inputs(chunked_data_points)

        if self.verbose:
            # Sanity report: audio and annotations should yield the same number of chunks per part.
            ok = True
            for w in waves.keys():
                if len(waves[w]) != len(chunked_data_points[w]):
                    print(f'\n*** FAIL: Inputs y Labels waves:{len(waves[w])} datapoints:{len(chunked_data_points[w])}')
                    ok = False
            if ok:
                print('\nInputs y Labels se corresponden en catidad de segmentos.\n')

        inputs = []
        labels = []
        for key in chunked_data_points.keys():
            # zip() silently truncates to the shorter list when the counts differ.
            for wv, dp in zip(waves[key], chunked_data_points[key]):
                inputs.append(wv)
                labels.append(dp)
        input_features = {'inputs': inputs, 'labels': labels}
        if self.verbose:
            print(
                f"input_features listos, {len(input_features['inputs'])} inputs-tensores y {len(input_features['labels'])} labels-np.arrays\n")

        return input_features


# Materialise the Development split as 15-second chunks, logging progress.
msp = MSPDataset(
    msp_path=MSP_PATH,
    wadf=wadf,
    chunk_size=15,
    df_reference=df_reduced,
    split="Development",
    verbose=True,
    device="mps",
)


Cantidad de partes antes de filtro: 104
Cantidad de partes despues de filtro: 51

Segmentos de Datapoints completado.

Inicio de lectura de audios wav y segmentacion cada 15 segundos...
  (1) MSP-Conversation_0002.wav; 2_1_Arousal[0.0:284.02] 19 segmentos
  (2) MSP-Conversation_0002.wav; 2_2_Arousal[284.02:581.34] 20 segmentos
  (3) MSP-Conversation_0002.wav; 2_3_Arousal[581.34:865.3] 19 segmentos
  (4) MSP-Conversation_0002.wav; 2_4_Arousal[865.3:1130.48] 18 segmentos
  (5) MSP-Conversation_0157.wav; 157_1_Arousal[0.0:310.11] 21 segmentos
  (6) MSP-Conversation_0157.wav; 157_3_Arousal[616.17:985.71] 25 segmentos
  (7) MSP-Conversation_0235.wav; 235_1_Arousal[0.0:268.53] 18 segmentos
  (8) MSP-Conversation_0235.wav; 235_3_Arousal[538.74:808.49] 18 segmentos
  (9) MSP-Conversation_0251.wav; 251_3_Arousal[561.47:889.01] 22 segmentos
  (10) MSP-Conversation_0260.wav; 260_2_Arousal[300.32:599.24] 20 segmentos
  (11) MSP-Conversation_0260.wav; 260_3_Arousal[599.24:851.34] 17 segmentos
  (12

In [2]:
# Play back chunk 22 of the prepared inputs at the dataset's sample rate.
Audio(msp.input_features['inputs'][22].numpy(), rate=msp.SAMPLE_RATE)

In [45]:
# Inspect the first 20 waveform chunks held by the dataset.
msp.input_features['inputs'][:20]

[tensor([-0.0027, -0.0035, -0.0034,  ...,  0.0005, -0.0012,  0.0028]),
 tensor([-0.0033,  0.0036, -0.0038,  ...,  0.0188, -0.0479,  0.0308]),
 tensor([-0.0075, -0.0063,  0.0172,  ..., -0.0330, -0.0310, -0.0279]),
 tensor([-0.0320, -0.0357, -0.0385,  ...,  0.0007, -0.0092, -0.0106]),
 tensor([-0.0104, -0.0070,  0.0093,  ..., -0.1473, -0.1556, -0.2073]),
 tensor([-0.2551, -0.2454, -0.1812,  ..., -0.1209, -0.1106, -0.1007]),
 tensor([-0.0995, -0.0723, -0.0218,  ...,  0.0021,  0.0000, -0.0046]),
 tensor([-7.1716e-03, -2.1973e-03,  1.3428e-03,  ...,  9.1553e-05,
          5.4932e-04,  1.1292e-03]),
 tensor([ 0.0012,  0.0008,  0.0010,  ..., -0.0020, -0.0054, -0.0247]),
 tensor([-0.0310, -0.0348, -0.0479,  ...,  0.1831,  0.2089,  0.1917]),
 tensor([ 0.2054,  0.2518,  0.2179,  ..., -0.0175, -0.0174, -0.0175]),
 tensor([-0.0122, -0.0063,  0.0033,  ..., -0.0013,  0.0041,  0.0031]),
 tensor([-0.0020, -0.0021,  0.0006,  ...,  0.0545,  0.0419,  0.0304]),
 tensor([0.0339, 0.0292, 0.0189,  ..., 0.051

In [23]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2ForPreTraining

%env HF_HOME=/Users/beltre.wilton/apps/mspconv_ftlab/cache
%env HF_DATASETS_CACHE=/Users/beltre.wilton/apps/mspconv_ftlab/cache

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

env: HF_HOME=/Users/beltre.wilton/apps/mspconv_ftlab/cache
env: HF_DATASETS_CACHE=/Users/beltre.wilton/apps/mspconv_ftlab/cache


In [91]:
# Run chunk 22 through the processor, then pass the result through the
# processor's feature extractor a second time and display it.
# NOTE(review): the displayed tensor has three dimensions while the raw chunk
# with a batch axis has two — presumably the second pass adds an extra axis;
# confirm whether calling the feature extractor twice is intended.
someinputs = processor(msp.input_features['inputs'][22], return_tensors="pt", sampling_rate=msp.SAMPLE_RATE).input_values
with torch.no_grad():
    x = processor.feature_extractor(someinputs, return_tensors="pt", sampling_rate=msp.SAMPLE_RATE).input_values
x

tensor([[[-1.0708, -1.1664, -1.2289,  ..., -0.3491, -0.1030,  0.0674]]])

In [94]:
# The same raw chunk with a leading batch axis, for comparison with the processed values.
msp.input_features['inputs'][22].unsqueeze(dim=0)

tensor([[-0.0936, -0.1020, -0.1075,  ..., -0.0305, -0.0090,  0.0059]])

In [92]:
# Load the checkpoint through the pre-training class (hidden states enabled)
# and keep only its inner `wav2vec2` module.
wav2vec2 = Wav2Vec2ForPreTraining.from_pretrained(
                "facebook/wav2vec2-base-960h", output_hidden_states=True).wav2vec2


Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['project_hid.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
# Apply only the model's feature extractor to one batched chunk, without
# tracking gradients, then show the shape after swapping the last two axes.
with torch.no_grad():
    x = wav2vec2.feature_extractor(msp.input_features['inputs'][22].unsqueeze(dim=0))

x.transpose(1, 2).shape

torch.Size([1, 749, 512])

In [24]:
# First label array converted to a float32 tensor (same conversion as __getitem__).
torch.tensor(msp.input_features['labels'][0], dtype=torch.float32)

tensor([-4.0759e-03, -4.0759e-03, -4.0759e-03,  ...,  3.1933e+01,
         3.1933e+01,  3.1933e+01])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from train.wav2vec2_wrapper import Wav2vec2Wrapper
from transformers import Wav2Vec2ForPreTraining
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices


class RNNHead(nn.Module):
    """Pooling + linear head applied to wav2vec 2.0 encoder outputs.

    The head averages the frame representations of each sample over its valid
    frames and maps the pooled vector to ``n_classes`` outputs.

    Args:
        n_classes: Output size; 1 for a regression target.
    """

    def __init__(self, n_classes: int = 1):
        super().__init__()
        feature_dim = 768  # hidden size of the base model; the large model uses 1024
        # From the reference paper: "In this paper, we explore methods for
        # fine-tuning wav2vec 2.0 on SER. We show that by adding a simple neural
        # network on top of wav2vec 2.0, vanilla fine-tuning (V-FT) outperforms
        # state-of-the-art (SOTA)".
        self.rnn_head = nn.LSTM(feature_dim, 256, 1, bidirectional=True)  # TODO: never used in forward
        # TODO: try a GRU instead -> nn.GRU(feature_dim, 256, 1, bidirectional=True)
        self.linear_head = nn.Sequential(
            nn.ReLU(),
            nn.Linear(feature_dim, n_classes)  # n_classes = 1 for a regression task
        )

    def trainable_params(self):
        """Parameters owned by the head (LSTM and linear layers)."""
        return list(self.rnn_head.parameters()) + list(self.linear_head.parameters())

    def forward(self, x, length=None):
        """Pool encoder outputs over time and project them.

        Args:
            x: Encoder representations of shape ``[batch, frames, feature_dim]``.
            length: Optional integer tensor ``[batch]`` with the number of valid
                frames per sample. ``None`` treats every frame as valid.

        Returns:
            ``(logits, xlogits)``: ``logits`` is ``[batch, n_classes]``;
            ``xlogits`` is the ``[batch, frames]`` auxiliary tensor kept from the
            original implementation.
        """
        reps = x
        if length is None:
            length = torch.full((reps.size(0),), reps.size(1), dtype=torch.long, device=reps.device)
        else:
            length = torch.as_tensor(length, device=reps.device)
        # The original pooled over `frames - 1` positions; that is kept, with a
        # floor of 1 so a single-frame sample does not divide by zero.
        # NOTE(review): confirm whether dropping the last valid frame is intended.
        last_feat_pos = (length - 1).clamp(min=1)

        logits = reps.permute(1, 0, 2)  # L, B, C
        masks = torch.arange(logits.size(0), device=logits.device).expand(last_feat_pos.size(0), -1) < last_feat_pos.unsqueeze(1)
        masks = masks.float()
        # Masked mean over time: padded frames are multiplied by zero, so
        # [L, B, C] collapses to [B, C].
        logits = (logits * masks.T.unsqueeze(-1)).sum(0) / last_feat_pos.unsqueeze(1)
        # NOTE(review): computed from the already pooled tensor through
        # broadcasting, exactly as in the original; verify it is still needed.
        xlogits = ((logits * masks.T.unsqueeze(-1)).permute(1, 0, 2).sum(2) / last_feat_pos.unsqueeze(1))

        logits = self.linear_head(logits)
        return logits, xlogits


class Wav2vec2Wrapper(nn.Module):
    """wav2vec 2.0 backbone with SpecAugment-style masking and an ``RNNHead`` on top.

    The convolutional feature extractor, the feature projection and the masking
    run without gradients (partial fine-tuning); only the transformer encoder
    and the head receive gradients.

    Args:
        pretrain: Selects the masking hyper-parameters; when True, masking is
            applied regardless of train/eval mode.
        wav2vec2: The inner wav2vec 2.0 model (keyword-only).
        chunk_size: Chunk length in seconds used to build the inputs
            (keyword-only; stored for reference).
    """

    def __init__(self, pretrain=True, *, wav2vec2, chunk_size):
        super().__init__()
        self.wav2vec2 = wav2vec2
        self.chunk_size = chunk_size
        self.rnn_head = RNNHead()
        # TODO: Disable gradient checkpointing for ddp
        self.wav2vec2.encoder.config.gradient_checkpointing = False
        self.pretrain = pretrain
        if pretrain:
            self.mask_time_length = 10  # [Wilton] was 15
            self.mask_time_prob = 0.06  # probability that a time step starts a masked span
            self.observe_time_prob = 0.0  # share of selected tokens that are left unmasked
            self.mask_feature_prob = 0
            self.mask_feature_length = 64
        else:
            # SpecAug
            self.mask_time_length = 10  # [Wilton] was 15
            self.mask_time_prob = 0.08
            self.observe_time_prob = 0.0
            self.mask_feature_length = 64
            self.mask_feature_prob = 0.05

    def prepare_mask(self, length, shape, dtype, device):
        """Build a boolean attention mask that is True for the first ``length`` frames of each row."""
        # Modified from huggingface
        mask = torch.zeros(
            shape, dtype=dtype, device=device
        )
        # these two operations make sure that all values
        # before the output lengths indices are attended to
        mask[
            (torch.arange(mask.shape[0], device=device), length - 1)
        ] = 1
        mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return mask

    def trainable_params(self):
        """Parameters to optimise: the transformer encoder plus the head."""
        return list(self.wav2vec2.encoder.parameters()) + list(self.rnn_head.parameters())

    def __forward_wrapper(self, x, length=None):
        """Run the backbone and return ``(relu(encoder_output), encoder_output)``.

        ``length`` holds the raw sample counts per waveform; when given, an
        attention mask over the valid frames is passed to the encoder.
        """
        # [Wilton] adapted from:
        # https://github.com/huggingface/transformers/blob/v4.30.0/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1510
        with torch.no_grad():  # [Wilton] partial fine-tuning
            x = self.wav2vec2.feature_extractor(x)
            x = x.transpose(1, 2)  # New version of huggingface
            x, _ = self.wav2vec2.feature_projection(x)  # New version of huggingface
            mask = None
            if length is not None:
                length = self.get_feat_extract_output_lengths(length)
                mask = self.prepare_mask(length, x.shape[:2], x.dtype, x.device)
            if self.pretrain or self.training:
                batch_size, sequence_length, hidden_size = x.size()

                # [Wilton] from the paper: wav2vec 2.0 has no utterance-level
                # pre-training task, so aggregation across time steps is needed
                # for utterance-level fine-tuning; a modified SpecAugment is
                # applied during training for better generalisation.
                #
                # apply SpecAugment along the time axis
                if self.mask_time_prob > 0:
                    mask_time_indices = _compute_mask_indices(
                        (batch_size, sequence_length),
                        self.mask_time_prob,
                        self.mask_time_length,
                        min_masks=2,
                        # device=x.device  # TODO: why is this not passed?
                    )

                    mask_time_indices = torch.from_numpy(mask_time_indices).to(x.device)  # [Wilton] fix for new torch and numpy versions.
                    # Without lengths there is no padding, so every sampled position may be masked.
                    masked_indicies = mask_time_indices if mask is None else mask_time_indices & mask
                    flip_mask = torch.rand((batch_size, sequence_length), device=x.device) > self.observe_time_prob
                    x[masked_indicies & flip_mask] = self.wav2vec2.masked_spec_embed.to(x.dtype)

                # apply SpecAugment along the feature axis
                if self.mask_feature_prob > 0:
                    mask_feature_indices = _compute_mask_indices(
                        (batch_size, hidden_size),
                        self.mask_feature_prob,
                        self.mask_feature_length,
                        # device=x.device,  # TODO: why is this not passed?
                        min_masks=1
                    )
                    mask_feature_indices = torch.from_numpy(mask_feature_indices).to(x.device)
                    x[mask_feature_indices[:, None].expand(-1, sequence_length, -1)] = 0
        x = self.wav2vec2.encoder(x, attention_mask=mask)[0]
        reps = F.relu(x)
        return reps, x

    # From huggingface
    def get_feat_extract_output_lengths(self, input_length):
        """
        Computes the output length of the convolutional layers
        """
        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1
        for kernel_size, stride in zip(self.wav2vec2.config.conv_kernel, self.wav2vec2.config.conv_stride):
            input_length = _conv_out_length(input_length, kernel_size, stride)
        return input_length

    def forward(self, x, length=None):
        """Return the head's logits ``[batch, n_classes]`` for raw waveforms ``x``.

        Args:
            x: Batch of waveforms accepted by the backbone's feature extractor.
            length: Optional integer tensor with the raw sample count of each
                waveform, used to ignore padding.
        """
        reps, _ = self.__forward_wrapper(x, length)
        # The head pools over frames, so it needs frame counts, not sample counts.
        feat_length = None if length is None else self.get_feat_extract_output_lengths(length)
        logits, _ = self.rnn_head(reps, feat_length)
        return logits



In [None]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorCTCWithPadding:
  """Work-in-progress batch collator wrapping a ``Wav2Vec2Processor``.

  NOTE(review): this cell has no execution count and looks half-adapted; see
  the notes inside ``__call__`` before relying on it.
  """
  # Processor passed at construction time (not used by __call__ as written).
  processor: Wav2Vec2Processor

  def __call__(
      self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
  ) -> Dict[str, torch.Tensor]:
    """Pad a list of feature dicts into a batch and truncate the labels.

    NOTE(review), all visible in this cell:
      * padding goes through the module-level ``processor`` rather than
        ``self.processor``;
      * ``model`` is read from the notebook namespace, not from the collator;
      * the inputs are read from ``feature["input_ids"]`` while the target
        lengths are taken from ``feature["input_values"]``;
      * the batch is expected to carry a ``decoder_attention_mask`` entry.
    """
      # TODO: my values use different names....
    input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
    label_features = [{"input_values": feature["labels"]} for feature in features]

    batch = processor.pad(
        input_ids=input_ids, labels=label_features, return_tensors="pt"
    )

    # Overwrite label positions whose decoder attention mask is not 1 with -100.
    batch["labels"] = batch["labels"].masked_fill(
        batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
    )

    # batch = self.processor.pad(
    #     input_features,
    #     padding=self.padding,
    #     max_length=self.max_length,
    #     pad_to_multiple_of=self.pad_to_multiple_of,
    #     return_tensors="pt",
    # )

    del batch["decoder_attention_mask"]

    # if model.config.reduction_factor > 1:
    if True:
      # Round every target length down to a multiple of the reduction factor
      # and cut the labels to the longest rounded length.
      target_lengths = torch.tensor(
          [len(feature["input_values"]) for feature in features]
      )
      target_lengths = target_lengths.new(
          [
              length - length % model.config.reduction_factor
              for length in target_lengths
          ]
      )
      max_length = max(target_lengths)
      batch["labels"] = batch["labels"][:, :max_length]

    return batch


In [None]:
 def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of each feature and attach float labels.

        NOTE(review): this cell holds a method without an enclosing class. It
        reads ``self.padding``, ``self.max_length`` and
        ``self.pad_to_multiple_of``, which are not fields of the
        ``DataCollatorCTCWithPadding`` dataclass shown in this notebook —
        presumably a draft replacement for that class's ``__call__``; confirm.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        # d_type = torch.long if isinstance(label_features[0], int) else torch.float
        d_type = torch.float

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Labels are stacked into one tensor without padding.
        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

In [26]:
import seaborn as sns
import numpy as np  # was aliased to `sns`, which silently replaced seaborn
import torch
import torch.nn as nn
import torch.optim as optim
import sklearn
from sklearn import datasets
import pandas as pd

# Synthetic regression problem from scikit-learn; the fixed seed makes the
# generated data reproducible across runs.
data = datasets.make_regression(random_state=42)
df = pd.DataFrame(data[0], columns=[f"feature_{i+1}" for i in range(data[0].shape[1])])
df["target"] = data[1]

In [27]:
# Display the synthetic regression frame (features plus the `target` column).
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,target
0,-0.286112,-0.227547,0.060912,-0.527556,0.336521,1.957472,-0.490795,-0.791163,0.053066,-0.741370,...,-0.511413,0.103469,-0.230740,1.632683,0.156575,-1.281473,0.940699,-1.622489,-0.619227,-263.288785
1,0.709329,0.629078,-1.086846,-0.272732,-0.800999,1.872184,-0.098388,-1.361171,1.004658,-1.028911,...,-1.216538,0.416280,1.158405,-0.076967,-1.018067,-0.132490,1.154801,0.249386,-0.658892,-250.970827
2,1.758989,1.333239,-2.227748,0.384178,-1.187588,-1.063517,0.119444,-0.638148,-1.088296,1.583814,...,0.712170,-0.107792,-0.064773,0.419378,-0.495824,1.602610,0.239052,-1.404822,-0.537853,-126.764027
3,0.273393,-0.903034,-0.638108,0.316754,0.758953,0.255189,-2.377217,0.743976,0.696577,0.869690,...,-1.199667,-0.286335,0.378532,0.636043,-0.992250,-0.340602,-1.130861,-2.086161,0.557614,-69.408991
4,-0.786637,0.832770,1.497535,-0.434381,1.271752,-0.375472,0.793456,-0.728879,0.918004,-0.925729,...,0.226319,0.049469,1.619766,-2.298836,0.897172,-1.159325,0.637243,0.049598,-0.733325,327.762016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.190425,-1.532839,0.902549,-1.010177,0.471228,-1.918017,-0.961874,0.374719,-0.426908,0.909956,...,0.100390,-1.239952,0.356771,-0.148658,0.160532,-0.606366,-0.530584,-0.490669,-1.503241,-73.597495
96,-0.429485,-0.408459,-1.039581,-0.349705,0.614120,0.646206,-0.558783,-1.105074,0.137042,-1.218977,...,0.462980,0.757735,1.330670,-0.915429,1.117952,-0.812674,-1.527821,-0.454909,-0.814083,56.798714
97,-1.062492,-0.801876,-0.577942,-0.141085,-0.858107,-1.143332,0.107749,1.156217,0.555643,-0.007473,...,-0.843672,-0.381295,1.353650,-0.432319,0.109488,-1.595968,-0.074187,-0.635157,-0.719713,55.235899
98,-0.674853,-0.835861,-0.590816,-0.868309,-1.592500,-1.768829,0.238152,-1.230646,0.570827,-0.352008,...,-1.432527,-0.268518,1.285197,0.094691,0.366519,-1.339173,1.528694,-0.274286,-1.264998,-151.777998


In [28]:
# Predictors are every column but the last; the last column is the target.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(type(X_train))

# Convert each pandas split to a float32 tensor in one pass.
X_train, X_test, y_train, y_test = (
    torch.tensor(split.values, dtype=torch.float32)
    for split in (X_train, X_test, y_train, y_test)
)

<class 'pandas.core.frame.DataFrame'>


In [31]:
# Training features after conversion to a float32 tensor.
X_train

tensor([[-0.7265, -0.4876, -0.2271,  ...,  0.3313,  0.6820, -1.0102],
        [ 0.4640, -1.5362,  0.8113,  ...,  1.0891, -2.3823,  1.3926],
        [-0.4471,  0.1878,  0.6282,  ...,  1.4426,  1.0689,  0.2509],
        ...,
        [-3.1551, -0.0806,  1.1353,  ...,  1.9348,  0.2011, -0.6807],
        [ 0.6756,  1.3626,  1.2571,  ..., -0.5133,  2.4053,  2.0576],
        [ 0.2604,  1.9905, -2.1568,  ...,  0.7759, -0.8794,  0.2144]])

In [37]:
 # since data is ready we can develop the model:

class linearRegression(nn.Module):
    """Small fully connected regressor: input_dim -> 10 -> 5 -> 3 -> 1 with ReLU between layers."""

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in order fc1..fc4 so seeded initialisation is stable.
        self.fc1 = nn.Linear(input_dim, 10)
        self.fc2 = nn.Linear(10, 5)
        self.fc3 = nn.Linear(5, 3)
        self.fc4 = nn.Linear(3, 1)

    def forward(self, d):
        """Return one prediction per input row, shape ``[..., 1]``."""
        hidden = d
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        # The output layer has no activation.
        return self.fc4(hidden)

# Build the network with a fixed seed so the initial weights are reproducible.
input_dim = X_train.shape[1]
torch.manual_seed(42)
model = linearRegression(input_dim)

# Objective and optimiser.
loss = nn.MSELoss()
optimizers = optim.Adam(params=model.parameters(), lr=0.01)

# Full-batch training.
num_of_epochs = 1
for epoch in range(num_of_epochs):
    # Forward pass over the whole training set.
    y_train_prediction = model(X_train)
    loss_value = loss(y_train_prediction.squeeze(), y_train)

    # Debug dump of inputs, predictions and targets.
    print(X_train)
    print(y_train_prediction.squeeze())
    print(y_train)

    # Clear stale gradients, back-propagate, then update the weights.
    optimizers.zero_grad()
    loss_value.backward()
    optimizers.step()

    # Report the training loss every tenth epoch.
    if not epoch % 10:
        print(f'[epoch:{epoch}]: The loss value for training part={loss_value}')

tensor([[-0.7265, -0.4876, -0.2271,  ...,  0.3313,  0.6820, -1.0102],
        [ 0.4640, -1.5362,  0.8113,  ...,  1.0891, -2.3823,  1.3926],
        [-0.4471,  0.1878,  0.6282,  ...,  1.4426,  1.0689,  0.2509],
        ...,
        [-3.1551, -0.0806,  1.1353,  ...,  1.9348,  0.2011, -0.6807],
        [ 0.6756,  1.3626,  1.2571,  ..., -0.5133,  2.4053,  2.0576],
        [ 0.2604,  1.9905, -2.1568,  ...,  0.7759, -0.8794,  0.2144]])
tensor([0.5258, 0.5258, 0.5258, 0.5104, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5259, 0.5258, 0.5258, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5216, 0.5258,
        0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258, 0.5258,
      

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2Processor
from torch.utils.data import DataLoader, Dataset

chunk_size = 15

# `device` was never defined in the notebook; use Apple's MPS backend when it
# is available (the dataset above is built with device="mps") and fall back to CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Fine-tuned checkpoint: take the inner wav2vec2 module and the matching
# processor, then wrap the backbone with the regression head.
chk_name = "/Users/beltre.wilton/apps/mspconv_ftlab/checkpoint/checkpoint-204"
wav2vec2 = Wav2Vec2ForPreTraining.from_pretrained(chk_name, output_hidden_states=True).wav2vec2.to(device=device)
processor = Wav2Vec2Processor.from_pretrained(chk_name)
model = Wav2vec2Wrapper(wav2vec2=wav2vec2, chunk_size=chunk_size).to(device=device)
data_collator = DataCollatorCTCWithPadding(processor=processor)


