#Requirements

In [1]:
with open('requirements.txt', 'w') as f:
    f.write('torch\ntorchvision\ntorchaudio\npraat-parselmouth\ntransformers\nomegaconf\npytorch_lightning\ntqdm\ntensorboard\nlibrosa >= 0.8.0')

In [2]:
!pip install -r requirements.txt

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (10.7 MB)
[K     |████████████████████████████████| 10.7 MB 8.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 72.3 MB/s 
[?25hCollecting omegaconf
  Downloading omegaconf-2.1.2-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 3.4 MB/s 
[?25hCollecting pytorch_lightning
  Downloading pytorch_lightning-1.6.3-py3-none-any.whl (584 kB)
[K     |████████████████████████████████| 584 kB 88.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 77.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 k

#Utils

In [3]:
!git clone https://github.com/arifahmad-py/ASR-hacker.git

Cloning into 'ASR-hacker'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 32 (delta 10), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


In [4]:
!mv /content/ASR-hacker/utils /content

#yingram calculation model

In [5]:
# adapted from https://github.com/patriceguyot/Yin
# https://github.com/NVIDIA/mellotron/blob/master/yin.py

import numpy as np
import torch

In [6]:
def differenceFunction(x, N, tau_max):
    """
    Compute difference function of data x. This corresponds to equation (6) in [1]
    This solution is implemented directly with Numpy fft.
    :param x: audio data
    :param N: length of data
    :param tau_max: integration window size
    :return: difference function
    :rtype: list
    """

    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum()))
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    conv = np.fft.irfft(fc * fc.conjugate())[:tau_max]
    return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv


In [7]:
def cumulativeMeanNormalizedDifferenceFunction(df, N, eps=1e-8):
    """
    Compute cumulative mean normalized difference function (CMND).
    This corresponds to equation (8) in [1]
    :param df: Difference function
    :param N: length of data
    :return: cumulative mean normalized difference function
    :rtype: list
    """
    np.seterr(divide='ignore', invalid='ignore')
    # scipy method, assert df>0 for all element
    cmndf = df[1:] * np.asarray(list(range(1, N))) / (np.cumsum(df[1:]).astype(float) + eps)
    return np.insert(cmndf, 0, 1)


In [8]:
def differenceFunctionBatch(xs: np.ndarray, N, tau_max):
    """numpy backend batch-wise differenceFunction
    Args:
        xs: audio segments, np.ndarray of shape (B x t)
        N:
        tau_max:
    Returns:
        y: dF. np.ndarray of shape (B x tau_max)
    """
    xs = xs.astype(np.float64)
    w = xs.shape[-1]
    tau_max = min(tau_max, w)
    zeros = np.zeros((xs.shape[0], 1))
    x_cumsum = np.concatenate((np.zeros((xs.shape[0], 1)), (xs * xs).cumsum(axis=-1)), axis=-1)  # B x w
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size)

    convs = []
    for i in range(xs.shape[0]):
        x = xs[i]
        fc = np.fft.rfft(x, size_pad)
        conv = np.fft.irfft(fc * fc.conjugate())[:tau_max]
        convs.append(conv)
    convs = np.asarray(convs)

    y = x_cumsum[:, w:w - tau_max:-1] + x_cumsum[:, w, np.newaxis] - x_cumsum[:, :tau_max] - 2 * convs
    return y


In [9]:
def cumulativeMeanNormalizedDifferenceFunctionBatch(dFs, N, eps=1e-8):
    """numpy backend batch-wise cumulative Mean Normalized Difference Functions
    Args:
        dFs: differenceFunctions. np.ndarray of shape (B x tau_max)
        N:
        eps:
    Returns:
        cMNDFs: np.ndarray of shape (B x tau_max)
    """
    arange = np.asarray(list(range(1, N)))[np.newaxis, ...]
    cumsum = np.cumsum(dFs[:, 1:], axis=-1).astype(float)
    cMNDFs = dFs[:, 1:] * arange / (cumsum + eps)
    cMNDFs = np.concatenate((np.zeros((cMNDFs.shape[0], 1)), cMNDFs), axis=1)
    return cMNDFs


In [10]:
def differenceFunctionTorch(xs: torch.Tensor, N, tau_max) -> torch.Tensor:
    """pytorch backend batch-wise differenceFunction
    has 1e-4 level error with input shape of (32, 22050*1.5)
    Args:
        xs:
        N:
        tau_max:
    Returns:
    """
    xs = xs.double()
    w = xs.shape[-1]
    tau_max = min(tau_max, w)
    zeros = torch.zeros((xs.shape[0], 1))
    x_cumsum = torch.cat(
        (torch.zeros((xs.shape[0], 1), device=xs.device), (xs * xs).cumsum(dim=-1, dtype=torch.double)),
        dim=-1)  # B x w
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size)

    fcs = torch.fft.rfft(xs, n=size_pad, dim=-1)
    convs = torch.fft.irfft(fcs * fcs.conj())[:, :tau_max]
    y1 = torch.flip(x_cumsum[:, w - tau_max + 1:w + 1], dims=[-1])
    y = y1 + x_cumsum[:, w, np.newaxis] - x_cumsum[:, :tau_max] - 2 * convs
    return y


In [11]:
def cumulativeMeanNormalizedDifferenceFunctionTorch(dfs: torch.Tensor, N, eps=1e-8) -> torch.Tensor:
    arange = torch.arange(1, N, device=dfs.device, dtype=torch.float64)
    cumsum = torch.cumsum(dfs[:, 1:], dim=-1, dtype=torch.float64).to(dfs.device)

    cmndfs = dfs[:, 1:] * arange / (cumsum + eps)
    cmndfs = torch.cat(
        (torch.ones(cmndfs.shape[0], 1, device=dfs.device), cmndfs),
        dim=-1)
    return cmndfs

In [12]:
wav = torch.randn(32, int(22050 * 1.5)).cuda()
wav_numpy = wav.detach().cpu().numpy()
x = wav_numpy[0]

w_len = 2048
w_step = 256
tau_max = 2048
W = 2048

startFrames = list(range(0, x.shape[-1] - w_len, w_step))
startFrames = np.asarray(startFrames)
# times = startFrames / sr
frames = [x[..., t:t + W] for t in startFrames]
frames = np.asarray(frames)
frames_torch = torch.from_numpy(frames).cuda()

cmndfs0 = []
for idx, frame in enumerate(frames):
    df = differenceFunction(frame, frame.shape[-1], tau_max)
    cmndf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
    cmndfs0.append(cmndf)
cmndfs0 = np.asarray(cmndfs0)

dfs = differenceFunctionTorch(frames_torch, frames_torch.shape[-1], tau_max)
cmndfs1 = cumulativeMeanNormalizedDifferenceFunctionTorch(dfs, tau_max).detach().cpu().numpy()
print(cmndfs0.shape, cmndfs1.shape)
print(np.sum(np.abs(cmndfs0 - cmndfs1)))

(122, 2048) (122, 2048)
1.269745013829122e-10


#ecpa models

In [13]:
import torch
from torch import nn

In [14]:
class Conv1D_ReLU_BN(nn.Module):
    def __init__(self, c_in, c_out, ks, stride, padding, dilation):
        super(Conv1D_ReLU_BN, self).__init__()

        self.network = nn.Sequential(
            nn.Conv1d(c_in, c_out, ks, stride, padding, dilation),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(c_out),
        )

    def forward(self, x):
        y = self.network(x)
        return y

In [15]:
class Res2_Conv1D(nn.Module):
    def __init__(self, c, scale, ks, stride, padding, dilation):
        super(Res2_Conv1D, self).__init__()
        assert c % scale == 0
        self.c = c
        self.scale = scale
        self.width = c // scale

        self.convs = []
        self.bns = []

        for i in range(scale - 1):
            self.convs.append(nn.Conv1d(self.width, self.width, ks, stride, padding, dilation))
            self.bns.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(self.convs)
        self.bns = nn.ModuleList(self.bns)

    def forward(self, x):
        """
        param x: (B x c x d)
        """

        xs = torch.split(x, self.width, dim=1)  # channel-wise split
        ys = []

        for i in range(self.scale):
            if i == 0:
                x_ = xs[i]
                y_ = x_
            elif i == 1:
                x_ = xs[i]
                y_ = self.bns[i - 1](self.convs[i - 1](x_))
            else:
                x_ = xs[i] + ys[i - 1]
                y_ = self.bns[i - 1](self.convs[i - 1](x_))
            ys.append(y_)

        y = torch.cat(ys, dim=1)  # channel-wise concat
        return y

In [16]:
class Res2_Conv1D_ReLU_BN(nn.Module):
    def __init__(self, channel, scale, ks, stride, padding, dilation):
        super(Res2_Conv1D_ReLU_BN, self).__init__()

        self.network = nn.Sequential(
            Res2_Conv1D(channel, scale, ks, stride, padding, dilation),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(channel),
        )

    def forward(self, x):
        y = self.network(x)
        return y


In [17]:
class SE_Block(nn.Module):
    def __init__(self, c_in, c_mid):
        super(SE_Block, self).__init__()

        self.network = nn.Sequential(
            nn.Linear(c_in, c_mid),
            nn.ReLU(inplace=True),
            nn.Linear(c_mid, c_in),
            nn.Sigmoid(),
        )

    def forward(self, x):
        s = self.network(x.mean(dim=-1))
        y = x * s.unsqueeze(-1)
        return y

In [18]:
class SE_Res2_Block(nn.Module):
    def __init__(self, channel, scale, ks, stride, padding, dilation):
        super(SE_Res2_Block, self).__init__()
        self.network = nn.Sequential(
            Conv1D_ReLU_BN(channel, channel, 1, 1, 0, 1),
            Res2_Conv1D_ReLU_BN(channel, scale, ks, stride, padding, dilation),
            Conv1D_ReLU_BN(channel, channel, 1, 1, 0, 1),
            SE_Block(channel, channel)
        )

    def forward(self, x):
        y = self.network(x) + x
        return y

In [19]:
class AttentiveStatisticPool(nn.Module):
    def __init__(self, c_in, c_mid):
        super(AttentiveStatisticPool, self).__init__()

        self.network = nn.Sequential(
            nn.Conv1d(c_in, c_mid, kernel_size=1),
            nn.Tanh(),  # seems like most implementations uses tanh?
            nn.Conv1d(c_mid, c_in, kernel_size=1),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        # x.shape: B x C x t
        alpha = self.network(x)
        mu_hat = torch.sum(alpha * x, dim=-1)
        var = torch.sum(alpha * x ** 2, dim=-1) - mu_hat ** 2
        std_hat = torch.sqrt(var.clamp(min=1e-9))
        y = torch.cat([mu_hat, std_hat], dim=-1)
        # y.shape: B x (c_in*2)
        return y

In [20]:
class ECAPA_TDNN(nn.Module):
    def __init__(self, c_in=80, c_mid=512, c_out=192):
        super(ECAPA_TDNN, self).__init__()

        self.layer1 = Conv1D_ReLU_BN(c_in, c_mid, 5, 1, 2, 1)
        self.layer2 = SE_Res2_Block(c_mid, 8, 3, 1, 2, 2)
        self.layer3 = SE_Res2_Block(c_mid, 8, 3, 1, 3, 3)
        self.layer4 = SE_Res2_Block(c_mid, 8, 3, 1, 4, 4)

        self.network = nn.Sequential(
            # Figure 2 in https://arxiv.org/pdf/2005.07143.pdf seems like groupconv?
            nn.Conv1d(c_mid * 3, 1536, kernel_size=1, groups=3),
            AttentiveStatisticPool(1536, 128),
        )

        self.bn1 = nn.BatchNorm1d(3072)
        self.linear = nn.Linear(3072, c_out)
        self.bn2 = nn.BatchNorm1d(c_out)

    def forward(self, x):
        # x.shape: B x C x t
        y1 = self.layer1(x)
        y2 = self.layer2(y1) + y1
        y3 = self.layer3(y1 + y2) + y1 + y2
        y4 = self.layer4(y1 + y2 + y3) + y1 + y2 + y3

        y = torch.cat([y2, y3, y4], dim=1)  # channel-wise concat
        y = self.network(y)

        y = self.linear(self.bn1(y.unsqueeze(-1)).squeeze(-1))
        y = self.bn2(y.unsqueeze(-1)).squeeze(-1)

        return y


In [21]:
# Input size: batch_size * seq_len * feat_dim
x = torch.zeros(2, 80, 200)
model = ECAPA_TDNN(80, 512, 192)
out = model(x)
print(model)
print(out.shape)  # should be [2, 192]

ECAPA_TDNN(
  (layer1): Conv1D_ReLU_BN(
    (network): Sequential(
      (0): Conv1d(80, 512, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU(inplace=True)
      (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (layer2): SE_Res2_Block(
    (network): Sequential(
      (0): Conv1D_ReLU_BN(
        (network): Sequential(
          (0): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
          (1): ReLU(inplace=True)
          (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
      (1): Res2_Conv1D_ReLU_BN(
        (network): Sequential(
          (0): Res2_Conv1D(
            (convs): ModuleList(
              (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
              (1): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
              (2): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))


#Analysis models

In [22]:
import math

import numpy as np
import torch
import transformers

#from models.ecapa import ECAPA_TDNN
#from models.yin import *

In [23]:
class Linguistic(torch.nn.Module):
    def __init__(self, conf=None):
        print("INIT-L runs")
        """we use the intermediate features of XLSR-53 for linguistic features. More specifically, we used
        the output from the 12th layer of the 24-layer transformer encoder.
        Args:
            conf:
        """
        super(Linguistic, self).__init__()
        self.conf = conf

        self.wav2vec2 = transformers.Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-xlsr-53")
        for param in self.wav2vec2.parameters():
            param.requires_grad = False
            param.grad = None
        self.wav2vec2.eval()

    def forward(self, x):
        print("FORWARD_L runs")
        """
        Args:
            x: torch.Tensor of shape (B x t)
        Returns:
            y: torch.Tensor of shape(B x C x t)
        """
        with torch.no_grad():
            outputs = self.wav2vec2(x, output_hidden_states=True)
        y = outputs.hidden_states[12]  
        y = y.permute((0, 2, 1))  
        return y

    def train(self, mode: bool = True):
        print("TRAIN-L runs")
        if not isinstance(mode, bool):
            raise ValueError("training mode is expected to be boolean")
        self.training = mode
        # for module in self.children():
        #     module.train(mode)
        return self

In [24]:
class Speaker(torch.nn.Module):
    def __init__(self, conf=None):
        """A speaker embedding network that uses the 1st layer of XLSR-53 as an input.
        Args:
            conf:
        """
        super(Speaker, self).__init__()
        self.conf = conf

        self.wav2vec2 = transformers.Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-xlsr-53")
        for param in self.wav2vec2.parameters():
            param.requires_grad = False
            param.grad = None
        self.wav2vec2.eval()

        # c_in = 1024 for wav2vec2
        # original paper[14] used 512 and 192 for c_mid and c_out, respectively
        print("init runs")
        self.spk = ECAPA_TDNN(c_in=1024, c_mid=512, c_out=192)

    def forward(self, x):
        """
        Args:
            x: torch.Tensor of shape (B x t)
        Returns:
            y: torch.Tensor of shape (B x 192)
        """
        print("Forward runs")
        with torch.no_grad():
            outputs = self.wav2vec2(x, output_hidden_states=True)
        y = outputs.hidden_states[1]  
        y = y.permute((0, 2, 1))  
        y = self.spk(y)  
        y = torch.nn.functional.normalize(y, p=2, dim=-1)
        return y

    def train(self, mode: bool = True):
        print("Train runs")
        if not isinstance(mode, bool):
            raise ValueError("training mode is expected to be boolean")
        self.training = mode
        # for module in self.children():
        #     module.train(mode)
        self.spk.train(mode)
        return self

In [25]:
class Energy(torch.nn.Module):
    def __init__(self, conf=None):
        super(Energy, self).__init__()
        self.conf = conf

    def forward(self, mel):
        """For the energy feature, we simply took an average from a log-mel
         spectrogram along the frequency axis.
        Args:
            mel: torch.Tensor of shape (B x t x C)
        Returns:
            y: torch.Tensor of shape (B x 1 x C)
        """
        y = torch.mean(mel, dim=1, keepdim=True)  # B x 1(channel) x t
        return y

In [26]:
class Pitch(torch.nn.Module):
    def __init__(self, conf=None):
        print("STARTED")
        super(Pitch, self).__init__()
        self.conf = conf

    @staticmethod
    def midi_to_lag(m: int, sr: int, semitone_range: float = 12):
        print("STARTED TWO")
        """converts midi-to-lag, eq. (4)
        Args:
            m: midi
            sr: sample_rate
            semitone_range:
        Returns:
            lag: time lag(tau, c(m)) calculated from midi, eq. (4)
        """
        f = 440 * math.pow(2, (m - 69) / semitone_range)
        lag = sr / f
        print("midi-to-lag done")
        return lag

    @staticmethod
    def yingram_from_cmndf(cmndfs: torch.Tensor, ms: list, sr: int = 22050) -> torch.Tensor:
        """ yingram calculator from cMNDFs(cumulative Mean Normalized Difference Functions)
        Args:
            cmndfs: torch.Tensor
                calculated cumulative mean normalized difference function
            ms: list of midi(int)
            sr: sampling rate
        Returns:
            y:
                calculated batch yingram
        """
        c_ms = np.asarray([Pitch.midi_to_lag(m, sr) for m in ms])
        c_ms = torch.from_numpy(c_ms).to(cmndfs.device)
        c_ms_ceil = torch.ceil(c_ms).long().to(cmndfs.device)
        c_ms_floor = torch.floor(c_ms).long().to(cmndfs.device)

        y = (cmndfs[:, c_ms_ceil] - cmndfs[:, c_ms_floor]) / (c_ms_ceil - c_ms_floor).unsqueeze(0) * (
                c_ms - c_ms_floor).unsqueeze(0) + cmndfs[:, c_ms_floor]
        print("yingram_from_cmndf done")
        return y

    @staticmethod
    def yingram(x: torch.Tensor, W: int = 2048, tau_max: int = 2048, sr: int = 22050, w_step: int = 256):
        """calculates yingram from raw audio (multi segment)
        Args:
            x: raw audio, torch.Tensor of shape (t)
            W: yingram Window Size
            tau_max:
            sr: sampling rate
            w_step: yingram bin step size
        Returns:
            yingram: yingram. torch.Tensor of shape (80 x t')
        """
        # x.shape: t
        w_len = W

        startFrames = list(range(0, x.shape[-1] - w_len, w_step))
        startFrames = np.asarray(startFrames)
        # times = startFrames / sr
        frames = [x[..., t:t + W] for t in startFrames]
        frames_torch = torch.stack(frames, dim=0).to(x.device)

        # If not using gpu, or torch not compatible, implemented numpy batch function is still fine
        dfs = differenceFunctionTorch(frames_torch, frames_torch.shape[-1], tau_max)
        cmndfs = cumulativeMeanNormalizedDifferenceFunctionTorch(dfs, tau_max)

        midis = list(range(5, 85))
        yingram = Pitch.yingram_from_cmndf(cmndfs, midis, sr)

        print("yingram done")
        return yingram

    @staticmethod
    def yingram_batch(x: torch.Tensor, W: int = 2048, tau_max: int = 2048, sr: int = 22050, w_step: int = 256):
        """calculates yingram from batch raw audio.
        currently calculates batch-wise through for loop, but seems it can be implemented to act batch-wise
        Args:
            x: torch.Tensor of shape (B x t)
            W:
            tau_max:
            sr:
            w_step:
        Returns:
            yingram: yingram. torch.Tensor of shape (B x 80 x t')
        """
        batch_results = []
        for i in range(len(x)):
            yingram = Pitch.yingram(x[i], W, tau_max, sr, w_step)
            batch_results.append(yingram)
        result = torch.stack(batch_results, dim=0).float()
        result = result.permute((0, 2, 1)).to(x.device)
        print("yingram_batch done")
        return result

In [27]:
class Analysis(torch.nn.Module):
    def __init__(self, conf=None):
        """joins all analysis modules into one
        Args:
            conf:
        """
        super(Analysis, self).__init__()
        self.conf = conf

        self.linguistic = Linguistic()
        self.speaker = Speaker()
        self.energy = Energy()
        self.pitch = Pitch()

In [28]:
import torch

wav = torch.randn(2, 20000)
mel = torch.randn(2, 80, 128)

linguistic = Linguistic()
speaker = Speaker()
energy = Energy()
pitch = Pitch()

with torch.no_grad():
    lps = linguistic(wav)
    print(lps.shape)

    s = speaker(wav)
    print(s.shape)

    e = energy(mel)
    print(e.shape)

INIT-L runs


Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

init runs
STARTED
FORWARD_L runs
torch.Size([2, 1024, 62])
Forward runs
torch.Size([2, 192])
torch.Size([2, 1, 128])
