In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from IPython.core.interactiveshell import InteractiveShell
import warnings 
from tqdm.notebook import tqdm
InteractiveShell.ast_node_interactive = "all"
warnings.filterwarnings("ignore")

### Theory

Consider a system where:

\begin{equation}
h^t= f (h^{t-1},\theta)  
\end{equation}
\begin{equation}
\theta \text{: parameters shared across all time steps}
\end{equation}

That is, its state at time step t, is dependent only on a set a parameters and the previous state at t-1
<br>
<br>
Let the state of the system, h, also be depedent on an input at the respective time step, x:

\begin{equation}
h^t= f (h^{t-1},x^{t},\theta)  
\end{equation}

The state h now contains information about the entire past history of inputs, x.

Consider now a system that given the hidden state, h,produces an output o, for each time step. This output is passed to an activation function made to predict the target, y, at the respective time step.

\begin{equation}
o^t= g (h^{t},\theta')  
\end{equation}
\begin{equation}
\theta' \text{: a different set of parameters as $\theta$}
\end{equation}


We define now define $\theta$ and $\theta'$ as the weight matrices describing the relation between the input-to-hidden, hidden-to-hidden and hidden-to-output notes; $U$, $W$ and $V$:

\begin{equation}
z^t=  W^{T}h^{t-1} + U^{T}x^t +b 
\end{equation}

\begin{equation}
h^{t} = \phi(z^t)
\end{equation}

\begin{equation}
o^t = V^Th^{t} + c
\end{equation}

Where $b$ and $c$ are biases, $\phi$ is an activation function. <br><br>
**Note**: matrices $U$, $W$ and $V$ are not indexed by time. 

Then for each time step, we have a sequential total loss up to time step $\tau$, $L^\tau$, defined as the difference between our prediction and the target, at each output, upto the time step $\tau$
<br>
<br>
Consider the task of multi-class classification. 
<br>
<br>
Consequently, the output activation function is the normalized expontential function, a.k.a the _softmax function_

\begin{equation}
L = \sum_{t=1}^{\tau} l\big(o^{t}\big)
\text{: Total loss upto time step $\tau$}  
\end{equation}

\begin{equation}
\hat{y}^t_i = \frac{\exp(o_i^t)}{\sum_{j}\exp(o_j^t)}
\text{: Softmax activation function for multi-class classification}
\end{equation}

**NOTE** the softmax is a vector function, later when taking the derivative, in reality I am finding the Jacobian of it in its vector form, but here I denote one element of it, the $i^{th}$

\begin{equation}
l = - \sum_{m=0}^{M-1}y_{m}^{t} \log\Big(\hat{y}_{m}^{t}\Big)
\text{: M categorical cross entropy for predictions at time step $t$}
\end{equation}


The optimization process differs from standard back-propagation (like descirbed for a vanilla feedforward network). Usng the above assumptions, I will go through the derivation analogous optimization process for recurrent networks;

### Back propagation through time

Per example loss w.r.t to the output element $o_i$ at time $t$; $o_i^{t}$

\begin{equation}
\nabla_{o_{i}^{t}} L = \frac{\partial{L}}{\partial{l(o_i^t)}} \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}}
\end{equation}
Note that:
\begin{equation}
 \frac{\partial{L}}{\partial{l(o_i^t)}} = 1
\end{equation}
and that:
\begin{equation}
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}}
\end{equation}
is the derivative of the categorical cross-entropy
\begin{equation}
\boxed{
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = - \sum_{j} \frac{y_j^{t}}{\hat{y}_j^{t}}\frac{\partial{\hat{y}^{t}_j}}{\partial{o_i^{t}}} } - [1]
\end{equation}
The softmax functions is:
 \begin{equation}
 \hat{y}^t_i = \frac{\exp(o_i^t)}{\sum_{j}\exp(o_j^t)}
\end{equation}
Taking its derivative gives:
\begin{equation}
\boxed{
    \frac{\partial{\hat{y}^{t}_i}}{\partial{o_j^{t}}} = \hat{y}^{t}_{i} \Big( \delta_{ij}  -  \hat{y}^{t}_{j} \Big)
}- [2]
\end{equation}
_look at the different cases to see why this is true_ i.e. $i=j$ and $i \neq j$
<br><br>
Lets sub [2] into [1], and splitting into the cases where $i=j$ and $i \neq j $:

 \begin{equation}
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = - \sum_{j} \frac{y_j^{t}}{\hat{y}_j^{t}} \hat{y}^{t}_{j} \Big(\delta_{ij}  -  \hat{y}^{t}_{i} \Big)
\end{equation}

 \begin{equation}
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = - \sum_{j} y_j^{t} \Big(\delta_{ij}  -  \hat{y}^{t}_{i} \Big)
\end{equation}
 
Lets now split the sum up for the two cases;

\begin{equation}
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} =  \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} \Bigr|_{j=i} + \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} \Bigr|_{j \neq i}  =  -y^{t}_{i}(\delta_{ii} - \hat{y}_{i})^{t} - \sum_{j \neq i} y_j^{t} \Big(\delta_{ij}  -  \hat{y}^{t}_{i} \Big)
\end{equation} 
Simplfying down: 

\begin{equation}
 
 \frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} =  -y^{t}_{i}(1 - \hat{y}_{i})^{t} - \sum_{j \neq i} y_j^{t} \Big( 0 -\hat{y}^{t}_{i} \Big)
\end{equation} 

\begin{equation}
\frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = \sum_{j \neq i} y_j^{t} \hat{y}^{t}_{i}  -y^{t}_{i}(1 - \hat{y}_{i})^{t} 
\end{equation} 


\begin{equation}
\frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = \sum_{j \neq i} y_j^{t} \hat{y}^{t}_{i}+y^{t}_{i}\hat{y}_{i}^{t}  -y^{t}_{i}
\end{equation} 
Recall that $\sum_{j} y_j = 1$
\begin{equation}
\frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = \sum_{j \neq i} \Big( y_j^{t} +y^{t}_{i} \Big) \hat{y}^{t}_{i}  -y^{t}_{i}
\end{equation} 


\begin{equation}
\frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} = \sum_{j} \Big( y_j^{t} \Big) \hat{y}^{t}_{i}  -y^{t}_{i}
\end{equation} 


\begin{equation}
\boxed{
\frac{\partial{l(o_i^t)}}{\partial o_{i}^{t}} =  \hat{y}^{t}_{i}  -y^{t}_{i}
}
\end{equation} 

Next, lets calculate the gradient on the internel nodes $h^t$ from the end of the sequence $\tau$.
<br>
I am going to use vector notation here on out. I.e. $h_i^{t}$ becomes $h^t$ 

\begin{equation}
\nabla_{h^\tau} L = \Bigg( \frac{ \partial{o^{\tau}}}
{\partial{h^{\tau}}} \Bigg)^{T} \nabla_{o^\tau} L
\end{equation}

\begin{equation}
\nabla_{h^\tau} L = V \nabla_{o^\tau} L
\end{equation}
we iterate backwards through time. Note the dependency of $h^t$ on both $o^t$ and $h^{t+1}$


\begin{equation}
\nabla_{h^t} L = \Bigg( \frac{ \partial{h^{t+1}}}
{\partial{h^{t}}} \Bigg)^{T} \nabla_{h^{t+1}} L +
\Bigg( \frac{ \partial{o^{t}}}
{\partial{h^{t}}} \Bigg)^{T} \nabla_{o^{t}} L 
\end{equation}




The derivate of the hidden units  w.r.t their previous time step is:

\begin{equation}
 \frac{ \partial{h^{t+1}} }
{\partial{h^{t}} }  =  \frac{ \partial{h^{t+1}} }{ \partial{z^{t+1} } }
\frac{ \partial{z^{t+1} } } { \partial{h^{t}} }
\end{equation}
This leads to:

\begin{equation}
 \frac{ \partial{h^{t+1}} }
{\partial{h^{t}} }  =  diag\Bigg( \phi'\big(z^{t+1}\big) \Bigg) W^T
\end{equation}
**Note** diag: considering only the leading diagonal values and setting all others to 0. 
<br><br>
For RNNs , we want to use a saturating activation to avoid gradient explosions <br><br>
e.g. hyperbolic tagent. 

\begin{equation}
\nabla_{h^t} L = W  diag \Big( \phi'\big(z^{t+1}\big) \Big)   \nabla_{h^{t+1}} L +
V \nabla_{o^{t}} L 
\end{equation}


Lets specify the activation function (using the hyperpolic tagent)

\begin{equation}
\nabla_{h^t} L = W  diag \Big( 
     1 - \big(h^{t+1}\big)^2
    \Big)  \nabla_{h^{t+1}} L +
V \nabla_{o^{t}} L 
\end{equation}

Now for the gradients on the biases $b$ and $c$

\begin{equation}
\nabla_{c} L  = \sum_{t} \Bigg(
     \frac{\partial{o^t}}{\partial{c^t}} 
     \Bigg)^{T} \nabla_{o^t} L
\end{equation}
since $\frac{\partial{o^t}}{\partial{c^t}} = 1$

\begin{equation}
\nabla_{c} L  = \sum_{t} \nabla_{o^t} L
\end{equation}
Next:
\begin{equation}
\nabla_{b} L  = \sum_{t}  \Bigg(
     \frac{\partial{h^t}}{\partial{b^t}} 
     \Bigg)^{T}  \nabla_{h^t} L
\end{equation}
Since $b$ is dependent on h through the activation function $\phi$, we have: 

Next:
\begin{equation}
\nabla_{b} L  = \sum_{t}  diag \Bigg( \phi' \Big( z^t \Big) \Bigg) \nabla_{h^t} L
\end{equation}

The derivative w.r.t to $V$; the hidden-ouput matrix 

\begin{equation}
\nabla_{V} L  = \sum_{t} \sum_{i}  \Bigg(
    \frac{ \partial L}{ \partial o_{i}^t}
     \Bigg)^T \nabla_{V} O_i^{t}
\end{equation}
Leading to:
\begin{equation}
\boxed{
\nabla_{V} L  = \sum_{t} h^t \Big(\nabla_{o^t} L \Big)^T
}
\end{equation}

For the derivative w.r.t the weight matrices $W$ and $U$, we introduce dummy variables $W^t$ and $U^t$. These are copies of each other at each time step, summing these up will give us the total gradient. 


\begin{equation}
\nabla_{W} L  = \sum_{t} \sum_{i}  \Bigg(
    \frac{ \partial L}{ \partial h_{i}^t}
     \Bigg)^T \nabla_{W^t} h_i^{t}
\end{equation}
giving: 
\begin{equation}
\boxed{
\nabla_{W} L  = \sum_{t} h^{t-1} \Big(\nabla_{h^t} L \Big)^T  diag \Bigg( \phi ' \big(z^t \big) \Bigg)

}
\end{equation}
for the derivative of w.r.t $U$:

\begin{equation}
\nabla_{U} L  = \sum_{t} \sum_{i}  \Bigg(
    \frac{ \partial L}{ \partial h_{i}^t}
     \Bigg)^T \nabla_{U^t} h_i^{t}
\end{equation}
giving: 
\begin{equation}
\boxed{
\nabla_{U} L  = \sum_{t} x^{t} \Big( \nabla_{h^t} L \Big)^T 
     diag \Bigg( \phi ' \big(z^t \big) \Bigg)

}
\end{equation}


# Modelling RNN with backpropagation

In [4]:
class RNN:
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        # network variables 
        self.idim = input_dim
        self.hdim = hidden_dim
        self.odim = output_dim
        # initialise weights 
        self.U = np.random.uniform(- np.sqrt(1./self.idim),
                                     np.sqrt(1./self.idim),
                                    (self.idim, self.hdim) )

        self.V = np.random.uniform( -np.sqrt(1./self.hdim),
                                     np.sqrt(1./self.hdim), 
                                    (self.hdim,self.odim))

        self.W = np.random.uniform( -np.sqrt(1./self.hdim),
                                     np.sqrt(1./self.hdim), 
                                    (self.hdim,self.hdim))

        self.b = np.zeros(self.hdim)
        self.c = np.zeros(self.odim)
    

    def softmax(self,x):
        '''Note that this is a numerically stable version of softmax.
        
        We substract the max value from all elements.
        Overflow of a single element, or underflow of all elements,  will render the output usless.
        
        subtracting max leaves only non-positive values ---> no overflow 
        at least one element = 0 ---> no vanishing denominator (underflow is some enteries is okay) 
         '''
        xt = np.exp(x-np.max(x))
        return xt / np.sum(xt)

    def forward(self,x):
        # Single example pass forward, all the way through the network
        T = len(x)
        # will stack as rows
        h = np.zeros((T,self.hdim))
        o = np.zeros((T,self.odim))
        for t in range(T):
            # print("x[t], inside forward", x[t])
            h[t] = self.U.T @ x[t] + self.b
            # print("h[t], inside forward", h[t])
            # add contribution from previous time step
            if t > 1:
                h[t] += self.W @ h[t-1] + self.b
            h[t] = np.tanh(h[t])
            # print("foward softmax output", self.softmax( self.V.T @ h[t] + self.c))
            o[t] = self.softmax( self.V.T @ h[t] + self.c)

        return (o,h)

    def backward(self, x, y, clip=None):
        T = len(x)
        o,h = self.forward(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        dLdb = np.zeros(self.b.shape)
        dLdc = np.zeros(self.c.shape)
        # dL/do
        delta_o = o
        delta_o[ np.arange(len(y)), y ] -= 1.
        # dL/dh
        delta_h = np.zeros((T, self.hdim))
       
        for t in reversed(range(T)):
            # collect errors on hidden states
            delta_h[t] = self.V @ delta_o[t,:]
            if t < T-1:
                # collect errors on hidden states due to W
                delta_h[t] = ( self.W @ np.diag(1-h[t+1]**2) ) @ delta_h[t+1]


        for t in range(T):
            # error on ouput bias
            dLdc += delta_o[t,:]
            # error on hidden bias 
            dLdb += (1-h[t]**2) * delta_h[t,:]
            # error on hidden-output matrix
            
            ot = delta_o[t,:][...,np.newaxis]
            ht = h[t,:][...,np.newaxis]            
            dht = delta_h[t,:][...,np.newaxis]
                

            dLdV += ht @ ot.T 
            # error on hidden-hidden W
            if t > 0 :
                h_t = h[t-1,:][...,np.newaxis]
                dLdW += ( h_t @ dht.T )@np.diag(1-h[t]**2)

            xt = x[t][...,np.newaxis]
            dLdU += xt @ dht.T @ np.diag(1-h[t]**2)

        if clip is not None:
            dLdb = np.clip(dLdb, -clip, clip)
            dLdc = np.clip(dLdc, -clip, clip)
            dLdV = np.clip(dLdV, -clip, clip)
            dLdW = np.clip(dLdW, -clip, clip)
            dLdU = np.clip(dLdU, -clip, clip)

        return (dLdU, dLdV, dLdW, dLdb, dLdc)



    def step(self,x,y,lr=0.01):
        dLdU, dLdV, dLdW, dLdb, dLdc = self.backward(x,y)
        self.U -= lr * dLdU
        self.V -= lr * dLdV
        self.W -= lr * dLdW 
        self.b -= lr * dLdb 
        self.c -= lr * dLdc 
    

    def Loss(self, x,y):
        o,h = self.forward(x)
        # print("output layer output:", o.shape)
        # print("Indexed at y ", o[np.arange(len(y)), y].shape)
        
        LOSS = -np.sum(o[np.arange(len(y)), y])
        # print("Loss indexed using arange and y: ", LOSS)
        return LOSS

# Classifying Bird sounds

## Helper functions

Going to cap the audio file to be 30 seconds long. Any below length in time will be padded with zeros, any above chopped down to 30 seconds. 

In [5]:
import torchaudio
import torch 
import matplotlib.pyplot as plt 

SAMPLE_MP3_PATH = ("/home/akinwilson/Projects/bird-sound-classifier/data/birds/Phylloscopuscollybita/Poland"
"/Phylloscopuscollybita325319.mp3")
metadata = torchaudio.info(SAMPLE_MP3_PATH)

def print_stats(waveform, sample_rate=None, src=None):
    if src:
        print("-" * 10)
        print("Source:", src)
        print("-" * 10)
    if sample_rate:
        print("Sample Rate:", sample_rate)
    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)
    print(f" - Max:     {waveform.max().item():6.3f}")
    print(f" - Min:     {waveform.min().item():6.3f}")
    print(f" - Mean:    {waveform.mean().item():6.3f}")
    print(f" - Std Dev: {waveform.std().item():6.3f}")
    print()
    print(waveform)
    print()


def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].set_xlabel("Time [s]")
        axes[c].set_ylabel("Amplitude")
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
        if xlim:
            axes[c].set_xlim(xlim)
        if ylim:
            axes[c].set_ylim(ylim)
        
    figure.suptitle(title)
    plt.show(block=False)

def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None):
    waveform = waveform.numpy()
    num_channels, _ = waveform.shape
    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].specgram(waveform[c], Fs=sample_rate, sides="onesided")
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
        if xlim:
            axes[c].set_xlim(xlim)
        axes[c].set_xlabel("Time [s]")
        axes[c].set_ylabel("Frequency [Hz]")
    figure.suptitle(title)
    plt.show(block=False)


def _get_sample(path, resample=None):
    effects = [["remix", "1"]]
    if resample:
        effects.extend(
            [
                ["lowpass", f"{resample // 2}"],
                ["rate", f"{resample}"],
            ]
        )
    return torchaudio.sox_effects.apply_effects_file(path, effects=effects)


def get_sample(SAMPLE_WAV_PATH, resample=None):
    return _get_sample(SAMPLE_WAV_PATH, resample=resample)

def get_dataset(name):
    df = pd.DataFrame.from_dict(pickle.load(open(f"./data/{name}.p", "rb")).items())
    return df 

SAMPLE_RATE = 8000

waveform, sample_rate = get_sample(SAMPLE_MP3_PATH, resample=SAMPLE_RATE)
# print("sampling rate", sample_rate)
# print_stats(waveform, sample_rate=sample_rate)
# plot_waveform(waveform, sample_rate)
# plot_specgram(waveform, sample_rate)
import pickle 
# label2id = {v:k for k,v in enumerate(get_dataset("total")[1].unique())}
# pickle.dump(label2id, open("./data/label2id.p", "wb"))



In [6]:
MAX_TIME = 5
from sklearn import preprocessing 

scaler = preprocessing.StandardScaler()

def align(torch_tensor,sample_rate, max_time = MAX_TIME):
    '''
    Padding or cutting audio file upto/downto max_time seconds
    '''
    if torch_tensor.shape[1] >  max_time * sample_rate:
        X =  torch_tensor[0,: max_time * sample_rate].unsqueeze(axis=0)
        assert X.shape == torch.Size([1, max_time * sample_rate]), f"miss match in return dim,\nexpected {torch.Size([1, max_time * sample_rate])},\n got {X.shape}"
        return X 
    elif torch_tensor.shape[1] <=  max_time * sample_rate:
        padding_num = max_time * sample_rate - torch_tensor.shape[1]
        X = torch.concat( (torch_tensor[0,:],torch.tensor([0]*padding_num) ) ).unsqueeze(axis=0)
        assert X.shape == torch.Size([1, max_time * sample_rate]), f"miss match in return dim,\nexpected {torch.Size([1, max_time * sample_rate])},\n got {X.shape}"
        return X 



scaler = preprocessing.StandardScaler()
# standardized_x = scaler.fit_transform(x)

def spectogram2seq(spectogram, label, to_numpy=True):
    ''' Going to bucketise delta t vertical strips of the spectogram generated '''
    slice = 10
    features, labels = [], []
    for i in range(spectogram.shape[1]):
        x = torch.mean(spectogram.squeeze().T[:,i:i+slice], dim=1)
        if torch.isnan(x).any():
            print(f"found nans on index: {i}")
            break
        else:
            if to_numpy:
                x = np.nan_to_num(x.numpy())
                assert np.isnan(x).any()  == False, "Print found nan values"
                assert np.isneginf(x).any() == False and np.isposinf(x).any() == False, "found inifity values in x"
                feat_standardised =   scaler.fit_transform(x[...,np.newaxis])[:,0]
                features.append(feat_standardised)
            labels.append(label)
            continue     
    return features, labels


def get_training_sample(path,label):
    waveform, sample_rate = get_sample(path, resample=SAMPLE_RATE)
    waveform = align(waveform, sample_rate=sample_rate, max_time=MAX_TIME)
    trans2spectrogram = torchaudio.transforms.Spectrogram(n_fft=256)
    spec = trans2spectrogram(waveform).log2()
    training_sample = spectogram2seq(spec, label=label)
    return  training_sample


# df = get_dataset("total")

def data_loader(dev_run=True):
    df = get_dataset("total").sample(n=5) if dev_run else get_dataset("total")
    label2id = pickle.load(open("./data/label2id.p", "rb"))
    X_tot = []
    for idx, row in df.iterrows():
        path, label = row[0], row[1]
        labelId = label2id[label]
        X_tot.append(get_training_sample(path, labelId))
    return X_tot 

# Training the model

In [8]:
loss_history = []
MAX_EPOCHS = 100
LR=0.01

model = RNN(input_dim=313, output_dim=19, hidden_dim=64)
X = data_loader(dev_run=True)

for epoch in range(MAX_EPOCHS):
    print(f"Starting echo: {epoch}")
    loss = 0
    print(f"number of sequence pairs: {len(X)}")
    for pair in tqdm(X):
        x,y  = pair
        loss += model.Loss(x, y)
        model.step(x, y, lr=LR)
        loss = loss / len(x)
    print(f"Epoch {epoch} Loss {loss}")
    loss_history.append(loss)

Starting echo: 0
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 0 Loss -1.8792177159433945e-06
Starting echo: 1
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Loss -1.3519733119506076e-24
Starting echo: 2
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2 Loss -4.6694100830015505e-31
Starting echo: 3
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3 Loss -1.5333308736112068e-34
Starting echo: 4
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4 Loss -1.9387722077641305e-34
Starting echo: 5
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5 Loss -1.9128101524606286e-31
Starting echo: 6
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 6 Loss -1.3021111562645196e-34
Starting echo: 7
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 7 Loss -4.609982767405766e-33
Starting echo: 8
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 8 Loss -9.895605273336343e-32
Starting echo: 9
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 9 Loss -8.723371783590374e-33
Starting echo: 10
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 10 Loss -5.486459801273051e-35
Starting echo: 11
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 11 Loss -5.190961685651037e-35
Starting echo: 12
number of sequence pairs: 5


  0%|          | 0/5 [00:00<?, ?it/s]

# Data splitting

In [None]:
def get_dataset(name):
    df = pd.DataFrame.from_dict(pickle.load(open(f"./data/{name}.p", "rb")).items())
    return df 


def create_clean_ds():
    files = get_file_paths()
    pool = mp.Pool(mp.cpu_count()-2)
    files_clean = pool.map(test_load_audio,files )
    files_clean = [f for f in files_clean if f is not None]
    x = dict([get_label_and_path(f) for f in files_clean])
    pickle.dump(file=open("./data/total.p","wb"), obj=x)
    

def create_train_test_split(df):
    return train_test_split( df[0], df[1], test_size=0.33, random_state=42)

Xtrain, Xtest, Ytrain, Ytest =  create_train_test_split(get_dataset("total"))

def save_train_test(data_dict, name):
    pickle.dump(data_dict, open(f"./data/{name}", "wb"))

# train = dict(zip(Xtrain, Ytrain))
# test = dict(zip(Xtest, Ytest))
# save_train_test(train, "train.p")
# save_train_test(test,"test.p")