# DEEP SPEECH1 구현 - pytorch

CTC loss 를 이용한 deep speech 구현이다. <br>
구현 모델은 [Deep speech 이론 by YBigTa](https://github.com/YBIGTA/Deep_learning/blob/master/RNN/deep%20speech/%EC%84%A4%EB%AA%85/Deep%20speech_%EC%83%81%ED%97%8C.pdf) 에서 확인 가능하다. <br>

본 코드는 [deep speech2 implementation](https://github.com/SeanNaren/deepspeech.pytorch/)을 상당부분 참고한다.

### 구현 stack
OS : ubuntu 16.04 <br>
conda : 4.2.9 <br>


## 설치

오디오 I/O 를 위한 pytorch audio를 설치한다. <br>
```
sudo apt-get install sox libsox-dev libsox-fmt-all
git clone https://github.com/pytorch/audio.git
cd audio
python setup.py install
```

Requirements를 설치한다. <br>
```
sudo pip install python-levenshtein torch visdom wget librosa
```

## 데이터셋

데이터셋의 경우 카네기 멜론 대학교에서 제공한 free dataset인 **AN4** 를 사용한다.

```python
import argparse
import os
import io
import shutil
import tarfile
import wget

from utils import create_manifest

# Command-line interface of the AN4 download/preparation script.
# NOTE: the arguments are parsed at module level, so `args` is a module-wide
# global; the helper functions in this script read `args.target_dir` and
# `args.sample_rate` directly instead of taking them as parameters.
parser = argparse.ArgumentParser(description='Processes and downloads an4.') 
parser.add_argument('--target_dir', default='an4_dataset/', help='Path to save dataset')
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
args = parser.parse_args()


def _format_data(root_path, data_tag, name, wav_folder):
    """Convert one AN4 split to wav and lay it out under the target directory.

    Creates ``<target_dir><data_tag>/<name>/txt/`` and ``.../wav/``, converts
    the raw audio found under ``<root_path>wav/<wav_folder>`` and then moves
    the wav files / writes the transcripts into the new folders.

    :param root_path: Root of the extracted AN4 archive (with trailing slash).
    :param data_tag: Split name used in the ``etc/an4_<tag>.*`` index files.
    :param name: Dataset name used as a sub-directory of the split.
    :param wav_folder: Sub-folder of ``wav/`` holding this split's raw audio.
    :raises OSError: If one of the output directories already exists.
    """
    output_root = args.target_dir + data_tag + '/' + name + '/'
    transcript_dir = output_root + '/txt/'
    wav_dir = output_root + '/wav/'
    for directory in (transcript_dir, wav_dir):
        os.makedirs(directory)

    source_wav_root = root_path + 'wav/'
    _convert_audio_to_wav(source_wav_root + wav_folder)
    _format_files(
        root_path + 'etc/an4_%s.fileids' % data_tag,
        transcript_dir,
        wav_dir,
        root_path + 'etc/an4_%s.transcription' % data_tag,
        source_wav_root,
    )


def _convert_audio_to_wav(train_path):
    """Convert every ``*.raw`` file below ``train_path`` to a sibling ``.wav``.

    Each file is converted with ``sox`` as 16-bit, signed-integer, big-endian,
    mono audio at ``args.sample_rate`` Hz. The directory tree is walked in
    Python and ``sox`` is started with an argument list (no shell), so paths
    containing spaces or shell metacharacters are handled correctly.

    :param train_path: Directory that is searched recursively for raw audio.
    :raises OSError: If the ``sox`` executable cannot be started.
    """
    # Local import keeps this block self-contained (the file's import header
    # does not pull in subprocess).
    import subprocess

    raw_suffix = '.raw'
    for dirpath, _dirnames, filenames in os.walk(train_path):
        for filename in filenames:
            if not filename.endswith(raw_suffix):
                continue
            raw_path = os.path.join(dirpath, filename)
            # Regular files only, mirroring the former `find -type f`.
            if os.path.islink(raw_path) or not os.path.isfile(raw_path):
                continue
            # Swap only the extension; a '.raw' elsewhere in the path is kept.
            new_path = raw_path[:-len(raw_suffix)] + '.wav'
            # The exit status is intentionally ignored (best effort), exactly
            # as the previous os.system() based implementation did.
            subprocess.call([
                'sox', '-t', 'raw', '-r', str(args.sample_rate), '-b', '16',
                '-e', 'signed-integer', '-B', '-c', '1', raw_path, new_path,
            ])


def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
    """Move the converted wav files and write one transcript file per utterance.

    Line ``i`` of ``file_ids`` is paired with line ``i`` of ``transcripts``.
    For each pair the wav file is moved into ``new_wav_path`` and the cleaned
    transcript is written (UTF-8) to ``new_transcript_path`` under the same
    base name with a ``.txt`` extension.

    :param file_ids: Path of the file listing utterance ids (relative wav paths).
    :param new_transcript_path: Output directory prefix for transcript files.
    :param new_wav_path: Output directory prefix for wav files.
    :param transcripts: Path of the transcription file.
    :param wav_path: Directory prefix under which the source wav files live.
    """
    with open(file_ids, 'r') as ids_handle, open(transcripts, 'r') as transcript_handle:
        id_lines = ids_handle.readlines()
        transcript_lines = transcript_handle.readlines()
        for index, id_line in enumerate(id_lines):
            source_wav = wav_path + id_line.strip() + '.wav'
            wav_name = source_wav.split('/')[-1]
            text = _process_transcript(transcript_lines, index)
            source_abspath = os.path.abspath(source_wav)
            text_target = new_transcript_path + wav_name.replace('.wav', '.txt')
            with io.FileIO(text_target, "w") as text_handle:
                text_handle.write(text.encode('utf-8'))
            os.rename(source_abspath, new_wav_path + wav_name)


def _process_transcript(transcripts, x):
    """Return the cleaned, upper-cased text of transcript line ``x``.

    The utterance id in parentheses, a leading ``<s>`` marker and everything
    from the next ``<`` onwards (e.g. a closing ``</s>``) are removed.

    :param transcripts: Sequence of raw transcript lines.
    :param x: Index of the line to process.
    :return: The transcript text, stripped and upper-cased.
    """
    text = transcripts[x].split('(')[0].strip()
    # Remove the marker as a prefix. str.strip("<s>") would treat it as a set
    # of characters and also eat a leading/trailing 's' of the real text.
    if text.startswith('<s>'):
        text = text[len('<s>'):]
    return text.split('<')[0].strip().upper()


def main():
    """Download AN4, convert it to wav and create the train/val manifests.

    Steps: download and extract the archive into the current directory,
    create ``args.target_dir``, format the train and test splits, delete the
    extracted tree and the archive, then build both manifests.

    :raises OSError: If ``args.target_dir`` already exists.
    """
    root_path = 'an4/'
    name = 'an4'
    archive_name = 'an4_raw.bigendian.tar.gz'
    wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Context manager so the archive handle is closed even if extraction fails.
    # NOTE(review): extractall() trusts the member paths of an archive fetched
    # over plain HTTP - consider validating members before extracting.
    with tarfile.open(archive_name) as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove(archive_name)
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train')
    create_manifest(test_path, 'an4_val')


if __name__ == '__main__':
    main()
```

## 1.1 Making audio spectrogram using Fourier transformation

In [3]:
import numpy as np
from matplotlib import pyplot as plt
import scipy.io.wavfile as wavb

from numpy.lib import stride_tricks

In [4]:
# Fourier transformation of an audio signal.
# Reference on the Hanning window: http://www.incosys.co.kr/index.php/book/1-basic-tech-vibration/2015-01-21-07-36-51-6/2015-01-21-07-36-51-6-1/2015-01-21-07-36-51-6-1-5
def stft(sig, frameSize, overlapFac=0.5, window=np.hanning):
    """Short-time Fourier transform of an audio signal.

    The signal is padded with ``frameSize // 2`` zeros at the start and
    ``frameSize`` zeros at the end, cut into overlapping frames, multiplied by
    the window and transformed with a real FFT.

    :param sig: 1-D sequence of audio samples.
    :param frameSize: Number of samples per frame (integer).
    :param overlapFac: Fraction of a frame shared with the next frame.
    :param window: Callable returning the window for a given frame size.
    :return: Complex array with one row per frame and
        ``frameSize // 2 + 1`` frequency bins per row.
    :raises ValueError: If the overlap leaves no positive hop between frames.
    """
    win = window(frameSize)
    hopSize = int(frameSize - np.floor(overlapFac * frameSize))
    if hopSize <= 0:
        raise ValueError("overlapFac must leave a positive hop size, got %r" % (overlapFac,))

    # zeros at beginning (thus center of 1st window should be for sample nr. 0)
    # np.zeros needs an integer length; np.floor returns a float.
    samples = np.append(np.zeros(int(np.floor(frameSize / 2.0))), sig)
    # cols for windowing (must be an int to be usable as an array shape)
    cols = int(np.ceil((len(samples) - frameSize) / float(hopSize))) + 1
    # zeros at end (thus samples can be fully covered by frames)
    samples = np.append(samples, np.zeros(frameSize))

    # Overlapping view onto the samples; copied so that the in-place
    # windowing below does not write through to overlapping memory.
    frames = stride_tricks.as_strided(
        samples,
        shape=(cols, frameSize),
        strides=(samples.strides[0] * hopSize, samples.strides[0]),
    ).copy()
    frames *= win

    return np.fft.rfft(frames)
    

## Dataset과 Dataloader 구현

pytorch의 dataset과 dataloader의 경우는 그 클래스가 정해져 있다. <br>

따라서 data loader와 dataset의 경우에는 사용자 정의 클래스를 만들기 위해서 **torch.utils.data**의 **Dataset**과 **DataLoader**를 상속받아야 한다. <br>

이를 통해 사용자 정의 dataset, dataloader를 구현할 것이며, <br>

본 dataset은 audio spectrogram 형태의 dataset이어야 하므로 *wav* 파일을 *spectrogram* 형태로 바꾸는 함수를 집어넣고 구현한다. <br>

추가적인 dataset, dataloader 구현 튜토리얼은 [pytorch 공식 튜토리얼](http://pytorch.org/tutorials/beginner/data_loading_tutorial.html) 을 참고한다.

In [None]:
import librosa
import numpy as np
import scipy.signal
import torch
import torchaudio
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


## 2. modeling


In [3]:
import math
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F

# RNN cell used for the fourth layer of the model.
# The Deep Speech paper (1, not 2) uses a bidirectional RNN there.
supported_rnns = {'lstm': nn.LSTM, 'rnn': nn.RNN, 'gru': nn.GRU}
# Reverse lookup from RNN class to its string key.
# (Original author's note: the inverse mapping may not really be needed.)
supported_rnns_inv = {rnn_class: rnn_name for rnn_name, rnn_class in supported_rnns.items()}

In [7]:
class SequenceWise(nn.Module):
    """Apply a module to every (time step, sample) pair of a sequence batch.

    Collapses input of dim T*N*H to (T*N)*H, applies the wrapped module and
    restores the leading T and N dimensions. Allows handling of variable
    sequence lengths and minibatch sizes.

    Original author's open question: why flatten the minibatch like this -
    is it only to support training on long sentences?

    :param module: Module to apply input to.
    """

    def __init__(self, module):
        super(SequenceWise, self).__init__()
        self.module = module

    def forward(self, x):
        """Run the wrapped module on ``x`` flattened to ``(T*N, -1)``."""
        steps, batch = x.size(0), x.size(1)
        flat_out = self.module(x.view(steps * batch, -1))
        return flat_out.view(steps, batch, -1)

    def __repr__(self):
        pieces = [self.__class__.__name__, ' (\n', self.module.__repr__(), ')']
        return ''.join(pieces)
    
class InferenceBatchSoftmax(nn.Module):
    """Log-softmax over the last dimension, applied at inference time only.

    In training mode the input is returned unchanged; in evaluation mode the
    log-softmax of the input is returned. ``self.training`` is the mode flag
    every ``nn.Module`` carries; it is switched by ``.train()`` / ``.eval()``.

    The normalised dimension is passed explicitly (``dim=-1``). Calling
    ``F.log_softmax`` without ``dim`` is deprecated and emits a warning; for
    the 2-D / 3-D inputs used here it selected this same dimension, so the
    per-sample loop plus ``torch.stack`` is no longer needed.
    """

    def forward(self, input_):
        """Return ``input_`` while training, else its log-softmax."""
        if self.training:
            return input_
        return F.log_softmax(input_, dim=-1)
        
class BatchRNN(nn.Module):
    """One recurrent layer with optional sequence-wise batch normalisation.

    When ``batch_norm`` is true the input is first normalised by a
    ``BatchNorm1d`` wrapped in ``SequenceWise``. For a bidirectional RNN the
    forward and backward outputs are summed, so the output feature size is
    ``hidden_size`` in both cases.

    Background on batch normalisation:
    https://shuuki4.wordpress.com/2016/01/13/batch-normalization-%EC%84%A4%EB%AA%85-%EB%B0%8F-%EA%B5%AC%ED%98%84/
    Original author's note: try removing the SequenceWise wrapper later.

    :param input_size: Feature size of the input.
    :param hidden_size: Hidden (and output) feature size.
    :param rnn_type: RNN class to instantiate (default ``nn.LSTM``).
    :param bidirectional: Whether the RNN runs in both directions.
    :param batch_norm: Whether to normalise the input before the RNN.
    """

    def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=False, batch_norm=True):
        super(BatchRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
        self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
                            bidirectional=bidirectional, bias=False)
        self.num_directions = 2 if bidirectional else 1

    def forward(self, x):
        """Return the RNN output sequence for ``x`` (directions summed)."""
        if self.batch_norm is not None:
            x = self.batch_norm(x)
        x, _ = self.rnn(x)
        if self.bidirectional:
            # (TxNxH*2) -> (TxNxH) by sum
            x = x.view(x.size(0), x.size(1), 2, -1).sum(2).view(x.size(0), x.size(1), -1)
        # Must sit outside the branch above: a unidirectional layer would
        # otherwise return None.
        return x

In [None]:

class DeepSpeech(nn.Module):
    """Convolutional front-end, stacked ``BatchRNN`` layers and a linear output layer.

    :param rnn_type: RNN class used for every recurrent layer.
    :param labels: Output alphabet. Only its length is used in this class, as
        the size of the final linear layer; the default ``"abc"`` is merely a
        placeholder. It is stored so that it can be serialized with the model.
    :param rnn_hidden_size: Hidden size of every recurrent layer.
    :param nb_layers: Number of recurrent layers.
    :param audio_conf: Optional dict. ``sample_rate`` (default 16000) and
        ``window_size`` (default 0.02) are read to size the first recurrent
        layer; the dict is stored for serialization.
    :param bidirectional: Whether the recurrent layers are bidirectional.
    """

    def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, audio_conf=None,
                 bidirectional=True):
        super(DeepSpeech, self).__init__()

        # model metadata needed for serialization/deserialization
        self._version = '0.0.1'
        self._hidden_size = rnn_hidden_size
        self._hidden_layers = nb_layers
        self._rnn_type = rnn_type
        self._audio_conf = audio_conf or {}  # None (or empty) -> fresh dict
        self._labels = labels

        sample_rate = self._audio_conf.get("sample_rate", 16000)
        window_size = self._audio_conf.get("window_size", 0.02)
        num_classes = len(self._labels)

        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=(41, 11), stride=(2, 2)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True),
            nn.Conv2d(32, 32, kernel_size=(21, 11), stride=(2, 1)),
            nn.BatchNorm2d(32),
            nn.Hardtanh(0, 20, inplace=True)
        )
        # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
        # The floor covers the whole quotient (it used to wrap only the
        # numerator); results for the usual positive sizes are unchanged.
        rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
        rnn_input_size = int(math.floor((rnn_input_size - 41) / 2 + 1))
        rnn_input_size = int(math.floor((rnn_input_size - 21) / 2 + 1))
        rnn_input_size *= 32  # 32 output channels of the last convolution

        # The first recurrent layer consumes the conv features and has no
        # batch norm; the remaining layers map hidden -> hidden.
        rnns = []
        rnn = BatchRNN(input_size=rnn_input_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
                       bidirectional=bidirectional, batch_norm=False)
        rnns.append(('0', rnn))
        for x in range(nb_layers - 1):
            rnn = BatchRNN(input_size=rnn_hidden_size, hidden_size=rnn_hidden_size, rnn_type=rnn_type,
                           bidirectional=bidirectional)
            rnns.append(('%d' % (x + 1), rnn))
        self.rnns = nn.Sequential(OrderedDict(rnns))
        fully_connected = nn.Sequential(
            nn.BatchNorm1d(rnn_hidden_size),
            nn.Linear(rnn_hidden_size, num_classes, bias=False)
        )
        self.fc = nn.Sequential(
            SequenceWise(fully_connected),
        )
        self.softmax = InferenceBatchSoftmax()

    def forward(self, x):
        """Run the model on a batch of spectrograms.

        :param x: 4-D input of the first ``Conv2d`` (one input channel).
            Presumably laid out as (batch, 1, frequency, time) - TODO confirm
            against the data loader.
        :return: Per-frame class scores with the batch dimension first.
        """
        x = self.conv(x)

        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # Collapse feature dimension
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # TxNxH

        x = self.rnns(x)

        x = self.fc(x)
        x = x.transpose(0, 1)  # back to batch-first
        x = self.softmax(x)
        return x

    @classmethod
    def load_model(cls, path, cuda=False):
        """Rebuild a model from a package written by :meth:`serialize`.

        :param path: File containing the saved package.
        :param cuda: If true, wrap the model in ``DataParallel`` and move it
            to the GPU.
        :return: The restored model (a ``DataParallel`` wrapper when ``cuda``).
        :raises KeyError: If the stored ``rnn_type`` is not in ``supported_rnns``.
        """
        # NOTE(review): torch.load unpickles the file - only load checkpoints
        # from trusted sources.
        package = torch.load(path, map_location=lambda storage, loc: storage)
        model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'],
                    labels=package['labels'], audio_conf=package['audio_conf'],
                    rnn_type=supported_rnns[package['rnn_type']])
        model.load_state_dict(package['state_dict'])
        if cuda:
            model = torch.nn.DataParallel(model).cuda()
        return model

    @staticmethod
    def _unwrap(model):
        """Return the underlying model if ``model`` is a ``DataParallel`` wrapper."""
        # Test for the wrapper itself rather than for CUDA parameters: a CUDA
        # model need not be wrapped, and a wrapped model need not be on CUDA.
        return model.module if isinstance(model, nn.DataParallel) else model

    @staticmethod
    def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=None,
                  cer_results=None, wer_results=None, avg_loss=None, meta=None):
        """Build the dict that describes ``model`` (and optional training state).

        :param model: A ``DeepSpeech`` instance, optionally wrapped in ``DataParallel``.
        :param optimizer: If given, its ``state_dict()`` is stored as ``optim_dict``.
        :param epoch: If given, stored incremented by one.
        :param iteration: If given, stored as is.
        :param loss_results: If given, stored together with ``cer_results``
            and ``wer_results`` (whatever their values are).
        :param cer_results: Stored only when ``loss_results`` is given.
        :param wer_results: Stored only when ``loss_results`` is given.
        :param avg_loss: If given, stored as is.
        :param meta: If given, stored as is.
        :return: The package dict; nothing is written to disk here.
        """
        model = DeepSpeech._unwrap(model)
        package = {
            'version': model._version,
            'hidden_size': model._hidden_size,
            'hidden_layers': model._hidden_layers,
            'rnn_type': supported_rnns_inv.get(model._rnn_type, model._rnn_type.__name__.lower()),
            'audio_conf': model._audio_conf,
            'labels': model._labels,
            'state_dict': model.state_dict()
        }
        if optimizer is not None:
            package['optim_dict'] = optimizer.state_dict()
        if avg_loss is not None:
            package['avg_loss'] = avg_loss
        if epoch is not None:
            package['epoch'] = epoch + 1  # increment for readability
        if iteration is not None:
            package['iteration'] = iteration
        if loss_results is not None:
            package['loss_results'] = loss_results
            package['cer_results'] = cer_results
            package['wer_results'] = wer_results
        if meta is not None:
            package['meta'] = meta
        return package

    @staticmethod
    def get_labels(model):
        """Return the labels stored on ``model`` (unwrapping ``DataParallel``)."""
        return DeepSpeech._unwrap(model)._labels

    @staticmethod
    def get_param_size(model):
        """Return the total number of scalar parameters in ``model``."""
        return sum(p.numel() for p in model.parameters())

    @staticmethod
    def get_audio_conf(model):
        """Return the audio configuration stored on ``model`` (unwrapping ``DataParallel``)."""
        return DeepSpeech._unwrap(model)._audio_conf



In [2]:
len("abc")

3