# DEEP SPEECH1 구현 - pytorch

CTC loss 를 이용한 deep speech 구현이다. <br>
구현 모델은 [Deep speech 이론 by YBigTa](https://github.com/YBIGTA/Deep_learning/blob/master/Deep%20speech_%EC%83%81%ED%97%8C.pdf) 에서 확인 가능하다.

### 구현 stack
OS : ubuntu 16.04 <br>
conda : 4.2.9 <br>


## 설치

오디오 I/O 를 위한 pytorch audio를 설치한다. <br>
```
sudo apt-get install sox libsox-dev libsox-fmt-all
git clone https://github.com/pytorch/audio.git
cd audio
python setup.py install
```

Requirements를 설치한다. <br>
```
sudo pip install python-levenshtein torch visdom wget librosa
```

## 데이터셋

데이터셋의 경우 카네기 멜론 대학교에서 제공한 free dataset인 **AN4** 를 사용한다.

```python
import argparse
import os
import io
import shutil
import tarfile
import wget

from utils import create_manifest

# Read the download options from the command line; argparse supplies the
# defaults and the --help text.
parser = argparse.ArgumentParser(
    description='Processes and downloads an4.',
)
parser.add_argument(
    '--target_dir',
    default='an4_dataset/',
    help='Path to save dataset',
)
parser.add_argument(
    '--sample_rate',
    default=16000,
    type=int,
    help='Sample rate',
)
# Parsed once at module level; the helper functions below read `args` directly.
args = parser.parse_args()


def _format_data(root_path, data_tag, name, wav_folder):
    """Convert one AN4 split to wav and lay it out under the target directory.

    Creates ``<target_dir>/<data_tag>/<name>/txt/`` and ``.../wav/``, converts
    the raw audio found under ``<root_path>/wav/<wav_folder>``, then hands the
    split's file-id list and transcription file to ``_format_files``.

    Args:
        root_path: Directory of the extracted AN4 archive.
        data_tag: Split name; selects ``etc/an4_<data_tag>.fileids`` and
            ``etc/an4_<data_tag>.transcription`` and names the output
            sub-directory.
        name: Directory created below the split directory.
        wav_folder: Sub-directory of ``<root_path>/wav`` holding the raw audio.

    Raises:
        OSError: If one of the output directories already exists.
    """
    # os.path.join keeps the layout correct whether or not --target_dir (or
    # root_path) ends with a slash; plain concatenation produced e.g.
    # "outtrain/an4/" for "--target_dir out".
    data_path = os.path.join(args.target_dir, data_tag, name)
    # The trailing '' component leaves a trailing separator, which
    # _format_files needs because it appends file names directly.
    new_transcript_path = os.path.join(data_path, 'txt', '')
    new_wav_path = os.path.join(data_path, 'wav', '')

    os.makedirs(new_transcript_path)
    os.makedirs(new_wav_path)

    wav_path = os.path.join(root_path, 'wav', '')
    etc_path = os.path.join(root_path, 'etc')
    file_ids = os.path.join(etc_path, 'an4_%s.fileids' % data_tag)
    transcripts = os.path.join(etc_path, 'an4_%s.transcription' % data_tag)
    train_path = os.path.join(wav_path, wav_folder)

    _convert_audio_to_wav(train_path)
    _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path)


def _convert_audio_to_wav(train_path):
    """Convert every ``*.raw`` file below ``train_path`` to a sibling ``.wav``.

    Each raw file is passed to ``sox`` as 16-bit, signed-integer, big-endian,
    single-channel audio at ``args.sample_rate``; the output is written next
    to the input with the ``.raw`` suffix replaced by ``.wav``.

    Args:
        train_path: Directory that is searched recursively for raw files.

    Raises:
        OSError: If the ``sox`` executable cannot be started.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    raw_suffix = '.raw'
    for dir_path, _dir_names, file_names in os.walk(train_path):
        for file_name in file_names:
            if not file_name.endswith(raw_suffix):
                continue
            raw_path = os.path.join(dir_path, file_name)
            # Regular files only (no symlinks), like `find -type f`.
            if os.path.islink(raw_path) or not os.path.isfile(raw_path):
                continue
            # Swap only the trailing extension; str.replace would also rewrite
            # a ".raw" that appears earlier in the path.
            new_path = raw_path[:-len(raw_suffix)] + '.wav'
            # An argument list (no shell) is safe for paths that contain
            # spaces, quotes or other shell metacharacters.
            cmd = ['sox', '-t', 'raw', '-r', str(args.sample_rate), '-b', '16',
                   '-e', 'signed-integer', '-B', '-c', '1', raw_path, new_path]
            status = subprocess.call(cmd)
            if status != 0:
                # Keep going as before, but make the failure visible.
                print('sox exited with status %d for %s' % (status, raw_path))


def _format_files(file_ids, new_transcript_path, new_wav_path, transcripts, wav_path):
    """Write one transcript file per utterance and move its wav file.

    Line ``i`` of ``file_ids`` names a wav file (relative to ``wav_path``,
    without extension) and line ``i`` of ``transcripts`` holds its text. For
    each entry the cleaned transcript is written to
    ``new_transcript_path + <basename>.txt`` as UTF-8 and the wav file is
    renamed to ``new_wav_path + <basename>.wav``.

    Args:
        file_ids: Path of the file-id list.
        new_transcript_path: Output directory prefix for the ``.txt`` files.
        new_wav_path: Output directory prefix for the ``.wav`` files.
        transcripts: Path of the transcription file.
        wav_path: Directory prefix the file ids are relative to.
    """
    with open(file_ids, 'r') as ids_file:
        with open(transcripts, 'r') as transcript_file:
            id_lines = ids_file.readlines()
            transcript_lines = transcript_file.readlines()
            for index, id_line in enumerate(id_lines):
                source_wav = wav_path + id_line.strip() + '.wav'
                # Last path component, i.e. the bare "<utterance>.wav" name.
                wav_name = source_wav.rpartition('/')[2]
                text = _process_transcript(transcript_lines, index)
                absolute_source = os.path.abspath(source_wav)
                wav_target = new_wav_path + wav_name
                txt_target = new_transcript_path + wav_name.replace('.wav', '.txt')
                with io.FileIO(txt_target, "w") as txt_out:
                    txt_out.write(text.encode('utf-8'))
                os.rename(absolute_source, wav_target)

def _process_transcript(transcripts, x):
    extracted_transcript = transcripts[x].split('(')[0].strip("<s>").split('<')[0].strip().upper()
    return extracted_transcript


def main():
    """Download AN4, convert and reorganise it, then create the manifests.

    Downloads and extracts the raw big-endian AN4 archive into the current
    directory, formats the ``train`` and ``test`` splits into
    ``args.target_dir``, deletes the extracted tree and the archive, and calls
    ``create_manifest`` for the train and test directories.

    Raises:
        OSError: If ``args.target_dir`` already exists.
    """
    root_path = 'an4/'
    name = 'an4'
    # Use the path wget reports: it picks a different file name when one with
    # the URL's base name already exists, so a hard-coded name could open a
    # stale archive.
    archive_path = wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # NOTE(review): extractall() trusts the member paths inside an archive
    # fetched over plain HTTP - consider validating members before extracting.
    with tarfile.open(archive_path) as tar:
        tar.extractall()
    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove(archive_path)
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print ('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train')
    create_manifest(test_path, 'an4_val')


# Run the download only when executed as a script.
if __name__ == '__main__':
    main()
```