## Environment setup

In this section we install all the system and Python libraries necessary for the project.


In [None]:
%%sh
# Report which Python interpreter and pip versions this runtime provides.
python3 --version
pip3 --version

In [None]:
# Work from the filesystem root: later cells clone into the current directory and then
# refer to the results by absolute path (e.g. /DeepSpeech, /Scribosermo).
%cd /

In [None]:
%%sh
# Bring the already-installed system packages up to date.
apt-get update && apt-get upgrade --yes
# libsndfile1: system library for reading and writing audio files.
apt-get update && apt-get install --yes libsndfile1

In [None]:
%%sh
# KenLM: install the Boost development packages, then build from a shallow clone.
apt-get update && apt-get install --yes libboost-all-dev
git clone --depth=1 https://github.com/kpu/kenlm.git
# Out-of-tree build in kenlm/build, using one make job per available CPU.
mkdir --parents kenlm/build \
  && cd kenlm/build \
  && cmake .. \
  && make --jobs="$(nproc)"

In [None]:
%%sh
# PocoLM: install subversion, then build from a shallow clone.
apt-get update && apt-get install --yes subversion
git clone --depth=1 https://github.com/danpovey/pocolm.git
# Build in place, using one make job per available CPU.
cd pocolm/ && make --jobs="$(nproc)"

In [None]:
%%sh
# Use some tools from the DeepSpeech project (shallow clone into the current directory).
git clone --depth=1 https://github.com/mozilla/DeepSpeech.git
# CTC decoder: add `--always` to the `git describe` call in the workspace status script
# (required for building with a shallow git clone).
sed --in-place 's/git describe --long --tags/git describe --long --tags --always/g' /DeepSpeech/native_client/bazel_workspace_status_cmd.sh
apt-get update && apt-get install --yes libmagic-dev
# Build the Python bindings, then install the wheel(s) produced in dist/.
cd /DeepSpeech/native_client/ctcdecode && make NUM_PROCESSES="$(nproc)" bindings
pip3 install -U /DeepSpeech/native_client/ctcdecode/dist/*.whl

In [None]:
%%sh
# Get the prebuilt scorer generator: download the latest native_client release archive
# into /DeepSpeech/data/lm/ and unpack it there.
cd /DeepSpeech/data/lm/ \
  && curl --location --remote-name https://github.com/mozilla/DeepSpeech/releases/latest/download/native_client.amd64.cpu.linux.tar.xz \
  && tar -xvf native_client.*.tar.xz

In [None]:
%%sh
# Work around a broken pip ("ImportError: No module named pip._internal.cli.main")
# by upgrading pip through the interpreter's own module entry point.
python3 -m pip install -U pip

In [None]:
%%sh
# Dependencies for noise normalization
apt-get update && apt-get install -y ffmpeg
# Use pip3, like every other install cell in this notebook, so the packages land in the
# same Python 3 environment regardless of what a bare `pip` resolves to.
pip3 install --no-cache-dir --upgrade pydub librosa

In [None]:
%%sh
# Pre-install the heavier dependencies of the training package so that its own
# installation is faster. Version bounds are unchanged (specifier order is irrelevant).
pip3 install --no-cache-dir pandas
pip3 install --no-cache-dir "tensorflow>=2.4,<2.5"
pip3 install --no-cache-dir "tensorflow-addons<0.13"
pip3 install --no-cache-dir "tensorflow-io<0.18"

In [None]:
%%sh
# Install audiomate, with some fixes
apt-get update && apt-get install -y sox libsox-fmt-mp3
pip3 install --no-cache-dir audiomate

# Patch the installed `pget` package: its relative import and its `print` statement
# are Python 2 syntax. The package directory is looked up from the active interpreter
# instead of hard-coding a python3.7 dist-packages path, so the patch also works on
# other Python versions and install layouts. find_spec() on a top-level name does not
# execute the package, so the lookup succeeds even while the package is still broken.
PGET_DIR="$(python3 -c 'import importlib.util, os; spec = importlib.util.find_spec("pget"); print(os.path.dirname(spec.origin) if spec and spec.origin else "")')"
if [ -z "$PGET_DIR" ]; then
  # Fail loudly, as the original sed on a missing file would have done.
  echo "pget package not found; cannot apply the Python 3 fixes" >&2
  exit 1
fi
sed -i 's/from down import Downloader/from pget.down import Downloader/g' "$PGET_DIR/__init__.py"
sed -i 's/print "Resume is not applicable at this stage."/print("Resume is not applicable at this stage.")/g' "$PGET_DIR/down.py"

In [None]:
%%sh
# Training profiler (TensorBoard plugin)
pip3 install -U --no-cache-dir tensorboard-plugin-profile

# Dependency to draw graph images
apt-get update && apt-get install --yes graphviz

# TfLite runtime, resolved with the Coral package index as an additional source
pip3 install --no-cache-dir --extra-index-url https://google-coral.github.io/py-repo/ tflite_runtime

# Install corcua from a shallow clone, in editable mode
git clone --depth=1 https://gitlab.com/Jaco-Assistant/corcua.git
pip3 install --no-cache-dir --editable corcua/

In [None]:
%%sh
# Fetch Scribosermo into /Scribosermo. The explicit target removes the dependency on the
# current working directory, and the guard lets the cell be re-run without a clone error.
[ -d /Scribosermo/.git ] || git clone https://gitlab.com/Jaco-Assistant/Scribosermo /Scribosermo

# Mirror the training package at /training. Copying the directory *contents* (src/.)
# into an existing directory is idempotent; `cp -r src/ /training/` would nest a second
# copy at /training/training when the cell is run again.
# NOTE(review): presumably some code expects the package at /training — confirm.
mkdir -p /training
cp -r /Scribosermo/training/. /training/

# Install the training package in editable mode from the clone.
pip3 install --no-cache-dir -e /Scribosermo/training/

**Restart your runtime**

## Download the datasets, pre-trained models, and language model

In [None]:
# Make sure the working directory is the filesystem root: the directories created and
# the files downloaded in this section are later referenced as /checkpoints, /pb.zip, etc.
%cd /

The file structure should look like this (in this notebook, `my_speech2text_folder` is the filesystem root `/`):

```text
my_speech2text_folder
    checkpoints
    corcua
    data_original
    data_prepared
    Scribosermo
```

In [None]:
!mkdir checkpoints data_original data_prepared
!mkdir /data_prepared/langmodel/

### Language model

Download a pre-trained language model, which is used to improve the quality of the model's output.

Go to https://www.mediafire.com/file/pzj8prgv2h0c8ue/kenlm_de_all.scorer/file and copy the download link

In [None]:
# !wget <generated_download_link>
!wget https://download1076.mediafire.com/cjxjvjk3plvg/pzj8prgv2h0c8ue/kenlm_de_all.scorer

In [None]:
!mv /kenlm_de_all.scorer /data_prepared/langmodel/de.scorer

### Pre-trained model

Go to https://www.mediafire.com/folder/jh5unptizgzou/d37cv-wer0066,
then click on `pb.zip` and copy the generated download link.

In [None]:
# !wget <generated_download_link>
# NOTE(review): the URL below looks like a generated MediaFire link and may have expired —
# replace it with a freshly copied link if the download fails.
# --backups=1: if pb.zip already exists it is kept as pb.zip.1 and the new download
# is saved under the original name.
!wget --backups=1 https://download1582.mediafire.com/mw2rjh12qsmg/udh6fnf3lcpbt71/pb.zip

Go to https://www.mediafire.com/folder/jh5unptizgzou/d37cv-wer0066,
then click on `config_export.json` and copy the generated download link.

In [None]:
# NOTE(review): the URL below looks like a generated MediaFire link and may have expired —
# replace it with a freshly copied link if the download fails.
# --backups=1: an existing config_export.json is kept as config_export.json.1 and the
# new download is saved under the original name.
!wget --backups=1 https://download1590.mediafire.com/lkwo10fv222g/u05b4i8z6b940kc/config_export.json

In [None]:
# move the model and config file to the required directories
!mkdir -p /checkpoints/de/cvd37/
!mkdir -p /checkpoints/de/cvd37-2/

!unzip /pb.zip -d /checkpoints/de/cvd37/
!mv /checkpoints/de/cvd37/pb/* /checkpoints/de/cvd37/
!rm -r /checkpoints/de/cvd37/pb
!cp -r /checkpoints/de/cvd37/* /checkpoints/de/cvd37-2/

### Fix model save

Save the model's weights instead of the entire model. This is to circumvent a problem when loading the model.

In [None]:
import tensorflow as tf


# Load the full saved model once (on the CPU) and write weights-only checkpoints
# into both checkpoint directories.
with tf.device("CPU"):
    model = tf.keras.models.load_model("/checkpoints/de/cvd37-2/")
    for weights_target in ("/checkpoints/de/cvd37-2/", "/checkpoints/de/cvd37/"):
        model.save_weights(weights_target)

### Datasets

**Restart your runtime before proceeding**

Download the dataset and preprocess it, converting all audio files to a 16 kHz sampling rate.

In [None]:
from corcua import downloaders, readers, writers


# 1) Download the VoxForge corpus for language code "de" (overwriting any previous download).
voxforge_downloader = downloaders.voxforge.Downloader()
voxforge_downloader.download_dataset(
    path="/data_original/de/voxforge/",
    overwrite=True,
    args={"language": "de"},
)

# 2) Load the downloaded corpus.
voxforge_reader = readers.voxforge.Reader()
ds = voxforge_reader.load_dataset({"path": "/data_original/de/voxforge/"})

# 3) Write it out to the prepared-data folder with a sample rate of 16,000.
dataset_writer = writers.base_writer.Writer()
dataset_writer.save_dataset(
    ds,
    path="/data_prepared/de/voxforge/",
    sample_rate=16_000,
    overwrite=True,
)

Split the data into train, development, and test sets (80%, 10%, and 10%, respectively; you can change the percentages as you wish).

In [None]:
!python3 /Scribosermo/preprocessing/split_dataset.py /data_prepared/de/voxforge/all.csv --split '80|10|10' --file_appendix _s

In [None]:
# Remove very short, very long and quick samples.
# Each call reads one `*_s.csv` split and writes a `*_clean.csv` file; those output
# files are the ones the training config points to.
# NOTE(review): the exact effect of --replace / --exclude / --clean is defined by
# dataset_operations.py and is not visible here — confirm against that script.

!python3 /Scribosermo/preprocessing/dataset_operations.py \
"/data_prepared/de/voxforge/train_s.csv" \
"/data_prepared/de/voxforge/train_clean.csv" --replace --exclude --clean

!python3 /Scribosermo/preprocessing/dataset_operations.py \
"/data_prepared/de/voxforge/dev_s.csv" \
"/data_prepared/de/voxforge/dev_clean.csv" --replace --exclude --clean

# Don't clean the test split: it is processed without the --clean flag
# (the output file is still named test_clean.csv).
!python3 /Scribosermo/preprocessing/dataset_operations.py \
"/data_prepared/de/voxforge/test_s.csv" \
"/data_prepared/de/voxforge/test_clean.csv" --replace --exclude

## Create training config

The `config` values control the training process.

Here you can set the batch size, the learning rate, and the number of epochs

In [None]:
import json
import yaml


# Start from the configuration that was downloaded with the pre-trained model.
with open("/config_export.json", "r") as config_file:
    config = json.load(config_file)

# Point the dataset splits at the prepared VoxForge CSV files.
config["data_paths"] = {"eval": "/data_prepared/de/voxforge/dev_clean.csv",
                        "test": "/data_prepared/de/voxforge/test_clean.csv",
                        "train": "/data_prepared/de/voxforge/train_clean.csv"}

# Use the language-model scorer placed under /data_prepared/langmodel/.
scorer_settings = config["scorer"]
scorer_settings["path"] = "/data_prepared/langmodel/de.scorer"

Print the `config` dict to view all other available options

In [None]:
# Per-split batch sizes.
config["batch_sizes"] = {"train": 32, "eval": 32, "test": 1}

# Optimizer learning rate.
config["optimizer"]["learning_rate"] = 1e-5

# Number of training epochs, and whether the base network is frozen.
config.update({"training_epochs": 200, "freeze_base_net": False})

In [None]:
# save the config file
with open("/config_export_modified.json", "w") as f:
    json.dump(config, f)

with open("/Scribosermo/training/config/train_config.yaml", "w") as f:
    yaml.dump(config, f, default_flow_style=False)

!cp /config_export_modified.json /checkpoints/de/cvd37-2/config_export.json
!cp /config_export_modified.json /checkpoints/de/cvd37/config_export.json

## Train

In [None]:
# Run this cell to start training.
# NOTE(review): presumably run_train.py picks up the train_config.yaml written by the
# previous cell — confirm against the script.
!python3 /Scribosermo/training/run_train.py