# 1. Check environment

In [None]:
import os

# Locations that shell commands launched from this notebook (`!fairseq-train`,
# `!fairseq-generate`, ...) should see. Changing os.environ only affects child
# processes started afterwards; the running kernel's own sys.path is NOT
# modified by these assignments.
VENV_SITE_PACKAGES = "/pfs/data5/home/kit/stud/unyfv/myEnv/lib/python3.9/site-packages"
VENV_BIN = "/pfs/data5/home/kit/stud/unyfv/myEnv/bin"
# NOTE(review): this path uses ".../stud/u/..." while every other path in the
# notebook uses ".../stud/unyfv/..." - confirm that it is not a typo.
FAIRSEQ_DIR = "/pfs/data5/home/kit/stud/u/fairseq/"


def add_env_path(var, entry, prepend=True):
    """Put `entry` into the os.pathsep-separated environment variable `var`.

    Any existing occurrence of `entry` is removed first, so running this cell
    repeatedly never accumulates duplicates while `entry` still ends up at the
    requested position (front when `prepend` is true, back otherwise).

    Args:
        var: Name of the environment variable, e.g. "PATH".
        entry: Path to add.
        prepend: Insert at the front (highest priority) instead of the back.

    Returns:
        The new value of the environment variable.
    """
    current = os.environ.get(var, "")
    entries = current.split(os.pathsep) if current else []
    entries = [p for p in entries if p != entry]
    entries = [entry, *entries] if prepend else [*entries, entry]
    os.environ[var] = os.pathsep.join(entries)
    return os.environ[var]


# Debugging hints: `import sys; print(sys.executable); print(sys.path)` shows
# what the kernel itself uses; `!which python` / `!which pip` show what shell
# commands resolve to after the updates below.
add_env_path("PYTHONPATH", VENV_SITE_PACKAGES)
add_env_path("PATH", VENV_BIN)
print(os.environ["PYTHONPATH"])
add_env_path("PYTHONPATH", FAIRSEQ_DIR, prepend=False)
print(os.environ["PYTHONPATH"])

# 2. Data Preparation

## 2.1 Let's fetch some sample data.

In [None]:
from pathlib import Path
# Corpus directory handed to COMMONVOICE in the next cell; the later cells also
# write their outputs (features, manifests, vocabulary, config) underneath it.
root_dir: Path = Path("/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de")

In [None]:
import numpy as np
from torchaudio.datasets import COMMONVOICE

# Troubleshooting: an "undefined symbol" error on the import above usually
# points at mismatched torch / torchaudio builds. The original author ran this
# with torch 2.1.0 and torchaudio 2.1.0.

# One dataset object per split TSV found in root_dir.
# Sizes reported by the original author: train 595998, dev 16188, test 16188.
train_data, dev_data, test_data = [
    COMMONVOICE(root_dir, tsv=tsv_name)
    for tsv_name in ("train.tsv", "dev.tsv", "test.tsv")
]

# Sneak peek at the first training sample.
print(train_data[0])

# Troubleshooting: "Couldn't find appropriate backend to handle uri *** and
# format None". The author does not remember which step fixed it, only what was
# tried: `pip install ffmpeg pysoundfile librosa soundfile`, followed by a
# kernel restart.

## 2.2 Extract log mel filter bank features.

In [None]:
from tqdm import tqdm
from fairseq.examples.speech_to_text.data_utils import (
    create_zip,
    extract_fbank_features,
    gen_config_yaml,
    gen_vocab,
    get_zip_manifest,
    save_df_to_tsv,
)

# Directory that receives one .npy feature file per utterance.
feature_root = root_dir / "fbank80"
feature_root.mkdir(exist_ok=True)

# Visit every sample of every split and write its filter bank features to
# <feature_root>/<client_id>-<sentence_id>.npy.
for split_data in (train_data, dev_data, test_data):
    print("Extracting log mel filter bank features of audio")
    for waveform, rate, meta in tqdm(split_data):
        utt_id = f"{meta['client_id']}-{meta['sentence_id']}"
        npy_path = feature_root / f"{utt_id}.npy"
        extract_fbank_features(waveform, rate, npy_path)

# Bundle the whole feature directory into a single ZIP archive.
# The original author reports 628374 entries in the archive.
zip_path = root_dir / "fbank80.zip"
print("ZIPing features...")
create_zip(feature_root, zip_path)

## 2.3 Generating TSV (Tab Separated Values) manifest files, containing samples' metadata.

In [None]:
import pickle

# Read the per-sample paths and lengths out of the feature ZIP.
print("Fetching audio manifest...")
audio_paths, audio_lengths = get_zip_manifest(zip_path)

# Cache the result on disk so the next cell can reload it without re-reading
# the ZIP archive.
manifest_cache = root_dir / 'audio_manifest.pkl'
with manifest_cache.open('wb') as cache_file:
    pickle.dump((audio_paths, audio_lengths), cache_file)
print("Saved as audio_manifest.pkl")

In [None]:
import pickle
import pandas as pd

# Reload the (paths, lengths) lookup tables cached by the previous cell.
# NOTE: pickle.load can execute arbitrary code - only load a file that this
# notebook wrote itself, never one obtained from somewhere else.
with open(root_dir / 'audio_manifest.pkl', 'rb') as file:
    audio_paths, audio_lengths = pickle.load(file)

MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]

# All three splits are written: train-clean.tsv is read back in section 2.4
# and used as --train-subset for training, so it must exist after a fresh
# "Restart & Run All". Iterating the train split is slow because every sample
# is loaded just to read its metadata.
for dataset, split_name in zip(
    [train_data, dev_data, test_data],
    ["train-clean", "dev-clean", "test-clean"],
):
    print(f"Fetching manifest from {split_name}...")
    manifest = {c: [] for c in MANIFEST_COLUMNS}
    for _, _, metadata in tqdm(dataset):
        # Same id scheme as used for the feature file names in section 2.2.
        sample_id = f"{metadata['client_id']}-{metadata['sentence_id']}"
        manifest["id"].append(sample_id)
        manifest["audio"].append(audio_paths[sample_id])
        manifest["n_frames"].append(audio_lengths[sample_id])
        manifest["tgt_text"].append(metadata['sentence'])
        manifest["speaker"].append(metadata['client_id'])

    save_df_to_tsv(pd.DataFrame.from_dict(manifest), root_dir / f"{split_name}.tsv")

## 2.4 For text data, we use the [sentencepiece](https://github.com/google/sentencepiece) package to train a subwords segmentation model and generate the subword vocabulary (same as before).

In [None]:
import csv

# Collect the training transcripts; they are the input for the sentencepiece
# model and vocabulary built below.
# The manifest is read without quote handling and without NA detection so that
# transcripts containing '"' or consisting of tokens such as "NA" / "null"
# come back as the exact strings that were written. These options mirror
# fairseq's own load_df_from_tsv - verify against your fairseq version.
train_manifest = pd.read_csv(
    root_dir / "train-clean.tsv",
    sep="\t",
    encoding="utf-8",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
    na_filter=False,
)
train_text = train_manifest["tgt_text"].tolist()

# Explicit UTF-8: the transcripts contain non-ASCII characters and the default
# file encoding depends on the locale of the machine running the kernel.
train_text_path = root_dir / 'train_text.txt'
with open(train_text_path, 'w', encoding="utf-8") as f:
    for t in train_text:
        f.write(f"{t}\n")

# Train sentencepiece model and generate subword vocabulary.
# The vocab size depends on your dataset size.
VOCAB_SIZE = 50000
gen_vocab(
    train_text_path,
    root_dir / f"spm_unigram{VOCAB_SIZE}",
    model_type='unigram',
    vocab_size=VOCAB_SIZE,
)

## 2.5 Generate config YAML file to be used for the training script

In [None]:
# File name of the sentencepiece model produced in section 2.4.
spm_model_name = f"spm_unigram{VOCAB_SIZE}.model"

# Per the original author, the resulting file is config.yaml inside root_dir.
gen_config_yaml(root_dir, spm_filename=spm_model_name)

# 3. Training

In [None]:
# Train the speech-to-text model with fairseq-train, restricted to GPU 0 via
# CUDA_VISIBLE_DEVICES. The positional argument is the data directory prepared
# in section 2; checkpoints are written to its "models" subdirectory.
# The subset names train-clean / dev-clean match the manifest file names
# written in section 2.3 (presumably resolved as <subset>.tsv - verify).
# If you hit an error about cuDNN, load cudnn/9.2 from 'softwares' in the left
# menu bar.
!CUDA_VISIBLE_DEVICES=0 fairseq-train "/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de" \
  --save-dir "/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de/models" \
  --train-subset train-clean --valid-subset dev-clean \
  --num-workers 8 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8

# 4. Inference & Evaluation

Now we can generate transcriptions with the trained model. Notice that we set the `scoring` parameter to `wer`. WER, i.e., **Word Error Rate**, is the metric that we use to evaluate our German ASR model. WER is defined as the number of erroneously transcribed words (substitutions, deletions, and insertions) divided by the total number of words in the reference sentence.

In [None]:
# !pip install editdistance
PRED_OUTPUT_DIR = "/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de/pred_eval"
PRED_LOG = f"{PRED_OUTPUT_DIR}/en_s2t.pred.log"
!mkdir $PRED_OUTPUT_DIR
!fairseq-generate "/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de" \
    --config-yaml config.yaml --gen-subset test-clean \
    --task speech_to_text \
    --path "/pfs/data5/home/kit/stud/unyfv/cv-corpus-19.0-2024-09-13/de/models/checkpoint_best.pt" \
    --max-tokens 50000 --beam 5 --scoring wer > $PRED_LOG

In [None]:
# Split the generation log into hypothesis and reference text files.
# Lines starting with "D" have the "D-" prefix removed, their 3rd tab-separated
# field kept and every " ##" removed -> hyp.txt
# (presumably the hypotheses in fairseq-generate's output format - verify).
!grep ^D $PRED_LOG | sed 's/^D-//g' | cut -f 3 | sed 's/ ##//g' > $PRED_OUTPUT_DIR/hyp.txt
# Lines starting with "T" are handled the same way, keeping the 2nd field
# -> ref.txt (presumably the reference transcripts - verify).
!grep ^T $PRED_LOG | sed 's/^T-//g' | cut -f 2 | sed 's/ ##//g' > $PRED_OUTPUT_DIR/ref.txt
# Show the first lines of both files, separated by blank lines.
!head $PRED_OUTPUT_DIR/hyp.txt
!echo ""
!head $PRED_OUTPUT_DIR/ref.txt
!echo ""
# Last line of the log (presumably the scoring summary requested via
# --scoring wer in the previous cell).
!tail -n 1 $PRED_LOG