In [1]:
import argparse
import os
import sys
from typing import List

import torch

sys.path.append("../")
from datasets import Dataset, load_dataset
from speechline.transcribers import Wav2Vec2Transcriber

torch_dtype = torch.bfloat16
transcriber = Wav2Vec2Transcriber("bookbot/w2v-bert-2.0-bb-libri-cv-giga-dean2zak", None)

dataset = load_dataset("bookbot/bookbot_en_v3_parakeet-ctc-1.1b_filtered", "default", split="train", num_proc=os.cpu_count())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading dataset shards:   0%|          | 0/22 [00:00<?, ?it/s]

In [24]:
from gruut import sentences
def gruut_g2p(text: str):
        phonemes = []
        for words in sentences(text, lang="EN"):
            for word in words:
                if word.is_major_break or word.is_minor_break:
                    phonemes.append(word.text)
                elif word.phonemes:
                    phonemes.append(" ".join(word.phonemes))
        return "".join(phonemes)
    
print(gruut_g2p("Hello, World!"))

from g2p_id import G2p
def g2p_id(text: str):
    g2p = G2p()
    results = g2p(text)
    results = [phoneme for word in results for phoneme in word ]
    return " ".join(results)
print(g2p_id("Hello"))


h ɛ l ˈoʊ,w ˈɚ l d!
h e l l o


In [2]:
from azure.cosmos import CosmosClient
import os

COSMOS_DB_KEY = os.getenv('COSMOS_DB_KEY')
COSMOS_URL = "https://bookbot.documents.azure.com:443/"


def get_cosmos_client(url, key, database_name):
    """Initialize and return a CosmosDB client."""
    client = CosmosClient(url, credential=key)
    database = client.get_database_client(database_name)
    word_container = database.get_container_client("WordUniversal")
    return word_container

def get_lexicon(word_container, language_code):
    """Retrieve the lexicon for a specific language from CosmosDB."""
    query = f'SELECT * FROM c WHERE c.language = "{language_code}" and not is_defined(c.deletedAt)'
    query_iterable = word_container.query_items(
        query=query,
        partition_key="default",
        max_item_count=10000,
    )
    lexicon = {}
    for item in query_iterable:
        if "lexicons" in item:
            lexicon[item["word"]] = set(item["lexicons"])
    return lexicon


cosmos_client = get_cosmos_client(COSMOS_URL, COSMOS_DB_KEY, "Bookbot")
# cosmos_lexicon = get_lexicon(cosmos_client, "en")

In [3]:
from speechline.segmenters import PhonemeOverlapSegmenter
from lexikos import Lexicon as Lexicos
from g2p_id import G2p

class Lexicon(PhonemeOverlapSegmenter):
    def __init__(self, language):
        self.language = language
        self.cosmos_lexicon = get_lexicon(cosmos_client, language)
        self._init_g2p(language)   
        
        # If language is english, use Lexicos
        if language == "en":
            lexicos_lexicon = Lexicos()
            for k, v in lexicos_lexicon.items():
                self.cosmos_lexicon[k] = self.cosmos_lexicon[k].union(set(v)) if k in self.cosmos_lexicon else set(v)
        super().__init__(self.cosmos_lexicon)
        
    def gruut_g2p(self, text: str) -> List[str]:
        phonemes = []
        for words in sentences(text, lang=self.language):
            for word in words:
                if word.is_major_break or word.is_minor_break:
                    phonemes.append(word.text)
                elif word.phonemes:
                    phonemes.append(" ".join(word.phonemes))
        return phonemes
    
    def g2p_id(self, text: str) -> List[str]:
        g2p = G2p()
        return " ".join(g2p(text)[0])
    
    def _init_g2p(self, language):
        if language == "en":
            self.g2p = self.gruut_g2p
        elif language == "id":
            self.g2p = self.g2p_id
        elif language == "sw":
            self.g2p = self.gruut_g2p
        
    def _normalize_text(self, text: str) -> str:
        text = text.lower().strip()
        return text  
    
    def _generate_combinations(self, ground_truth: List[str]) -> List[List[str]]:
        """
        Generate all possible phoneme combinations for a given word.

        Args:
            ground_truth (List[str]):
                List of words.

        Returns:
            List[List[str]]:
                List of phoneme combinations.
        """
        combinations = []
        for word in ground_truth:
            normalized_word = self._normalize_text(word)
            if normalized_word in self.lexicon:
                phonemes = self.lexicon[normalized_word]
            else:
                phonemes = self.g2p(normalized_word)
            combinations.append(phonemes)
        return combinations
    
lexicon = Lexicon("en")
# ground_truth = ["Hello", "World"]
# ground_truth = lexicon._generate_combinations(ground_truth)


In [4]:
from datasets import Dataset
from speechline.utils.tokenizer import WordTokenizer

filtered_dataset = {"audio": [], "transcript": [], "text": [], "language": [], "speaker": []}
dataset = dataset.select(range(100))

tokenizer = WordTokenizer()

def check_phoneme_match(phoneme_transcript, ground_truth):
    """
    Check if each phoneme in transcript exists in corresponding ground truth set
    
    Args:
        phoneme_transcript (List[str]): List of phonemes from transcript
        ground_truth (List[Set[str]]): List of sets containing valid phonemes
    
    Returns:
        bool: True if all phonemes match their ground truth sets
    Example:
        phoneme_transcript = ['ɪn', 'ðɛɹ', 'deɪ']
        ground_truth = [{'ɪ n', 'ɪ ŋ'}, {'ð ɛ ɹ', 'ð ɛ r'}, {'d e ɪ', 'd e ɪ'}]
    """
    # Check lengths match first
    if len(phoneme_transcript) != len(ground_truth):
        return False
        
    # Check each phoneme against its ground truth set
    for phoneme, valid_phonemes in zip(phoneme_transcript, ground_truth):
        # Remove spaces from transcript phoneme for comparison
        valid_phonemes_no_spaces = {p.replace(" ", "") for p in valid_phonemes}
        
        if phoneme not in valid_phonemes_no_spaces:
            return False
            
    return True


for datum in dataset:
    audio_data = datum["audio"].copy()
    result = transcriber.pipeline(audio_data, chunk_length_s=30)
    phoneme_transcript = result["text"]
    list_phoneme_transcript = phoneme_transcript.split()
    
    # Get ground all lexicon combinations for each word from the ground_truth text
    list_ground_truth_phonemes = lexicon._generate_combinations(tokenizer(datum["text"]))

    # print(f"Phoneme Transcript: {phoneme_transcript}")
    # print(f"List Phoneme transcript: {list_phoneme_transcript}")
    # print(f"Ground truth text: {datum['text']}")
    # print(f"Ground truth: {list_ground_truth_phonemes}")
    
    if not check_phoneme_match(list_phoneme_transcript, list_ground_truth_phonemes):
        continue
    filtered_dataset["audio"].append(datum["audio"])
    filtered_dataset["transcript"].append(phoneme_transcript)
    filtered_dataset["text"].append(datum["text"])
    filtered_dataset["language"].append(datum["language"])
    filtered_dataset["speaker"].append(datum["speaker"])
    
filtered_dataset = Dataset.from_dict(filtered_dataset)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [17]:
filtered_dataset.push_to_hub("bookbot/bookbot_en_v3_parakeet-ctc-1.1b_filtered_phoneme")