In [12]:
!pip install librosa

Defaulting to user installation because normal site-packages is not writeable
Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Using cached librosa-0.10.2.post1-py3-none-any.whl (260 kB)
Using cached audioread-3.0.1-py3-none-any.whl (23 kB)
Using cached pooch-1.8.2-py3-none-any.whl (64 kB)
Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.0 MB 1.3 MB/s eta 0:00:01
   -- --------------

In [10]:
import os
import yaml


class BaseModelConfigs:
    """Base container for model settings that can be written to and read from a YAML file.

    Settings may be declared as class attributes or assigned on the instance.
    ``model_path`` is the directory that :meth:`save` writes the YAML file into.
    """

    def __init__(self):
        self.model_path = None

    def serialize(self):
        """Collect the configuration values into a plain dictionary.

        Class-level attributes of ``type(self)`` are gathered first and instance attributes
        are applied on top, so an instance value wins over a class default of the same name.
        Dunder entries, methods and descriptors are skipped: they are not configuration data
        and would otherwise end up in the YAML dump.

        Returns:
            dict: attribute name -> value.
        """
        class_attributes = {
            key: value
            for key, value in type(self).__dict__.items()
            if not (key.startswith("__") and key.endswith("__"))
            and not callable(value)
            and not isinstance(value, (staticmethod, classmethod, property))
        }

        # Start from the class defaults, then let instance attributes override duplicates.
        all_attributes = dict(class_attributes)
        all_attributes.update(self.__dict__)

        return all_attributes

    def save(self, name: str = "configs.yaml"):
        """Write the serialized configuration to ``<model_path>/<name>``.

        Args:
            name: File name of the YAML file inside ``model_path``.

        Raises:
            Exception: If ``model_path`` has not been set.
        """
        if self.model_path is None:
            raise Exception("Model path is not specified")

        # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
        os.makedirs(self.model_path, exist_ok=True)

        with open(os.path.join(self.model_path, name), "w") as f:
            yaml.dump(self.serialize(), f)

    @staticmethod
    def load(configs_path: str):
        """Read a YAML file and return a ``BaseModelConfigs`` carrying its entries as attributes.

        Args:
            configs_path: Path to a YAML file, e.g. one produced by :meth:`save`.

        Returns:
            BaseModelConfigs: A new instance with one attribute per top-level YAML key.
        """
        with open(configs_path, "r") as f:
            # NOTE(review): only point this at trusted files; if the configs hold nothing but
            # plain scalars/lists/dicts, yaml.safe_load would be the stricter choice.
            configs = yaml.load(f, Loader=yaml.FullLoader)

        config = BaseModelConfigs()
        # An empty YAML document loads as None; treat it as "no settings".
        for key, value in (configs or {}).items():
            setattr(config, key, value)

        return config

In [11]:
from datetime import datetime
class ModelConfigs(BaseModelConfigs):
    """Settings for the sound-to-text model trained in this notebook."""

    def __init__(self):
        super().__init__()

        # Output directory: one sub-folder per run, named after the minute the run was created.
        run_stamp = datetime.now().strftime("%Y%m%d%H%M")
        self.model_path = os.path.join("Models/05_sound_to_text", run_stamp)

        # STFT parameters handed to the spectrogram extraction.
        self.frame_length = 256
        self.frame_step = 160
        self.fft_length = 384

        # Characters the model may emit; anything else is dropped from the labels.
        self.vocab = "abcdefghijklmnopqrstuvwxyz'?! "

        # Left empty here and filled in after the dataset has been scanned.
        self.input_shape = None
        self.max_text_length = None
        self.max_spectrogram_length = None

        # Training hyper-parameters.
        self.batch_size = 8
        self.learning_rate = 0.0005
        self.train_epochs = 1000
        self.train_workers = 20

In [14]:
import typing
import importlib
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import logging

def import_librosa(object) -> None:
    """Attach the librosa module to ``object`` as ``object.librosa`` if it is not there yet.

    The import is done lazily through ``importlib`` so librosa is only required once audio
    is actually processed.

    Args:
        object: Any object (instance or class) that accepts attribute assignment. If it
            already has a ``librosa`` attribute exposing ``__version__``, nothing happens.

    Raises:
        ImportError: If librosa cannot be imported. The original import error is chained
            as ``__cause__`` so the real reason stays visible.
    """
    # hasattr only swallows AttributeError, unlike the bare ``except`` it replaces.
    if hasattr(getattr(object, "librosa", None), "__version__"):
        return

    try:
        object.librosa = importlib.import_module('librosa')
    except ImportError as exc:
        raise ImportError(
            "librosa is required to augment Audio. Please install it with `pip install librosa`."
        ) from exc

    print("librosa version:", object.librosa.__version__)
        
class WavReader:
    """Read WAV files and turn them into normalized magnitude spectrograms.

    An instance stores the STFT parameters and can be called with ``(audio_path, label)``;
    the static helpers can also be used directly without creating an instance.
    """

    def __init__(self, frame_length, frame_step, fft_length,*args, **kwargs):
        """Store the STFT parameters and make sure librosa is available.

        Args:
            frame_length: Window length passed to the STFT as ``win_length``.
            frame_step: Hop length between successive STFT frames.
            fft_length: FFT size passed to the STFT as ``n_fft``.
            *args, **kwargs: Accepted and ignored.
        """
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        # Switch matplotlib to non-interactive mode.
        matplotlib.interactive(False)
        import_librosa(self)

    @staticmethod
    def get_spectrogram(wav_path: str, frame_length: int, frame_step: int, fft_length: int) -> np.ndarray:
        """Compute a normalized, square-root-compressed magnitude spectrogram of a WAV file.

        Args:
            wav_path: Path to the audio file.
            frame_length: STFT window length (``win_length``).
            frame_step: STFT hop length.
            fft_length: STFT FFT size (``n_fft``).

        Returns:
            np.ndarray: 2-D array laid out as ``(frames, frequency_bins)``, with zero mean
            and (approximately) unit standard deviation over the whole array.
        """
        import_librosa(WavReader)
        # NOTE(review): no ``sr`` is passed, so librosa's default sample rate applies here,
        # whereas plot_audio defaults to 16000 — confirm which rate is intended.
        audio, _sample_rate = WavReader.librosa.load(wav_path)

        # ``fft_length`` used to be ignored, which silently fell back to librosa's default n_fft.
        # librosa returns (frequency_bins, frames); transpose so that axis 0 is the frame axis,
        # which is how the rest of this notebook reads the shape.
        spectrogram = WavReader.librosa.stft(
            audio, n_fft=fft_length, hop_length=frame_step, win_length=frame_length
        ).T

        # Magnitude, compressed with a square root to reduce the dynamic range.
        spectrogram = np.power(np.abs(spectrogram), 0.5)
        # Standardize; the epsilon guards against division by zero on constant input.
        spectrogram = (spectrogram - np.mean(spectrogram)) / (np.std(spectrogram) + 1e-10)

        return spectrogram

    @staticmethod
    def plot_audio(wav_path: str, title: str = None, sr: int = 16000) -> None:
        """Plot the waveform of an audio file against time in seconds.

        Args:
            wav_path: Path to the audio file.
            title: Figure title; "Audio Plot" is used when empty or None.
            sr: Sample rate the audio is loaded at.
        """
        import_librosa(WavReader)
        audio, orig_sr = WavReader.librosa.load(wav_path, sr=sr)

        duration = len(audio) / orig_sr
        time = np.linspace(0, duration, num=len(audio))

        plt.figure(figsize=(15, 5))
        plt.plot(time, audio)
        plt.title(title or "Audio Plot")
        plt.ylabel("signal wave")
        plt.xlabel("time (s)")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def plot_spectrogram(spectrogram: np.ndarray, title:str = "", transpose: bool = True, invert: bool = True) -> None:
        """Show a spectrogram as an image.

        Args:
            spectrogram: 2-D array to display.
            title: Text appended to the "Spectrogram: " figure title.
            transpose: Swap the two axes before plotting.
            invert: Reverse the row order before plotting.
        """
        if transpose:
            spectrogram = spectrogram.T

        if invert:
            spectrogram = spectrogram[::-1]

        plt.figure(figsize=(15, 5))
        plt.imshow(spectrogram, aspect="auto", origin="lower")
        plt.title(f"Spectrogram: {title}")
        plt.xlabel("Time")
        plt.ylabel("Frequency")
        plt.colorbar()
        plt.tight_layout()
        plt.show()

    def __call__(self, audio_path: str, label: typing.Any):
        """
        Extract the spectrogram and label of a WAV file.

        Args:
            audio_path (str): Path to the WAV file.
            label (typing.Any): Label of the WAV file.

        Returns:
            Tuple[np.ndarray, typing.Any]: Spectrogram of the WAV file and its label.
        """
        return self.get_spectrogram(audio_path, self.frame_length, self.frame_step, self.fft_length), label



TENSORFLOW-SPECIFIC FUNCTIONS

In [23]:
!pip install onnx

Defaulting to user installation because normal site-packages is not writeable


In [29]:
import tensorflow as tf
from keras.callbacks import Callback
import tf2onnx
import logging
import onnx
class CTCloss(tf.keras.losses.Loss):
    """Keras loss that evaluates ``tf.keras.backend.ctc_batch_cost`` on a batch."""

    def __init__(self, name: str = "CTCloss"):
        super().__init__()
        self.name = name
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor:
        """Compute the CTC cost of ``y_pred`` against ``y_true``.

        Every sample is given the full second-axis size of ``y_true`` as its label length and
        the full second-axis size of ``y_pred`` as its input length. ``sample_weight`` is
        accepted but not used.
        """
        true_shape = tf.shape(y_true)
        pred_shape = tf.shape(y_pred)

        batch_size = tf.cast(true_shape[0], dtype="int64")
        label_steps = tf.cast(true_shape[1], dtype="int64")
        input_steps = tf.cast(pred_shape[1], dtype="int64")

        # Broadcast the scalar lengths to one (batch, 1) column each.
        label_lengths = label_steps * tf.ones(shape=(batch_size, 1), dtype="int64")
        input_lengths = input_steps * tf.ones(shape=(batch_size, 1), dtype="int64")

        return self.loss_fn(y_true, y_pred, input_lengths, label_lengths)
    
class Model2Onnx(Callback):
    """Keras callback that exports the trained model to ONNX and attaches metadata to it.

    Args:
        saved_model_path: Path of the saved weights file; the ONNX file is written next to it
            with ".h5" replaced by ".onnx".
        metadata: Optional key/value pairs stored in the ONNX model's metadata properties.
        save_on_epoch_end: If True, export after every epoch instead of only after training.
    """

    def __init__(self,saved_model_path:str,metadata:dict=None,save_on_epoch_end:bool=False)->None:
        super().__init__()
        self.saved_model_path = saved_model_path
        self.metadata = metadata
        self.save_on_epoch_end = save_on_epoch_end

    @staticmethod
    def model2onnx(model:tf.keras.Model,onnx_model_path:str):
        """Convert a Keras model to ONNX and write it to ``onnx_model_path``."""
        tf2onnx.convert.from_keras(model,output_path=onnx_model_path)

    @staticmethod
    def add_metadata(onnx_model_path:str,metadata:dict=None):
        """Store ``metadata`` in the ONNX file's metadata properties (values are stringified).

        Does nothing when ``metadata`` is None or empty, so the file is not rewritten needlessly.
        """
        if not metadata:
            return

        onnx_model = onnx.load(onnx_model_path)
        for key, val in metadata.items():
            meta = onnx_model.metadata_props.add()
            meta.key = key
            meta.value = str(val)
        onnx.save(onnx_model, onnx_model_path)

    def on_epoch_end(self, epoch: int, logs: dict=None):
        """ Converts the model to onnx format on every epoch end. """
        if self.save_on_epoch_end:
            self.on_train_end(logs=logs)

    def on_train_end(self, logs=None):
        """ Converts the model to onnx format after training is finished. """
        # Reload the saved weights so the export matches the checkpoint on disk.
        self.model.load_weights(self.saved_model_path)
        onnx_model_path = self.saved_model_path.replace(".h5", ".onnx")
        self.model2onnx(self.model, onnx_model_path)
        self.add_metadata(onnx_model_path, self.metadata)

    # The hooks used to be named without the "on_" prefix, so Keras never invoked them.
    # The old names are kept as aliases for any code that calls them directly.
    epoch_end = on_epoch_end
    train_end = on_train_end

class TrainLogger(Callback):
    """Keras callback that appends one line of metrics per epoch to a log file.

    Args:
        log_path: Path of the log file to write to.
        log_level: Level applied to both the logger and its file handler.
        console_output: When False, handlers already attached to this logger are removed
            before the file handler is added.
    """

    def __init__(self,log_path:str,log_level:int=logging.INFO,console_output:bool=False)->None:
        super().__init__()
        self.log_path = log_path
        self.log_level = log_level

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(self.log_level)

        # The formatter and handler must exist before they are wired together; previously
        # they were referenced without ever being created.
        self.formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
        self.file_handler = logging.FileHandler(self.log_path)
        self.file_handler.setLevel(self.log_level)
        self.file_handler.setFormatter(self.formatter)

        if not console_output:
            self.logger.handlers[:] = []
        self.logger.addHandler(self.file_handler)

    def on_epoch_end(self, epoch: int, logs: dict=None):
        """Log the epoch number followed by every ``key: value`` pair in ``logs``."""
        epoch_message = f"Epoch {epoch}; "
        # ``logs`` defaults to None, so fall back to an empty mapping.
        logs_message = "; ".join([f"{key}: {value}" for key, value in (logs or {}).items()])
        self.logger.info(epoch_message + logs_message)






In [None]:
class Transformer:
    """Base class for callables that transform a ``(data, label)`` pair.

    Args:
        log_level: Level applied to the logger named after the concrete class.
    """

    def __init__(self, log_level: int = logging.INFO) -> None:
        self._log_level = log_level

        self.logger = logging.getLogger(self.__class__.__name__)
        # Honor the requested level; it used to be hard-coded to INFO.
        self.logger.setLevel(self._log_level)

    def __call__(self, data: typing.Any, label: typing.Any, *args, **kwargs):
        """Transform ``data`` and ``label``; must be implemented by subclasses."""
        raise NotImplementedError
class LabelTransformer(Transformer):
    """Map each label element to its position in the vocabulary."""

    def __init__(self, vocab: typing.List[str]):
        self.vocab = vocab

    def __call__(self, data: np.array, label: np.array):
        """Return ``data`` unchanged together with the label encoded as vocabulary indices.

        Elements of ``label`` that are not found in the vocabulary are dropped.
        """
        indices = []
        for symbol in label:
            if symbol in self.vocab:
                indices.append(self.vocab.index(symbol))
        return data, np.array(indices)

class LabelPadding(Transformer):
    """Pad encoded labels with ``padding_value`` to a common length.

    Args:
        padding_value: Value appended to labels that are too short.
        max_word_length: Target length for single labels; longer labels are truncated.
            Required unless ``use_on_batch`` is True.
        use_on_batch: If True, ``label`` is a batch of labels and each one is padded to the
            length of the longest label in that batch.

    Raises:
        ValueError: If ``use_on_batch`` is False and ``max_word_length`` is not given.
    """

    def __init__(self, padding_value: int,
        max_word_length: int = None, 
        use_on_batch: bool = False):
        # __call__ reads ``max_word_length``; it was previously only stored under
        # ``max_text_length``, which made the non-batch path fail with AttributeError.
        self.max_word_length = max_word_length
        self.max_text_length = max_word_length  # old attribute name, kept for compatibility
        self.padding_value = padding_value
        self.use_on_batch = use_on_batch

        if not use_on_batch and max_word_length is None:
            raise ValueError("max_word_length must be specified if use_on_batch is False")

    def __call__(self, data: typing.Any, label: typing.Any, *args, **kwargs):
        """Return ``data`` unchanged together with the padded label (or batch of labels)."""
        if self.use_on_batch:
            # default=0 keeps an empty batch from raising inside max().
            max_len = max((len(item) for item in label), default=0)
            padded_labels = [
                np.pad(item, (0, max_len - len(item)), mode="constant", constant_values=self.padding_value)
                for item in label
            ]
            return data, np.array(padded_labels)

        # Truncate first so the pad width below can never be negative.
        label = label[:self.max_word_length]
        return data, np.pad(label, (0, self.max_word_length - len(label)), mode="constant", constant_values=self.padding_value)

class SpectrogramPadding(Transformer):
    """Pad (or truncate) a spectrogram along axis 0 to ``max_spectrogram_length``.

    Args:
        max_spectrogram_length: Target size of axis 0.
        max_text_length: Stored on the instance; not used by the padding itself.
        padding_value: Constant used to fill the added rows (defaults to 0).

    NOTE(review): the previous ``__call__`` delegated to an undefined ``self.transform`` and
    therefore always raised AttributeError. Padding the spectrogram along axis 0 is the
    behaviour inferred from the class name and from how the dataset scan in this notebook
    measures ``max_spectrogram_length`` via ``shape[0]`` — confirm this is what is wanted.
    """

    def __init__(self, max_spectrogram_length: int, max_text_length: int, padding_value: int = 0):
        self.max_spectrogram_length = max_spectrogram_length
        self.max_text_length = max_text_length
        self.padding_value = padding_value

    def __call__(self, data: typing.Any, label: typing.Any, *args, **kwargs):
        """Return the padded spectrogram together with the unchanged label."""
        # Truncate first so the pad width below can never be negative.
        spectrogram = np.asarray(data)[: self.max_spectrogram_length]
        missing_rows = self.max_spectrogram_length - spectrogram.shape[0]

        # Pad only at the end of axis 0; every other axis is left untouched.
        pad_width = [(0, missing_rows)] + [(0, 0)] * (spectrogram.ndim - 1)
        padded = np.pad(spectrogram, pad_width, mode="constant", constant_values=self.padding_value)

        return padded, label


In [8]:
# Locations inside the LJSpeech-1.1 dataset folder, relative to the notebook's working directory.
dataset_path = "./LJSpeech-1.1"
metadata_path = f"{dataset_path}/metadata.csv"  # transcription table
wavs_path = f"{dataset_path}/wavs/"  # directory holding the audio clips

In [9]:
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen

# Read the pipe-separated, header-less metadata table (quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the transcriptions are kept as plain text), name its three columns and
# keep only the file name and the normalized transcription.
metadata_df = (
    pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
    .set_axis(["file_name", "transcription", "normalized_transcription"], axis=1)
    .loc[:, ["file_name", "normalized_transcription"]]
)
metadata_df.head()


Unnamed: 0,file_name,normalized_transcription
0,LJ001-0001,"Printing, in the only sense with which we are ..."
1,LJ001-0002,in being comparatively modern.
2,LJ001-0003,For although the Chinese took impressions from...
3,LJ001-0004,"produced the block books, which were the immed..."
4,LJ001-0005,the invention of movable metal letters in the ...


In [19]:
import tensorflow as tf
# Turn on memory growth for every visible GPU. This is best-effort: a failure is reported
# instead of being silently swallowed, but it does not stop the notebook.
try:
    for gpu in tf.config.experimental.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as exc:
    print(f"Could not enable GPU memory growth: {exc}")

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

In [32]:
# Build [wav_path, transcription] pairs for every row of the metadata table.
dataset = [
    [f"./LJSpeech-1.1/wavs/{file}.wav", label]
    for file, label in metadata_df.values.tolist()
]
configs = ModelConfigs()

# One pass over the whole dataset to find the longest label (counting only characters that
# are in the vocabulary) and the largest spectrogram size along axis 0.
max_text_len = 0
max_spectrogram_length = 0
for file_path, label in tqdm(dataset):
    spectrogram = WavReader.get_spectrogram(
        file_path,
        frame_length=configs.frame_length,
        frame_step=configs.frame_step,
        fft_length=configs.fft_length,
    )
    valid_label = [char for char in label if char in configs.vocab]

    if len(valid_label) > max_text_len:
        max_text_len = len(valid_label)
    if spectrogram.shape[0] > max_spectrogram_length:
        max_spectrogram_length = spectrogram.shape[0]

    # Refreshed on every iteration, so it ends up holding the final maximum and the
    # axis-1 size of the last spectrogram processed.
    configs.input_shape = [max_spectrogram_length, spectrogram.shape[1]]

# Persist the measured sizes together with the rest of the configuration.
configs.max_spectrogram_length = max_spectrogram_length
configs.max_text_length = max_text_len
configs.save()



100%|██████████| 13100/13100 [08:39<00:00, 25.21it/s]


In [28]:
# Wrap the [wav_path, transcription] pairs and the model configuration in a data provider.
# NOTE(review): `DataProvider` is not defined or imported in any cell visible here — confirm
# where it comes from, otherwise this cell fails with NameError on a fresh kernel.
data_provider = DataProvider(dataset,configs)


Defaulting to user installation because normal site-packages is not writeable
Collecting onnx==1.16.1
  Downloading onnx-1.16.1-cp312-cp312-win_amd64.whl.metadata (16 kB)
Downloading onnx-1.16.1-cp312-cp312-win_amd64.whl (14.4 MB)
   ---------------------------------------- 0.0/14.4 MB ? eta -:--:--
   ---------------------------------------- 0.1/14.4 MB 2.6 MB/s eta 0:00:06
    --------------------------------------- 0.2/14.4 MB 2.2 MB/s eta 0:00:07
   - -------------------------------------- 0.4/14.4 MB 3.2 MB/s eta 0:00:05
   - -------------------------------------- 0.7/14.4 MB 3.8 MB/s eta 0:00:04
   -- ------------------------------------- 0.8/14.4 MB 3.6 MB/s eta 0:00:04
   -- ------------------------------------- 0.9/14.4 MB 3.6 MB/s eta 0:00:04
   --- ------------------------------------ 1.2/14.4 MB 4.0 MB/s eta 0:00:04
   ---- ----------------------------------- 1.5/14.4 MB 4.1 MB/s eta 0:00:04
   ---- ----------------------------------- 1.8/14.4 MB 4.3 MB/s eta 0:00:03
   ---

