In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from IPython import display
from jiwer import wer
import os
import random

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus: 
    tf.config.experimental.set_memory_growth(gpu, True)
tf.config.list_physical_devices('GPU')

[]

In [3]:
# Set seed for reproducibility
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Define constants
SR = 16000  # Sample rate
FRAME_LENGTH = 1024  # Frame length for spectrogram
FRAME_STEP = 256  # Frame step for spectrogram
FFT_LENGTH = FRAME_LENGTH  # FFT length for spectrogram
NUM_MEL_BINS = 128  # Number of mel bins for spectrogram
BATCH_SIZE = 32
EPOCHS = 20

# Define paths to data and metadata
WAVS_PATH = os.path.join(os.getcwd(), "dataset\\LJSpeech-1.1\\")
METADATA_PATH = os.path.join(os.getcwd(), "target.csv")

In [4]:
metadata_df=pd.DataFrame(columns=['file_name','normalized_transcription'])
with open('target.csv',encoding='utf-8') as f:
    for line in f:
        parts=line.strip().split('|')
        metadata_df.loc[len(metadata_df.index)] = [parts[0],parts[1]] 

In [5]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(metadata_df)

         file_name                           normalized_transcription
0       LJ001-0001  Printing, in the only sense with which we are ...
1       LJ001-0002                     in being comparatively modern.
2       LJ001-0003  For although the Chinese took impressions from...
3       LJ001-0004  produced the block books, which were the immed...
4       LJ001-0005  the invention of movable metal letters in the ...
5       LJ001-0006  And it is worth mention in passing that, as an...
6       LJ001-0007  the earliest book printed with movable types, ...
7       LJ001-0008                          has never been surpassed.
8       LJ001-0009  Printing, then, for our purpose, may be consid...
9       LJ001-0010  Now, as all books not primarily intended as pi...
10      LJ001-0011  it is of the first importance that the letter ...
11      LJ001-0012  especially as no more time is occupied, or cos...
12      LJ001-0013        than in the same operations with ugly ones.
13      LJ001-0014  

In [6]:
split = int(len(metadata_df) * 0.90)
df_train = metadata_df[:split]
df_val = metadata_df[split:]

print(f"Size of the training set: {len(df_train)}")
print(f"Size of the training set: {len(df_val)}")

Size of the training set: 10852
Size of the training set: 1206


In [7]:
characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "]
# Mapping characters to integers
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Mapping integers back to original characters
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

print(
    f"The vocabulary is: {char_to_num.get_vocabulary()} "
    f"(size ={char_to_num.vocabulary_size()})"
)

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' '] (size =31)


In [8]:
# An integer scalar Tensor. The window length in samples.
frame_length = 256
# An integer scalar Tensor. The number of samples to step.
frame_step = 160
# An integer scalar Tensor. The size of the FFT to apply.
# If not provided, uses the smallest power of 2 enclosing frame_length.
fft_length = 384


def encode_single_sample(wav_file, label):
    ###########################################
    ##  Process the Audio
    ##########################################
    try:
        # 1. Remove double quote from the beginning of the file name if it exists
        wav_file = tf.strings.regex_replace(wav_file, "^\"", "")
        # 2. Read wav file
        file = tf.io.read_file(WAVS_PATH + wav_file + ".wav")
        # ...rest of the code remains the same
    except tf.errors.NotFoundError:
        # Handle file not found error
        print(f"File not found: {wav_file}")
        return None, None
    # 2. Decode the wav file
    audio, _ = tf.audio.decode_wav(file)
    audio = tf.squeeze(audio, axis=-1)
    # 3. Change type to float
    audio = tf.cast(audio, tf.float32)
    # 4. Get the spectrogram
    spectrogram = tf.signal.stft(
        audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length
    )
    # 5. We only need the magnitude, which can be derived by applying tf.abs
    spectrogram = tf.abs(spectrogram)
    spectrogram = tf.math.pow(spectrogram, 0.5)
    # 6. normalisation
    means = tf.math.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - means) / (stddevs + 1e-10)
    spectrogram = tf.expand_dims(spectrogram, axis=-1)
    ###########################################
    ##  Process the label
    ##########################################
    # 7. Convert label to Lower case
    label = tf.strings.lower(label)
    # 8. Split the label
    label = tf.strings.unicode_split(label, input_encoding="UTF-8")
    # 9. Map the characters in label to numbers
    label = char_to_num(label)
    # 10. Return a dict as our model is expecting two inputs
    return spectrogram, label

In [9]:
df_train["normalized_transcription"]

0        Printing, in the only sense with which we are ...
1                           in being comparatively modern.
2        For although the Chinese took impressions from...
3        produced the block books, which were the immed...
4        the invention of movable metal letters in the ...
                               ...                        
10847    His travel would be in secret; his public appe...
10848    Any travel, any contact with the general publi...
10849    Such risks can be lessened when the President ...
10850    has confidence in the dedicated Secret Service...
10851    and accepts the necessary security precautions...
Name: normalized_transcription, Length: 10852, dtype: object

In [10]:
batch_size = 8
# Define the trainig dataset
train_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_train["file_name"]), list(df_train["normalized_transcription"]))
)
train_dataset = (
    train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

# Define the validation dataset
validation_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_val["file_name"]), list(df_val["normalized_transcription"]))
)
validation_dataset = (
    validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [11]:
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss
    

In [12]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Model similar to DeepSpeech2."""
    # Model's input
    input_spectrogram = layers.Input((None, input_dim), name="input")
    # Expand the dimension to use 2D CNN.
    x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram)
    # Convolution layer 1
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 41],
        strides=[2, 2],
        padding="same",
        use_bias=False,
        name="conv_1",
    )(x)
    x = layers.BatchNormalization(name="conv_1_bn")(x)
    x = layers.ReLU(name="conv_1_relu")(x)
    # Convolution layer 2
    x = layers.Conv2D(
        filters=32,
        kernel_size=[11, 21],
        strides=[1, 2],
        padding="same",
        use_bias=False,
        name="conv_2",
    )(x)
    x = layers.BatchNormalization(name="conv_2_bn")(x)
    x = layers.ReLU(name="conv_2_relu")(x)
    # Reshape the resulted volume to feed the RNNs layers
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
    # RNN layers
    for i in range(1, rnn_layers + 1):
        recurrent = layers.GRU(
            units=rnn_units,
            activation="tanh",
            recurrent_activation="sigmoid",
            use_bias=True,
            return_sequences=True,
            reset_after=True,
            name=f"gru_{i}",
        )
        x = layers.Bidirectional(
            recurrent, name=f"bidirectional_{i}", merge_mode="concat"
        )(x)
        if i < rnn_layers:
            x = layers.Dropout(rate=0.5)(x)
    # Dense layer
    x = layers.Dense(units=rnn_units * 2, name="dense_1")(x)
    x = layers.ReLU(name="dense_1_relu")(x)
    x = layers.Dropout(rate=0.5)(x)
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")(x)
    # Model
    model = keras.Model(input_spectrogram, output, name="DeepSpeech_2")
    # Optimizer
    opt = keras.optimizers.Adam(learning_rate=1e-4)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model


# Get the model
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
model.summary(line_length=110)

Model: "DeepSpeech_2"
______________________________________________________________________________________________________________
 Layer (type)                                    Output Shape                                Param #          
 input (InputLayer)                              [(None, None, 193)]                         0                
                                                                                                              
 expand_dim (Reshape)                            (None, None, 193, 1)                        0                
                                                                                                              
 conv_1 (Conv2D)                                 (None, None, 97, 32)                        14432            
                                                                                                              
 conv_1_bn (BatchNormalization)                  (None, None, 97, 32)                     

In [12]:
# A utility function to decode the output of the network
def decode_batch_predictions(pred):
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search
    results = keras.backend.ctc_decode(pred, input_length=input_len, beam_width=15, top_paths=1, greedy=False)[0][0]
    # Iterate over the results and get back the text
    output_text = []
    for result in results:
        result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8")
        output_text.append(result)
    return output_text


# A callback class to output a few transcriptions during training
class CallbackEval(keras.callbacks.Callback):
    """Displays a batch of outputs after every epoch."""

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def on_epoch_end(self, epoch: int, logs=None):
        predictions = []
        targets = []
        for batch in self.dataset:
            X, y = batch
            batch_predictions = model.predict(X)
            batch_predictions = decode_batch_predictions(batch_predictions)
            predictions.extend(batch_predictions)
            for label in y:
                label = (
                    tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
                )
                targets.append(label)
        wer_score = wer(targets, predictions)
        print("-" * 100)
        print(f"Word Error Rate: {wer_score:.4f}")
        print("-" * 100)
        for i in np.random.randint(0, len(predictions), 2):
            print(f"Target    : {targets[i]}")
            print(f"Prediction: {predictions[i]}")
            print("-" * 100)

In [14]:
# Define the number of epochs.
epochs = 10
# Train the model
model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=validation_dataset
)

# # If training was interrupted, load the last saved checkpoint and resume training
# checkpoint_dir = os.path.dirname(checkpoint_path)
# latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
# if latest_checkpoint is not None:
#     model.load_weights(latest_checkpoint)
#     model.fit(
#         train_dataset,
#         initial_epoch=last_epoch + 1,  # Set the initial epoch to the next epoch
#         epochs=epochs,
#         validation_data=validation_dataset,
#         callbacks=[cp_callback]
#     )

Epoch 1/10


: 

: 

In [None]:
os.makedirs("models", exist_ok=True)
model.save("models/deepspeech2.h5")

In [14]:
df_test = metadata_df[6000:6010]
test_dataset = tf.data.Dataset.from_tensor_slices(
    (list(df_test["file_name"]),list(df_test["normalized_transcription"]))
)
test_dataset = (
    test_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .padded_batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [18]:
model=keras.models.load_model("models/deepspeech2.h5",custom_objects={'CTCLoss': CTCLoss})

In [13]:
df_test

NameError: name 'df_test' is not defined

In [21]:
predictions = []
targets = []
wer_score=0
for batch in validation_dataset:
    X, y = batch
    batch_predictions = model.predict(X)
    batch_predictions = decode_batch_predictions(batch_predictions)
    predictions.extend(batch_predictions)
    for label in y:
        label = (
            tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
        )
        targets.append(label)

    wer_score = wer(targets, predictions)+wer_score

total_samples = len(predictions)
average_wer = wer_score / total_samples

print(f"Average Word Error Rate across the validation dataset: {average_wer:.4f}")

print("-" * 100)
print("Random Samples:")
print("-" * 100)

for i in np.random.randint(0, len(predictions), 2):
    print(f"Target    : {targets[i]}")
    print(f"Prediction: {predictions[i]}")
    print("-" * 100)


Average Word Error Rate across the validation dataset: 0.0371
----------------------------------------------------------------------------------------------------
Random Samples:
----------------------------------------------------------------------------------------------------
Target    : to conduct an additional investigation of oswald in view of the activities which had led to his arrest
Prediction: to conducked in aditional invuestigation of oswald inview of the activities which hand led to his arest
----------------------------------------------------------------------------------------------------
Target    : had discussed the president's visit on several occasions including the regular biweekly conference on the morning of november 
Prediction: had discuse the presidenstes it onseverlicasions including the reguor by wekly confrence on the morning of november 
----------------------------------------------------------------------------------------------------


In [None]:
predictions

['five tablesponfuls a vieast',
 'oe even tablesponful of whie shuger',
 'one eventieasponful ofsalt',
 'bit of soda as large as ap desolved in hotwater',
 'one tablesponful of buter justmelted not hot',
 'yoke of on ag beaten lighe',
 'sif the flour salt andshuper into abol',
 'holow the hep in the center and por in the milk woriking down the flour into the liquid with a spon or yeor hands untilidis thoroughly melted',
 'into ascond holow por the east and ned thiruly for minus',
 'rap abal and biscat in a tic cloth and set tarisewhere it wil neither become chiled nor souer over nigt']

In [None]:
model.load_weights("models/deepspeech2.h5")
model.fit(
    train_dataset,
    initial_epoch=epochs + 10,
    epochs=epochs + 15,
    validation_data=validation_dataset
)

Epoch 21/25
 213/1357 [===>..........................] - ETA: 14:39 - loss: 31.7369

KeyboardInterrupt: 