In [4]:
import tensorflow as tf
# Symbols the transcriptions may contain: lowercase letters, apostrophe,
# question mark, exclamation mark and space.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")

# Forward lookup: character -> integer id. The empty string is the
# out-of-vocabulary token (it is the first vocabulary entry in the output below).
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=characters,
    oov_token="",
)

# Inverse lookup: integer id -> character, built from the same vocabulary.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

print(f"Vocabulary is: {char_to_num.get_vocabulary()}")
print(f"size is: {char_to_num.vocabulary_size()}")

Vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' ']
size is: 31


In [2]:
# STFT configuration, named after the matching `tf.signal.stft` arguments.
fft_length = 384    # `fft_length // 2 + 1` is used as the model's input dimension
frame_length = 256  # analysis window length
frame_step = 160    # step between consecutive windows

In [None]:
# `librosa` is imported here because this cell runs before the cell that
# originally imported it; without this a fresh-kernel run fails with NameError.
import librosa

# Load the audio file at a 16 kHz sampling rate, downmixed to mono.
audio, sr = librosa.load('audio_file.wav', sr=16000, mono=True)

In [None]:
import tensorflow as tf
import librosa



# Define the function to preprocess the audio
def preprocess_audio(audio):
    """Turn raw audio samples into a normalized magnitude spectrogram.

    The samples are cast to float32, passed through a short-time Fourier
    transform configured by the notebook-level ``frame_length``,
    ``frame_step`` and ``fft_length`` constants, compressed with a square
    root and standardized along the frame (time) axis.

    Label handling is intentionally not done here: the function only
    receives audio, and ``preprocess_label`` exists for the transcription.

    Args:
        audio: audio samples of any shape; all samples are flattened into a
            single sequence.

    Returns:
        A float32 tensor of shape ``[frames, fft_length // 2 + 1]``.

    NOTE(review): a later cell defines ``preprocess_audio`` again, using
    whole-spectrogram statistics instead of per-axis ones; whichever
    definition ran last is the one in effect.
    """
    # A leading singleton axis is kept during the computation so that axis 1
    # below is the frame axis of the [1, frames, bins] STFT output.
    audio = tf.reshape(tf.cast(audio, tf.float32), [1, -1])

    # Use the shared STFT constants so the number of frequency bins matches
    # the `fft_length // 2 + 1` input dimension the model is built with.
    stft = tf.abs(
        tf.signal.stft(
            audio,
            frame_length=frame_length,
            frame_step=frame_step,
            fft_length=fft_length,
        )
    )

    # Square-root compression of the magnitudes.
    spectrogram = tf.pow(stft, 0.5)

    # Standardize along axis 1; the epsilon avoids division by zero.
    mean = tf.reduce_mean(spectrogram, 1, keepdims=True)
    stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True)
    spectrogram = (spectrogram - mean) / (stddevs + 1e-10)

    # Drop the singleton axis: batching with padded_shapes=[None, None] and
    # the model input both expect a [frames, bins] tensor per example.
    return tf.squeeze(spectrogram, axis=0)

In [None]:
# Define the function to preprocess the labels
def preprocess_label(label):
    """Lowercase a transcription and split it into single characters.

    Args:
        label: string (or string tensor) holding the transcription.

    Returns:
        A string tensor with one UTF-8 character per element. No integer
        encoding is applied here.
    """
    lowered = tf.strings.lower(label)
    return tf.strings.unicode_split(lowered, 'UTF-8')

In [None]:
# Containers for the audio clips and their transcriptions, one pair of
# lists per split. They start out empty and are filled by the loading step.
train_audio = []
train_labels = []
valid_audio = []
valid_labels = []
# ... load and split train/validation audio and labels ...

# Define the function to preprocess the audio
def preprocess_audio(audio):
    """Turn raw audio samples into a normalized magnitude spectrogram.

    The samples are cast to float32, passed through a short-time Fourier
    transform configured by the notebook-level ``frame_length``,
    ``frame_step`` and ``fft_length`` constants, compressed with a square
    root and standardized with the mean and standard deviation of the whole
    spectrogram.

    Args:
        audio: audio samples of any shape; all samples are flattened into a
            single sequence.

    Returns:
        A float32 tensor of shape ``[frames, fft_length // 2 + 1]``.
    """
    # tf.signal.stft works on the innermost axis; a leading singleton axis is
    # added here and removed again before returning.
    audio = tf.reshape(tf.cast(audio, tf.float32), [1, -1])

    # Use the shared STFT constants so the number of frequency bins matches
    # the `fft_length // 2 + 1` input dimension the model is built with.
    stft = tf.abs(
        tf.signal.stft(
            audio,
            frame_length=frame_length,
            frame_step=frame_step,
            fft_length=fft_length,
        )
    )

    # Square-root compression of the magnitudes.
    spectrogram = tf.pow(stft, 0.5)

    # Global standardization; the epsilon avoids division by zero.
    mean = tf.reduce_mean(spectrogram)
    stddevs = tf.math.reduce_std(spectrogram)
    spectrogram = (spectrogram - mean) / (stddevs + 1e-10)

    # Drop the singleton axis: batching with padded_shapes=[None, None] and
    # the model input both expect a [frames, bins] tensor per example.
    return tf.squeeze(spectrogram, axis=0)

# Define the function to preprocess the labels
def preprocess_label(label):
    """Lowercase a transcription and split it into single characters.

    Args:
        label: string (or string tensor) holding the transcription.

    Returns:
        A string tensor with one UTF-8 character per element. No integer
        encoding is applied here.
    """
    return tf.strings.unicode_split(tf.strings.lower(label), 'UTF-8')

def _make_dataset(audio, labels, shuffle=False, batch_size=32):
    """Build a padded, batched and prefetched dataset of (spectrogram, label ids).

    Args:
        audio: audio clips, sliced along the first axis.
        labels: transcriptions, aligned with ``audio``.
        shuffle: whether to shuffle the examples before preprocessing.
        batch_size: number of examples per padded batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(spectrograms, label_ids)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((audio, labels))
    if shuffle:
        # Buffer covers the whole split; kept at >= 1 so an empty split does
        # not request a zero-sized shuffle buffer.
        dataset = dataset.shuffle(buffer_size=max(len(labels), 1))
    dataset = dataset.map(
        # Characters are mapped to integer ids with `char_to_num`, so that
        # `num_to_char` can turn the labels back into text at evaluation time.
        lambda clip, text: (
            preprocess_audio(clip),
            char_to_num(preprocess_label(text)),
        ),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=([None, None], [None]),
    )
    return dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


# Create the train dataset (shuffled)
train_dataset = _make_dataset(train_audio, train_labels, shuffle=True)

# Create the validation dataset (original order)
valid_dataset = _make_dataset(valid_audio, valid_labels)

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

# Define the function to plot the audio signal, spectrogram, and transcription
def plot_audio(audio, label):
    """Plot a waveform and its spectrogram, then print the transcription.

    Args:
        audio: raw audio samples; plotted as-is and passed to
            ``preprocess_audio`` for the spectrogram.
        label: transcription, passed to ``preprocess_label``.

    NOTE(review): this function uses ``plt``, but no ``matplotlib.pyplot``
    import is visible in this notebook - confirm it is imported somewhere.
    """
    # Preprocess the audio and label
    spectrogram = preprocess_audio(audio)
    label = preprocess_label(label)

    # Plot the audio signal
    plt.figure(figsize=(10, 5))
    plt.plot(audio)
    plt.title('Audio Signal')
    plt.xlabel('Time (samples)')
    plt.ylabel('Amplitude')
    plt.show()

    # TensorFlow tensors do not expose `.T` by default and imshow needs a
    # 2-D array, so drop a leading singleton axis if there is one and
    # transpose explicitly to get frequency on the vertical axis.
    spectrogram = tf.convert_to_tensor(spectrogram)
    if spectrogram.shape.rank == 3:
        spectrogram = tf.squeeze(spectrogram, axis=0)
    spectrogram_image = tf.transpose(spectrogram).numpy()

    # Plot the spectrogram
    plt.figure(figsize=(10, 5))
    plt.imshow(spectrogram_image, origin='lower', cmap='gray', aspect='auto')
    plt.title('Spectrogram')
    plt.xlabel('Time (frames)')
    plt.ylabel('Frequency (bins)')
    plt.show()

    # The label is a tensor of single-character strings; join them back
    # into one Python string for display.
    transcription = tf.strings.reduce_join(label).numpy().decode('utf-8')
    print('Transcription:', transcription)

# Sanity check: show the waveform, spectrogram and transcription of the
# first training example.
plot_audio(
    audio=train_audio[0],
    label=train_labels[0],
)

In [12]:


def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
    """Build a convolutional + bidirectional-GRU acoustic model for CTC.

    The network applies two 2-D convolutions to the spectrogram, then a
    stack of ``rnn_layers`` bidirectional GRU layers, a dense layer and a
    softmax over ``output_dim + 1`` classes.

    Args:
        input_dim: number of features per time step of the input spectrogram.
        output_dim: number of output symbols, excluding the CTC blank.
        rnn_layers: number of bidirectional GRU layers to stack (0 skips the
            recurrent stack entirely).
        rnn_units: units per GRU direction.

    Returns:
        An uncompiled ``tf.keras`` model mapping ``(batch, time, input_dim)``
        spectrograms to per-step class probabilities.
    """
    input_spectrogram = layers.Input(shape=(None, input_dim))

    # Add a trailing channel axis so the input can be fed to Conv2D.
    x = layers.Reshape((-1, input_dim, 1))(input_spectrogram)

    # First convolution: stride 2 along both the time and the feature axis.
    x = layers.Conv2D(
        filters=32,
        kernel_size=(11, 41),
        strides=(2, 2),
        activation='relu',
        padding='same',
        use_bias=False,
    )(x)
    x = layers.BatchNormalization()(x)

    # Second convolution: stride 2 along the feature axis only.
    x = layers.Conv2D(
        filters=32,
        kernel_size=(11, 21),
        strides=(1, 2),
        activation='relu',
        padding='same',
        use_bias=False,
    )(x)
    x = layers.BatchNormalization()(x)

    # Merge the feature and channel axes so each time step is one vector.
    x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)

    # Recurrent stack. Each iteration must wrap and APPLY its own GRU;
    # applying Bidirectional after the loop would use only the last GRU
    # and yield a single recurrent layer regardless of `rnn_layers`.
    for i in range(rnn_layers):
        recurrent = layers.GRU(
            units=rnn_units,
            activation='tanh',
            recurrent_activation='sigmoid',
            use_bias=True,
            return_sequences=True,
            reset_after=True,
        )
        x = layers.Bidirectional(recurrent, merge_mode='concat')(x)
        # Dropout between recurrent layers, but not after the last one.
        if i < rnn_layers - 1:
            x = layers.Dropout(rate=0.5)(x)

    # Dense block before the classifier.
    x = layers.Dense(units=rnn_units * 2, activation='relu')(x)
    x = layers.Dropout(rate=0.5)(x)

    # +1 is for the blank symbol used by the CTC
    # (Connectionist Temporal Classification) loss function.
    x = layers.Dense(units=output_dim + 1, activation='softmax')(x)

    # Define the input and output of the model
    model = tf.keras.models.Model(inputs=input_spectrogram, outputs=x)

    return model

In [11]:
# The input width is `fft_length // 2 + 1`; the output size is the
# vocabulary size reported by `char_to_num`.
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
# Show the layer-by-layer summary of the model
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None, 193)]       0         
                                                                 
 reshape_4 (Reshape)         (None, None, 193, 1)      0         
                                                                 
 conv2d_4 (Conv2D)           (None, None, 97, 32)      14432     
                                                                 
 batch_normalization_4 (Batc  (None, None, 97, 32)     128       
 hNormalization)                                                 
                                                                 
 conv2d_5 (Conv2D)           (None, None, 49, 32)      236544    
                                                                 
 batch_normalization_5 (Batc  (None, None, 49, 32)     128       
 hNormalization)                                           

In [None]:
# Render the architecture diagram to 'deepspeech2.png', including tensor
# shapes and layer names.
plot_model(
    model,
    to_file='deepspeech2.png',
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
def decode_batch_predictions(pred):
    """Greedy-decode a batch of CTC outputs into text.

    Args:
        pred: model outputs; ``pred.shape[0]`` is taken as the batch size
            and ``pred.shape[1]`` as the number of time steps.

    Returns:
        A list with one decoded Python string per batch element.
    """
    # Function-scope import: numpy is not imported in the visible cells.
    import numpy as np

    # Every sequence is treated as spanning all `pred.shape[1]` time steps.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Greedy search is used here; for more complex tasks beam search can be
    # used instead. `tf.keras.backend` is used because only `tf` is imported.
    results = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    # Map the decoded ids back to characters and join them into one string.
    return [
        tf.strings.reduce_join(num_to_char(result)).numpy().decode('utf-8')
        for result in results
    ]



## Inference

In [None]:
predictions = []
targets = []

# Iterate the validation split. The dataset built earlier in this notebook
# is named `valid_dataset`; `validation_dataset` is never defined.
for X, y in valid_dataset:
    batch_predictions = model.predict(X)
    batch_predictions = decode_batch_predictions(batch_predictions)
    predictions.extend(batch_predictions)
    # Turn each reference label back into text for comparison.
    for label in y:
        label = tf.strings.reduce_join(num_to_char(label)).numpy().decode('utf-8')
        targets.append(label)

# NOTE(review): `wer` is not defined or imported in the visible cells;
# presumably it comes from a word-error-rate package - confirm the import.
wer_score = wer(targets, predictions)
print(f"WER : {wer_score}")