<a href="https://colab.research.google.com/github/anjareddy/datascience/blob/main/MLDL_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mounting Google Drive** 

In [None]:
# Colab-only step: mount Google Drive at /content/drive inside the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Unzipping Input Audio Files**

In [None]:
!unzip /content/drive/MyDrive/MLDL-FINAL/LJSpeech.zip

Archive:  /content/drive/MyDrive/MLDL-FINAL/LJSpeech.zip
  inflating: LJSpeech/metadata.xlsx  
  inflating: LJSpeech/README         
   creating: LJSpeech/wavs/
  inflating: LJSpeech/wavs/LJ001-0001.wav  
  inflating: LJSpeech/wavs/LJ001-0002.wav  
  inflating: LJSpeech/wavs/LJ001-0003.wav  
  inflating: LJSpeech/wavs/LJ001-0004.wav  
  inflating: LJSpeech/wavs/LJ001-0005.wav  
  inflating: LJSpeech/wavs/LJ001-0006.wav  
  inflating: LJSpeech/wavs/LJ001-0007.wav  
  inflating: LJSpeech/wavs/LJ001-0008.wav  
  inflating: LJSpeech/wavs/LJ001-0009.wav  
  inflating: LJSpeech/wavs/LJ001-0010.wav  
  inflating: LJSpeech/wavs/LJ001-0011.wav  
  inflating: LJSpeech/wavs/LJ001-0012.wav  
  inflating: LJSpeech/wavs/LJ001-0013.wav  
  inflating: LJSpeech/wavs/LJ001-0014.wav  
  inflating: LJSpeech/wavs/LJ001-0015.wav  
  inflating: LJSpeech/wavs/LJ001-0016.wav  
  inflating: LJSpeech/wavs/LJ001-0017.wav  
  inflating: LJSpeech/wavs/LJ001-0018.wav  
  inflating: LJSpeech/wavs/LJ001-0019.wav  
  i

In [None]:
pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jiwer
  Downloading jiwer-3.0.1-py3-none-any.whl (21 kB)
Collecting rapidfuzz==2.13.7
  Downloading rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.1 rapidfuzz-2.13.7


**Library Importing**

In [None]:
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython import display
from jiwer import wer
from tensorflow import keras
# `layers` has to be the keras.layers submodule. Aliasing the keras package itself
# (import tensorflow.keras as layers) leaves layers.Reshape, layers.Conv2D, ... unresolved.
from tensorflow.keras import layers

# **Data Collection**

**Importing Input Dataset**

In [None]:
# Directory holding the extracted .wav clips (trailing slash required: file names are
# appended by plain string concatenation).
wavs_path = "/content/LJSpeech/wavs/"
# Transcript index read with pandas below.
# NOTE(review): the unzip log lists LJSpeech/metadata.xlsx, not metadata.csv — confirm this file exists.
metadata_path = "/content/LJSpeech/metadata.csv"

In [None]:
# Read the pipe-separated transcript index. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside transcripts are kept as literal text.
metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3)
metadata_df.columns = ["file_name", "transcript", "norm_transcript"]
# Keep the clip id and the normalized transcript, then shuffle the rows.
# A fixed random_state makes the train/validation split reproducible across runs.
metadata_df = (
    metadata_df[["file_name", "norm_transcript"]]
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
# head() must be displayed explicitly because it is not the last expression of the cell.
display.display(metadata_df.head())
# (rows, columns) — unlike .size, which multiplies the two together.
metadata_df.shape

2838

**Split into Training and Validation Sets**

In [None]:
# Row index separating the first 90% of the rows (training) from the rest (validation).
split = int(0.90 * len(metadata_df))
train_df = metadata_df.iloc[:split]
validation_df = metadata_df.iloc[split:]

In [None]:
# Number of samples in each set. DataFrame.size would report rows * columns instead.
len(train_df), len(validation_df)

(2554, 284)

# **Pre-Processing**

In [None]:
# Characters the model may emit.
characters = list("abcdefghijklmnopqrstuvwxyz'?! ")
# Character -> integer id; the empty string is the out-of-vocabulary token.
char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="")
# Integer id -> character, built from the same vocabulary so both directions agree.
num_to_char = keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

print(char_to_num.get_vocabulary())

['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', ' ']


In [None]:
# STFT parameters, in samples, passed to tf.signal.stft by the preprocessing function.
# fft_length also fixes the model input width (fft_length // 2 + 1 frequency bins).
frame_length = 256
frame_step = 160
fft_length = 384
# Eager execution of tf.functions is deliberately left at its default (disabled):
# it is a debugging switch, it does not apply to functions passed to tf.data
# transformations, and it would make Keras training run step-by-step in Python.
@tf.function
def encode_single_sample(wav_file, label):
  """Turn one (clip id, transcript) pair into a (spectrogram, label ids) training pair.

  Args:
    wav_file: clip id without extension; resolved as wavs_path + wav_file + ".wav".
    label: transcript string for that clip.

  Returns:
    spectogram: float32 tensor from tf.signal.stft — magnitude, square-rooted, then
      standardized along axis 1 (mean 0 / std 1 per row of the STFT output).
    label: integer tensor, one char_to_num id per character of the lower-cased transcript.
  """
  # Read and decode the wav file.
  file = tf.io.read_file(wavs_path + wav_file + ".wav")
  audio, _ = tf.audio.decode_wav(file)
  # tf.squeeze returns a single tensor (drops the channel axis); it must not be tuple-unpacked.
  audio = tf.squeeze(audio, axis=-1)
  audio = tf.cast(audio, tf.float32)
  # Short-time Fourier transform -> magnitude -> square root to compress the dynamic range.
  spectogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)
  spectogram = tf.abs(spectogram)
  spectogram = tf.math.pow(spectogram, 0.5)
  # Standardize along axis 1; the epsilon guards against division by zero on constant rows.
  means = tf.math.reduce_mean(spectogram, 1, keepdims=True)
  stddevs = tf.math.reduce_std(spectogram, 1, keepdims=True)
  spectogram = (spectogram - means) / (stddevs + 1e-10)
  # The vocabulary is lower-case only; split into characters and map each to its integer id
  # so the labels can be padded per batch and consumed by the CTC loss.
  label = tf.strings.lower(label)
  label = tf.strings.unicode_split(label, input_encoding="UTF-8")
  label = char_to_num(label)
  return spectogram, label


In [None]:
batch_size = 32


def _to_batched_dataset(frame):
  """Build the encode -> padded_batch -> prefetch input pipeline for one DataFrame."""
  pairs = (list(frame["file_name"]), list(frame["norm_transcript"]))
  dataset = tf.data.Dataset.from_tensor_slices(pairs)
  dataset = dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
  # padded_batch pads every element of a batch to the largest shape in that batch.
  dataset = dataset.padded_batch(batch_size)
  return dataset.prefetch(buffer_size=tf.data.AUTOTUNE)


train_dataset = _to_batched_dataset(train_df)

validation_dataset = _to_batched_dataset(validation_df)


OperatorNotAllowedInGraphError: ignored

**Visualize**

In [None]:
# Show the first training sample: its spectrogram (titled with the decoded label),
# its raw waveform, and an audio player.
fig = plt.figure(figsize=(8,5))
for batch in train_dataset.take(1):
  spectogram = batch[0][0].numpy()
  # Transpose so frequency is on the vertical axis; trim_zeros strips leading/trailing zeros per row.
  spectogram = np.array([np.trim_zeros(x) for x in np.transpose(spectogram)])
  label = batch[1][0]
  # Spectrogram
  label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
  ax = plt.subplot(2, 1, 1)
  ax.imshow(spectogram, vmax = 1)
  ax.set_title(label)
  ax.axis("off")
  # Waveform: re-read the first training file (the dataset is built from train_df in order).
  file = tf.io.read_file(wavs_path + list(train_df["file_name"])[0] + ".wav")
  audio, sample_rate = tf.audio.decode_wav(file)
  audio = audio.numpy()
  ax = plt.subplot(2, 1, 2)
  plt.plot(audio)
  ax.set_title("Signal Wave")
  ax.set_xlim(0, len(audio))
  # Play back at the rate stored in the wav header rather than a hard-coded value,
  # otherwise clips recorded at another rate sound sped up or slowed down.
  display.display(display.Audio(np.transpose(audio), rate=int(sample_rate.numpy())))

plt.show()

InvalidArgumentError: ignored

<Figure size 800x500 with 0 Axes>

# **MODEL**

**Define Loss Function**

In [None]:
def CTCLOSS(y_true, y_pred):
  """CTC loss for a batch, wrapping keras.backend.ctc_batch_cost.

  Every sample is assigned the same lengths: the size of axis 1 of y_pred as its
  input length and the size of axis 1 of y_true as its label length.
  """
  batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
  # Column of ones, one row per sample, used to broadcast the scalar lengths.
  per_sample = tf.ones(shape=(batch_len, 1), dtype="int64")

  input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") * per_sample
  label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") * per_sample

  return keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
  

**Define Model**

In [None]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):
  """Build and compile the speech-to-text network.

  Architecture: two Conv2D + BatchNorm + ReLU blocks, a stack of bidirectional GRUs
  (dropout between them, none after the last), and a per-timestep softmax layer.

  Args:
    input_dim: number of features per spectrogram frame.
    output_dim: vocabulary size; the output layer has output_dim + 1 units.
    rnn_layers: number of bidirectional GRU layers.
    rnn_units: units per GRU direction.

  Returns:
    A keras.Model compiled with Adam (learning rate 1e-4) and the CTCLOSS loss.
  """
  # Refer to keras.layers explicitly so the function does not depend on a `layers` alias.
  L = keras.layers
  # Model's input
  input_spectogram = L.Input((None, input_dim), name="input")
  # Expand with a trailing channel axis so 2D convolutions can be applied.
  x = L.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectogram)
  # CNN layer-1 (the Conv2D keyword is `filters`, not `filter`)
  x = L.Conv2D(filters=32, kernel_size=[11, 41], strides=[2, 2], padding="same", use_bias=False, name="conv_1")(x)
  x = L.BatchNormalization(name="conv_1_bn")(x)
  x = L.ReLU(name="conv_1_relu")(x)
  # CNN layer-2
  x = L.Conv2D(filters=32, kernel_size=[11, 21], strides=[1, 2], padding="same", use_bias=False, name="conv_2")(x)
  x = L.BatchNormalization(name="conv_2_bn")(x)
  x = L.ReLU(name="conv_2_relu")(x)
  # Merge the last two axes into one feature vector per step to feed the RNNs.
  x = L.Reshape((-1, x.shape[-2] * x.shape[-1]))(x)
  # RNN layers
  for i in range(1, rnn_layers + 1):
    recurrent = L.GRU(units=rnn_units, activation="tanh", recurrent_activation="sigmoid", use_bias=True, return_sequences=True, reset_after=True, name=f"gru_{i}")
    x = L.Bidirectional(recurrent, name=f"bidirectional_{i}", merge_mode="concat")(x)
    if i < rnn_layers:
      x = L.Dropout(rate=0.5)(x)

  # Classification layer
  output = L.Dense(units=output_dim + 1, activation="softmax")(x)
  # Model
  model = keras.Model(input_spectogram, output, name="Speech_To_Text_Final")
  # Optimizer (the compile keyword is `optimizer`; the misspelling raised instead of applying Adam)
  opt = keras.optimizers.Adam(learning_rate=1e-4)
  model.compile(optimizer=opt, loss=CTCLOSS)
  return model
  

**Build Model**

In [None]:
# Input width is the number of STFT frequency bins; output size comes from the vocabulary.
model = build_model(
    input_dim=fft_length // 2 + 1,
    output_dim=char_to_num.vocabulary_size(),
    rnn_units=512,
)
model.summary(line_length=110)

NameError: ignored

# **Training and Evaluating**

In [None]:
def decode_batch_predictions(pred):
  """Greedy-decode a batch of network outputs into a list of text strings.

  Each sample is decoded over the full size of axis 1 of `pred`; the decoded ids
  are mapped back to characters with num_to_char and joined into one string.
  """
  n_samples, n_steps = pred.shape[0], pred.shape[1]
  input_len = np.ones(n_samples) * n_steps
  # ctc_decode returns (decoded sequences, log probabilities); take the first decoded tensor.
  decoded = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
  return [
    tf.strings.reduce_join(num_to_char(sequence)).numpy().decode("utf-8")
    for sequence in decoded
  ]
  


**Training Starts**

In [None]:
# Number of passes over the training set.
epochs = 20
# Train, evaluating on the validation set after every epoch; `history` keeps the Keras fit record.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
)

**Inference**

In [None]:
# Decode every validation batch and collect predicted / reference transcripts.
predictions = []
targets = []
for batch in validation_dataset:
  X, y = batch
  batch_predictions = model.predict(X)
  batch_predictions = decode_batch_predictions(batch_predictions)
  predictions.extend(batch_predictions)
  for label in y:
    # Map the integer label ids back to text for comparison.
    label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
    targets.append(label)
# Word error rate over the whole validation set (jiwer).
wer_score = wer(targets, predictions)
print("-"*100)
print(f"Word Error Rate: {wer_score:.4f}")
print("-"*100)
# Print five randomly chosen target/prediction pairs (np.random.randint, not `randit`).
for i in np.random.randint(0, len(predictions), 5):
  print(f"Target  : {targets[i]}")
  print(f"Prediction  : {predictions[i]}")
  print("-"*100)
  
  

In [None]:
def convert_audio_to_data():
  """Compute normalized MFCC features for every .wav file under wavs_path.

  Pipeline per file: load at the native sample rate -> peak-normalize -> pre-emphasis ->
  STFT (25 ms Hamming window, 10 ms hop) -> power spectrum -> 40-band mel filterbank ->
  dB -> 20 MFCCs (DCT type 2) -> normalize each coefficient row.

  Returns:
    list of 2-D arrays, one per file, in sorted file-name order; each has 20 rows.
  """
  data = []
  # Sorted for a deterministic order; skip anything in the folder that is not a wav file.
  for audio_file in sorted(os.listdir(wavs_path)):
    if not audio_file.lower().endswith(".wav"):
      continue
    audio, sr = librosa.load(os.path.join(wavs_path, audio_file), sr=None)
    audio = librosa.util.normalize(audio)
    audio = librosa.effects.preemphasis(audio)
    # Window / hop sizes in samples.
    win_length = int(sr * 0.025)  # 25 ms
    hop_length = int(sr * 0.010)  # 10 ms
    # librosa requires win_length <= n_fft; 400 is kept as the floor.
    n_fft = max(400, win_length)
    # librosa.stft frames the signal itself, so the raw audio is passed, not pre-cut frames.
    stft = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hamming')
    # The mel filterbank operates on real-valued power, not on complex STFT coefficients.
    power_spectrogram = np.abs(stft) ** 2
    # Apply Mel filterbank
    mel_filterbank = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=40)
    mel_spectrograms = mel_filterbank.dot(power_spectrogram)
    # Log-compress and apply the DCT (type 2) to obtain MFCCs
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel_spectrograms), n_mfcc=20, dct_type=2)
    # Normalize MFCCs along the time axis and store the normalized version
    mfccs_norm = librosa.util.normalize(mfccs, axis=1)
    data.append(mfccs_norm)
  return data


In [None]:
converted_data = convert_audio_to_data()
# Show only how many clips were converted; echoing the list would print every MFCC matrix.
len(converted_data)

  mfccs = librosa.feature.mfcc(S=librosa.power_to_db(mel_spectrograms), n_mfcc=20, dct_type=2)


KeyboardInterrupt: ignored