In [None]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Dropout, BatchNormalization, Bidirectional
from sklearn.model_selection import train_test_split

In [None]:
# Load audio file
def load_audio(file_path, sr=16000):
    """Read an audio file through librosa and return only the waveform.

    Args:
        file_path: Path forwarded unchanged to ``librosa.load``.
        sr: Sampling rate forwarded to ``librosa.load`` (default 16000).

    Returns:
        The first element of the ``librosa.load`` result; the second
        element (the sampling rate reported by librosa) is discarded.
    """
    waveform = librosa.load(file_path, sr=sr)[0]
    return waveform

In [None]:
# Convert audio to MFCC features
def extract_features(audio, sr=16000, n_mfcc=13):
    """Compute MFCCs for a waveform and return them transposed.

    Args:
        audio: Waveform passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sampling rate of ``audio`` (default 16000).
        n_mfcc: Number of coefficients requested from librosa (default 13).

    Returns:
        The transpose of the ``librosa.feature.mfcc`` output. The rest of
        this notebook treats axis 0 of the result as the sequence length
        and axis 1 as the coefficient axis.
    """
    return librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc).T

In [None]:
# Sample dataset
audio_files = ["audio1.wav", "audio2.wav"]
texts = ["hello world", "speech to text"]

# One feature matrix per recording, computed with the default sr / n_mfcc.
features = list(map(extract_features, map(load_audio, audio_files)))

# The longest recording (rows along axis 0) defines the common length.
frame_counts = [matrix.shape[0] for matrix in features]
max_len = max(frame_counts)

# Zero-pad the end of axis 0 so every matrix ends up with max_len rows;
# the coefficient axis is left untouched.
padded_features = [
    np.pad(matrix, ((0, max_len - count), (0, 0)), mode='constant')
    for matrix, count in zip(features, frame_counts)
]

In [None]:
# Text encoding
# Each supported character (a-z, then space) gets an id starting at 1;
# id 0 is kept free so it can serve as the padding value below.
alphabet = "abcdefghijklmnopqrstuvwxyz "
char_map = dict(zip(alphabet, range(1, len(alphabet) + 1)))

# Translate every transcript into its list of ids (KeyError on any
# character outside the alphabet above).
unpadded_ids = [[char_map[ch] for ch in sentence] for sentence in texts]

# Right-pad every id list with 0 up to the longest transcript.
max_text_len = max(map(len, unpadded_ids))
text_encoded = [ids + [0] * (max_text_len - len(ids)) for ids in unpadded_ids]

In [None]:
# Data preparation
X = np.array(padded_features)
y = np.array(text_encoded)

# The model in this notebook emits one class distribution per input frame
# (every recurrent layer uses return_sequences=True), and
# sparse_categorical_crossentropy needs exactly one label per output step.
# The transcripts are only padded to the longest text, so extend them with
# the padding id 0 until they span as many steps as the feature sequences.
n_frames = X.shape[1]
n_label_steps = y.shape[1]
if n_label_steps > n_frames:
    raise ValueError(
        f"Transcript length ({n_label_steps}) exceeds the number of "
        f"feature frames ({n_frames}); labels cannot be aligned frame-wise."
    )
y = np.pad(y, ((0, 0), (0, n_frames - n_label_steps)), mode='constant')

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model definition
# The input shape is declared with an explicit Input layer. Passing
# input_shape to the LSTM wrapped inside Bidirectional is not seen by the
# Sequential container, which leaves the model unbuilt and makes
# model.summary() fail before the first call to fit().
model = Sequential([
    # (frames, coefficients): 13 must match the n_mfcc used by extract_features.
    keras.Input(shape=(max_len, 13)),
    Bidirectional(LSTM(128, return_sequences=True)),
    BatchNormalization(),
    Dropout(0.3),
    Bidirectional(LSTM(64, return_sequences=True)),
    # One softmax per frame over every character id plus the padding id 0.
    TimeDistributed(Dense(len(char_map)+1, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# Training
# Hyper-parameters are gathered in one place; the held-out split doubles
# as the validation set reported after every epoch.
fit_options = dict(
    epochs=10,
    batch_size=16,
    validation_data=(X_test, y_test),
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Prediction
predicted = model.predict(X_test)

# char_map assigns ids starting at 1 and reserves 0 for padding, so the ids
# cannot be used as positions into list(char_map.keys()): that shifts every
# character by one and overflows on the last id. Decode through an explicit
# id -> character lookup instead.
index_to_char = {idx: char for char, idx in char_map.items()}

# Most likely class id for every output step of the first test sample.
predicted_ids = np.argmax(predicted[0], axis=-1)

# Ids without a character (the padding id 0) contribute nothing to the text.
decoded_text = "".join(index_to_char.get(int(idx), "") for idx in predicted_ids)
print("Predicted Text:", decoded_text)

In [None]:
# Visualization
# Heat-map of the first recording's (unpadded) features, transposed so the
# sequence axis runs horizontally and the coefficient axis vertically.
fig, ax = plt.subplots(figsize=(10, 4))
mfcc_image = ax.imshow(features[0].T, cmap='viridis', aspect='auto', origin='lower')
fig.colorbar(mfcc_image, ax=ax)
ax.set_title("MFCC Features")
ax.set_xlabel("Time Frames")
ax.set_ylabel("MFCC Coefficients")
plt.show()