In [9]:
# Importing the libraries that I need
import librosa
import numpy as np
import deepspeech
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import cityblock, cdist
import enchant
import textdistance

# Load the pre-trained DeepSpeech model from the local .pbmm file
model = deepspeech.Model('deepspeech-0.9.3-models (2).pbmm')

# Load the recording resampled to 16 kHz, then pass DeepSpeech the samples
# rescaled from floats to 16-bit integers
y, sr = librosa.load('./Wave Audio Files (Research)/Plum.wav', sr=16000, dtype=np.float64)
pcm16 = (y * 32767).astype(np.int16)
transcript = model.stt(pcm16)

# Vocabulary: the unique whitespace-separated tokens found in words2.txt
with open('words2.txt', 'r') as vocab_file:
    word_list = set(vocab_file.read().split())

# Remember, for every character of the transcript, its position inside its own word.
# A character that occurs more than once keeps the position of its last occurrence.
words = transcript.split()
char_to_index = {}
for spoken_word in words:
    for position, char in enumerate(spoken_word):
        char_to_index[char] = position

# Candidate transcripts: vocabulary entries whose length is within one character of
# a transcribed word AND exactly equal to the length of the whole transcript string.
# NOTE(review): len(transcript) counts the spaces between words, so for a multi-word
# transcript this compares word lengths to the sentence length - confirm the intent.
transcript_length = len(transcript)
candidate_transcripts = [
    entry
    for spoken_word in words
    for entry in word_list
    if abs(len(entry) - len(spoken_word)) <= 1 and len(entry) == transcript_length
]

# Show the surviving candidates on one '|'-separated line
print("Candidate transcripts:")
print('|'.join(candidate_transcripts))

# Compute the MFCCs of the audio file (librosa default settings)
mfccs = librosa.feature.mfcc(y=y, sr=sr)

# Finite-difference gradient of the MFCC matrix along axis 1
gradient = np.gradient(mfccs, axis=1)

# Scale by the largest absolute value and round, which yields signed values in [-25, 25].
# An all-zero gradient (flat MFCCs) is kept as zeros instead of dividing 0 by 0 into NaN.
gradient_peak = np.max(np.abs(gradient))
if gradient_peak == 0:
    gradient = np.zeros_like(gradient)
else:
    gradient = np.round(gradient / gradient_peak * 25)

# Small constant that keeps the per-candidate scaling below away from a zero divisor
epsilon = 1e-8

# Build one pseudo-MFCC matrix per candidate and take its scaled gradient
candidate_gradients = []
for candidate in candidate_transcripts:
    # One column per character; characters absent from the transcript stay all-zero
    candidate_mfccs = np.zeros((mfccs.shape[0], len(candidate)))
    for i, char in enumerate(candidate):
        # The stored per-word character position is used as a column index into mfccs
        if char in char_to_index:
            candidate_mfccs[:, i] = mfccs[:, char_to_index[char]]

    # Right-pad with zero columns so the candidate is as wide as the real MFCC matrix
    padding = np.zeros((candidate_mfccs.shape[0], max(0, mfccs.shape[1] - candidate_mfccs.shape[1])))
    candidate_mfccs = np.hstack((candidate_mfccs, padding))

    # NOTE(review): the divisor here is the largest |MFCC value|, whereas the audio
    # gradient above is divided by the largest |gradient value| - confirm this is intended.
    candidate_scale = np.max(np.abs(candidate_mfccs)) + epsilon
    candidate_gradients.append(np.round(np.gradient(candidate_mfccs, axis=1) / candidate_scale * 25))


# Score every candidate against the audio gradient and collect the results as
# (candidate, euclidean, manhattan, minkowski, levenshtein similarity) tuples.
reference_frames = gradient.T
reference_frames_finite = np.nan_to_num(reference_frames)

candidates = []
for candidate_text, candidate_gradient in zip(candidate_transcripts, candidate_gradients):
    candidate_frames = candidate_gradient.T

    # Euclidean and Manhattan: sum over the full pairwise distance matrix between
    # every reference row and every candidate row (NaNs zeroed for the Euclidean one only)
    euclidean_distance = euclidean_distances(reference_frames_finite, np.nan_to_num(candidate_frames)).sum()
    manhattan_distance = cdist(reference_frames, candidate_frames, 'cityblock').sum()

    # Order-4 Minkowski: computed row against matching row, then summed over rows
    fourth_powers = np.power(np.abs(reference_frames - candidate_frames), 4)
    minkowski_distance = np.power(fourth_powers.sum(axis=1), 1/4).sum()

    # Despite the name this is a normalized similarity: higher means closer to the transcript
    lev_distance = textdistance.levenshtein.normalized_similarity(transcript, candidate_text)

    candidates.append((candidate_text, euclidean_distance, manhattan_distance, minkowski_distance, lev_distance))

# Order the candidates by their normalized Levenshtein similarity to the transcript
# (ascending, so the closest match ends up last)
candidates = sorted(candidates, key=lambda x: x[4])

# Select the candidate with the highest similarity. Ties go to the earliest entry in
# the sorted list, and a candidate with zero similarity is never selected.
# The distances are taken from the selected tuple itself, so the values printed below
# always belong to the reported candidate, and `transcript` is left untouched.
best_candidate_transcript = None
best_lev_distance = 0
euclidean_distance = manhattan_distance = minkowski_distance = None

top_candidate = max(candidates, key=lambda entry: entry[4], default=None)
if top_candidate is not None and top_candidate[4] > best_lev_distance:
    (best_candidate_transcript, euclidean_distance, manhattan_distance,
     minkowski_distance, best_lev_distance) = top_candidate

# Print the best candidate transcript and its distances
# (all distances are None when no candidate was selected)
print("Best candidate transcript: ",best_candidate_transcript)
print(f"Distance between the original gradient and the best candidate gradient using euclidean distance: {euclidean_distance}")
print(f"Distance between the original gradient and the best candidate gradient using Manhattan distance: {manhattan_distance}")
print(f"Distance between the original gradient and the best candidate gradient using Minkowski distance: {minkowski_distance}")
# The value printed here is the normalized similarity (1.0 means identical strings)
print("Levenshtein distance: ", best_lev_distance)


TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.0-0-g2d04fbe0


Candidate transcripts:
well|soar|wind|roll|deal|mist|left|gain|mass|time|scan|veil|area|will|bark|bake|rare|dead|axis|wrap|beam|post|long|head|beat|coat|seat|Mars|bird|walk|leaf|risk|side|like|role|spin|dash|pick|dull|loot|poor|call|link|doll|term|duck|wait|vain|easy|gasp|sell|rank|beef|monk|cave|fate|king|echo|rush|door|cafe|seal|skin|flex|ring|know|hide|dare|suit|cold|slab|blue|love|plum|duke|hook|fold|stab|crop|bean|lean|hope|flat|heel|pure|lift|tidy|even|clay|deep|zone|ally|site|rage|sour|nail
Best candidate transcript:  plum
Distance between the original gradient and the best candidate gradient using euclidean distance: 12337.656075086601
Distance between the original gradient and the best candidate gradient using Manhattan distance: 29034.0
Distance between the original gradient and the best candidate gradient using Minkowski distance: 228.41608381874252
Levenshtein distance:  1.0


In [2]:
# Importing the libraries that I need
import librosa
import numpy as np
import deepspeech
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import cityblock, cdist
import enchant
import textdistance

# Load the pre-trained DeepSpeech model from the local .pbmm file
model = deepspeech.Model('deepspeech-0.9.3-models (2).pbmm')

# Load the recording resampled to 16 kHz, then pass DeepSpeech the samples
# rescaled from floats to 16-bit integers
y, sr = librosa.load('./Wave Audio Files (Research)/Bird.wav', sr=16000, dtype=np.float64)
pcm16 = (y * 32767).astype(np.int16)
transcript = model.stt(pcm16)

# Vocabulary: the unique whitespace-separated tokens found in words2.txt
with open('words2.txt', 'r') as vocab_file:
    word_list = set(vocab_file.read().split())

# Remember, for every character of the transcript, its position inside its own word.
# A character that occurs more than once keeps the position of its last occurrence.
words = transcript.split()
char_to_index = {}
for spoken_word in words:
    for position, char in enumerate(spoken_word):
        char_to_index[char] = position

# Candidate transcripts: vocabulary entries whose length is within one character of
# a transcribed word AND exactly equal to the length of the whole transcript string.
# NOTE(review): len(transcript) counts the spaces between words, so for a multi-word
# transcript this compares word lengths to the sentence length - confirm the intent.
transcript_length = len(transcript)
candidate_transcripts = [
    entry
    for spoken_word in words
    for entry in word_list
    if abs(len(entry) - len(spoken_word)) <= 1 and len(entry) == transcript_length
]

# Show the surviving candidates on one '|'-separated line
print("Candidate transcripts:")
print('|'.join(candidate_transcripts))

# Compute the MFCCs of the audio file (librosa default settings)
mfccs = librosa.feature.mfcc(y=y, sr=sr)

# Small constant used later to keep the per-candidate scaling away from a zero divisor
epsilon = 1e-8

# Min-max normalize the MFCCs to the range [0, 25] and round to whole numbers.
# A constant MFCC matrix has zero range; map it to zeros instead of dividing by zero.
mfcc_min = np.min(mfccs)
mfcc_span = np.max(mfccs) - mfcc_min
if mfcc_span == 0:
    mfccs_normalized = np.zeros_like(mfccs)
else:
    mfccs_normalized = np.round((mfccs - mfcc_min) / mfcc_span * 25)

# Finite-difference gradient of a 2-D array along its second axis
def compute_gradient(params):
    """Return the finite-difference gradient of ``params`` along axis 1.

    Interior columns use the central difference ``(next - previous) / 2``;
    the first and last columns use the one-sided difference to their only
    neighbour.

    Parameters
    ----------
    params : array-like, 2-D
        Values to differentiate; needs at least two columns.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``params``. Floating-point (or complex)
        input keeps its dtype; any other dtype is computed and returned as
        float64 so that the halved central differences are not truncated to
        integers and unsigned subtraction cannot wrap around.

    Raises
    ------
    IndexError
        If ``params`` has fewer than two columns.
    """
    values = np.asanyarray(params)
    if not np.issubdtype(values.dtype, np.inexact):
        # Integer/bool input: switch to float before subtracting and halving
        values = values.astype(np.float64)

    gradient = np.zeros_like(values)
    gradient[:, 1:-1] = (values[:, 2:] - values[:, :-2]) / 2.0
    gradient[:, 0] = values[:, 1] - values[:, 0]
    gradient[:, -1] = values[:, -1] - values[:, -2]
    return gradient

# Accumulator for the summed candidate gradients, starting at zero
gradient = np.zeros_like(mfccs)

# Weight applied to every candidate gradient each time it is added
learning_rate = 0.01

# Number of accumulation passes
n_iterations = 1000

# Build the scaled gradient of every candidate ONCE: nothing below depends on the
# accumulator or on the pass number, so recomputing this per pass gives the same arrays.
candidate_gradients = []
for candidate in candidate_transcripts:
    # One column per character; characters absent from the transcript stay all-zero
    candidate_mfccs = np.zeros((mfccs.shape[0], len(candidate)))
    for i, char in enumerate(candidate):
        # The stored per-word character position is used as a column index into the normalized MFCCs
        if char in char_to_index:
            candidate_mfccs[:, i] = mfccs_normalized[:, char_to_index[char]]

    # Right-pad with zero columns so the candidate is as wide as the real MFCC matrix
    padding = np.zeros((candidate_mfccs.shape[0], max(0, mfccs.shape[1] - candidate_mfccs.shape[1])))
    candidate_mfccs = np.hstack((candidate_mfccs, padding))

    candidate_scale = np.max(np.abs(candidate_mfccs)) + epsilon
    candidate_gradients.append(np.round(compute_gradient(candidate_mfccs) / candidate_scale * 25))

# Each pass adds learning_rate * candidate_gradient for every candidate. The additions
# are performed in the same order as before, so the accumulated values are unchanged.
weighted_steps = [learning_rate * candidate_gradient for candidate_gradient in candidate_gradients]
for _ in range(n_iterations):
    for step in weighted_steps:
        gradient += step

# Min-max normalize the accumulated gradient to the range [0, 25].
# A constant accumulator (e.g. no candidates) has zero range; map it to zeros
# instead of dividing by zero.
# NOTE(review): the scoring code in this cell reads `gradient`, not
# `gradient_normalized` - confirm which one was meant to be compared.
gradient_min = np.min(gradient)
gradient_span = np.max(gradient) - gradient_min
if gradient_span == 0:
    gradient_normalized = np.zeros_like(gradient)
else:
    gradient_normalized = np.round((gradient - gradient_min) / gradient_span * 25)


# Score every candidate against the accumulated `gradient` and collect the results as
# (candidate, euclidean, manhattan, minkowski, levenshtein similarity) tuples.
reference_frames = gradient.T
reference_frames_finite = np.nan_to_num(reference_frames)

candidates = []
for candidate_text, candidate_gradient in zip(candidate_transcripts, candidate_gradients):
    candidate_frames = candidate_gradient.T

    # Euclidean and Manhattan: sum over the full pairwise distance matrix between
    # every reference row and every candidate row (NaNs zeroed for the Euclidean one only)
    euclidean_distance = euclidean_distances(reference_frames_finite, np.nan_to_num(candidate_frames)).sum()
    manhattan_distance = cdist(reference_frames, candidate_frames, 'cityblock').sum()

    # Order-4 Minkowski: computed row against matching row, then summed over rows
    fourth_powers = np.power(np.abs(reference_frames - candidate_frames), 4)
    minkowski_distance = np.power(fourth_powers.sum(axis=1), 1/4).sum()

    # Despite the name this is a normalized similarity: higher means closer to the transcript
    lev_distance = textdistance.levenshtein.normalized_similarity(transcript, candidate_text)

    candidates.append((candidate_text, euclidean_distance, manhattan_distance, minkowski_distance, lev_distance))

# Order the candidates by their normalized Levenshtein similarity to the transcript
# (ascending, so the closest match ends up last)
candidates = sorted(candidates, key=lambda x: x[4])

# Select the candidate with the highest similarity. Ties go to the earliest entry in
# the sorted list, and a candidate with zero similarity is never selected.
# The distances are taken from the selected tuple itself, so the values printed below
# always belong to the reported candidate, and `transcript` is left untouched.
best_candidate_transcript = None
best_lev_distance = 0
euclidean_distance = manhattan_distance = minkowski_distance = None

top_candidate = max(candidates, key=lambda entry: entry[4], default=None)
if top_candidate is not None and top_candidate[4] > best_lev_distance:
    (best_candidate_transcript, euclidean_distance, manhattan_distance,
     minkowski_distance, best_lev_distance) = top_candidate

# Print the best candidate transcript and its distances
# (all distances are None when no candidate was selected)
print("Best candidate transcript: ",best_candidate_transcript)
print(f"Distance between the original gradient and the best candidate gradient using euclidean distance: {euclidean_distance}")
print(f"Distance between the original gradient and the best candidate gradient using Manhattan distance: {manhattan_distance}")
print(f"Distance between the original gradient and the best candidate gradient using Minkowski distance: {minkowski_distance}")
# The value printed here is the normalized similarity (1.0 means identical strings)
print("Levenshtein distance: ", best_lev_distance)


TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.0-0-g2d04fbe0


Candidate transcripts:
well|soar|wind|roll|deal|mist|left|gain|mass|time|scan|veil|area|will|bark|bake|rare|dead|axis|wrap|beam|post|long|head|beat|coat|seat|Mars|bird|walk|leaf|risk|side|like|role|spin|dash|pick|dull|loot|poor|call|link|doll|term|duck|wait|vain|easy|gasp|sell|rank|beef|monk|cave|fate|king|echo|rush|door|cafe|seal|skin|flex|ring|know|hide|dare|suit|cold|slab|blue|love|plum|duke|hook|fold|stab|crop|bean|lean|hope|flat|heel|pure|lift|tidy|even|clay|deep|zone|ally|site|rage|sour|nail
Best candidate transcript:  bird
Distance between the original gradient and the best candidate gradient using euclidean distance: 1343595.7281546034
Distance between the original gradient and the best candidate gradient using Manhattan distance: 5856597.999998863
Distance between the original gradient and the best candidate gradient using Minkowski distance: 14877.645910877163
Levenshtein distance:  1.0
