In [None]:
import os
import re
from collections import defaultdict
from pocketsphinx import AudioFile, get_model_path, Decoder
from pydub import AudioSegment, split_on_silence

In [143]:
def find_keyword_context(sentence, keyword):
    # Split the sentence into words
    words = sentence.lower().split()
    
    # Initialize an empty list to store the results
    context = []
    
    # Iterate over the words and check for the keyword
    for i in range(1, len(words) - 1):  # avoid first and last word for context
        if words[i] == keyword:
            # Get the word before and after the keyword
            before = words[i - 1]
            after = words[i + 1]
            # Add the context to the result list
            context.append((before, keyword, after))
    
    return context

def segment_word(input_wav, output_wav, ground_truth, target_word, buffer=0, sample_rate=1600):
    model_path = get_model_path()

    word_times = []

    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),  # Path to the acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # Path to the language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),  # Path to the dictionary
        'frate': 100,
        'samprate': 16000
    }

    # Initialize the decoder
    decoder = Decoder(config)

    # Start decoding the audio file
    decoder.start_utt()
    
    # Read the audio file in binary mode and process it
    with open(input_wav, 'rb') as audio_file:
        while True:
            buf = audio_file.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)

    decoder.end_utt()

    context = find_keyword_context(ground_truth, target_word)
    print(context)

    recognized_words = []
    context_counter = 1
    seg_list = [seg for seg in decoder.seg()]
    for index, seg in enumerate(seg_list):
        word = seg.word.split('(')[0]  # Get the word before the hypothesis index
        
        recognized_words.append(word)  # Collect all recognized words
        
        if word.lower() == target_word.lower():
            # print(seg_list[index-1].word.split('(')[0] == context[1][0], word, seg_list[index+1].word.split('(')[0] == context[1][2])
            # print(seg_list[index-1].word.split('(')[0], word, seg_list[index+1].word.split('(')[0])
            for context_idx in range(0, len(context)): # this loop 
                if (seg_list[index-1].word.split('(')[0] == context[context_idx][0]) or (seg_list[index+1].word.split('(')[0] == context[context_idx][2]):
                    # Append start and end times (in milliseconds)
                    start_time = int(seg_list[index-1].end_frame * (1000 / 100))  # Assuming 100 frames per second
                    end_time = int(seg_list[index+1].start_frame * (1000 / 100))
                    adjusted_start = max(0, start_time - buffer)  # One second before
                    adjusted_end = end_time + buffer
                    word_times.append((adjusted_start, adjusted_end))
    
    print("Recognized words:", ' '.join(recognized_words))

    # if not word_times:
    #     for index, seg in enumerate(seg_list):
    #         word = seg.word.split('(')[0]  # Get the word before the hypothesis index
    #         if index-1 >= 0:
    #              for context_idx in range(0, len(context)):
    #                 if (seg_list[index-1].word.split('(')[0] == context[context_idx][0]) and (seg_list[index+2].word.split('(')[0] == context[context_idx][2]):
    #                     start_time = int(seg_list[index-1].end_frame * (1000 / 100))  # Assuming 100 frames per second
    #                     end_time = int(seg_list[index+1].start_frame * (1000 / 100))
    #                     adjusted_start = max(0, start_time - buffer)  # One second before
    #                     adjusted_end = end_time + buffer
    #                     word_times.append((adjusted_start, adjusted_end))
    
    if not word_times:
        print(f"The word '{target_word}' was not found in the audio {input_wav}")
        return

    original_audio = AudioSegment.from_wav(input_wav)

    # Extract segments for each occurrence of the target word
    for i, (start, end) in enumerate(word_times):
        # Ensure the start and end times are within the audio length
        start = max(0, start)
        end = min(len(original_audio), end)
        
        # Extract the audio segment
        word_audio = original_audio[start:end]
        
        # Save the extracted segment using the original sample rate
        name_without_extension = os.path.splitext(output_wav)[0]
        output_file = f"{name_without_extension}_{i + 1}.wav"
        word_audio.export(output_file, format="wav")
        print(f"Extracted '{target_word}' to '{output_file}' from {start}ms to {end}ms.")
    
        # Save the extracted segment to the current directory
        # output_file = f"extracted_word_{i + 1}.wav"
        # word_audio.export(output_file, format="wav")
        # print(f"Extracted '{target_word}' to '{output_file}' from {start}ms to {end}ms.")


In [155]:
def count_word_occurrences(file_paths):
    word_count = defaultdict(lambda: [0, set()])  # Default dictionary to hold (count, set of identifiers)
    
    # Read each file and count words
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                identifier, text = line.split(' ', 1)
                words = text.split()
                
                for word in words:
                    # Normalize words to lowercase for consistent counting
                    normalized_word = word.lower()
                    word_count[normalized_word][0] += 1  # Increment word count
                    word_count[normalized_word][1].add(identifier)  # Add identifier

    # Create a sorted list from the dictionary
    sorted_word_counts = [
        (count_info[0], word, list(count_info[1])) 
        for word, count_info in word_count.items()
    ]
    sorted_word_counts.sort(key=lambda x: x[0], reverse=True)  # Sort by count (descending)

    return sorted_word_counts

def count_speaker_occurrences(identifiers):
    speaker_count = defaultdict(int)  # Default dictionary to hold counts of each speaker

    # Count occurrences of each speaker
    for identifier in identifiers:
        speaker_number = identifier.split('-')[0]  # Extract speaker number
        speaker_count[speaker_number] += 1  # Increment the count for the speaker

    # Convert to a regular dictionary for better readability
    return dict(speaker_count)

def generate_path(filename, path_prefix):
    reader, chapter, trial = filename.split('-')
    return f"{path_prefix}/{reader}/{chapter}/{reader}-{chapter}-{trial}.wav"

def get_ground_truth(filename):
    path_prefix = 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/LibriSpeech/train-clean-100'
    speaker, chapter, sentence = filename.split("-")
    path_to_ground_truths = f"{path_prefix}/{speaker}/{chapter}/{speaker}-{chapter}.trans.txt"

    with open(path_to_ground_truths, 'r') as file:
        for line in file:
            # Split the line by the dash and get the last element (the last number)
            parts = line.split('-')
            last_number = parts[-1].strip().split()[0]  # Extract number and remove surrounding whitespace
            
            # If the last number matches the search number, extract and return the rest of the line
            if last_number == str(sentence):
                # Get the part after the last number (remove the number and dashes)
                remainder = ' '.join(parts[-1].strip().split()[1:])
                return(remainder.lower())

def word_extraction_wrapper(files_to_search, save_to_path, word):
    for file in files_to_search:
        _, _, _, _, _, _, _, speaker, chapter, filename = file.split('/')
        
        ground_truth = get_ground_truth(os.path.splitext(filename)[0])
        print(ground_truth)
        save_to = f"{save_to_path}/{word}_before_after/{speaker}/{filename}"
        segment_word(file, save_to, ground_truth, word)

In [158]:
# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
              r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt",
              r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt"]

spoken_words = count_word_occurrences(file_paths)
for word in spoken_words:
    print(word)

(593, 'the', ['311-124404-0069', '311-124404-0036', '201-122255-0037', '201-122255-0035', '311-124404-0009', '201-127786-0014', '201-127786-0080', '311-124404-0034', '311-124404-0015', '311-124404-0117', '201-127786-0064', '201-127786-0050', '311-124404-0039', '311-124404-0060', '201-127786-0077', '311-124404-0075', '311-124404-0089', '311-124404-0100', '311-124404-0004', '311-124404-0090', '311-124404-0066', '311-124404-0115', '201-127786-0078', '201-122255-0014', '311-124404-0062', '311-124404-0008', '311-124404-0119', '311-124404-0006', '311-124404-0052', '201-122255-0020', '201-127786-0006', '311-124404-0011', '201-127786-0045', '311-124404-0037', '201-127786-0007', '201-127786-0075', '201-127786-0057', '311-124404-0002', '311-124404-0080', '201-127786-0005', '311-124404-0014', '311-124404-0085', '311-124404-0097', '201-127786-0081', '201-127786-0069', '201-127786-0059', '311-124404-0088', '201-122255-0013', '201-122255-0017', '201-127786-0027', '201-122255-0016', '201-122255-0036'

In [159]:
for word in spoken_words:
    if word[1] == "that":
        filenames = word[2]

path_prefix  = "C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/cleaned"
save_to_path = "C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words"

files_to_search = [generate_path(filename, path_prefix) for filename in filenames]

word_extraction_wrapper(files_to_search, save_to_path, "that")

and of having been the first to teach that there are many small passages at the extremities of the arteries through which the blood received by them from the heart passes into the small branches of the veins
[('teach', 'that', 'there')]
Recognized words: <s> <sil> and as having been the first to teach <sil> that there are many small passages <sil> a teacher mix of the or east <sil> through which the blood <sil> perceived by them from the market <sil> passes into small branches of the things </s>
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that_before_after/311/311-124404-0069_1.wav' from 2090ms to 2250ms.
of the same matter with that i had described and at first placed in it no rational soul nor any other principle in room of the vegetative or sensitive soul beyond kindling in the heart one of those fires without light
[('with', 'that', 'i')]
Recognized words: <s> of the same <sil> matter <sil> with that <sil> i had to scroll it

### Tests