In [1]:
import os
from collections import defaultdict

In [2]:
def count_word_occurrences(file_paths):
    word_count = defaultdict(lambda: [0, set()])  # Default dictionary to hold (count, set of identifiers)
    
    # Read each file and count words
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                identifier, text = line.split(' ', 1)
                words = text.split()
                
                for word in words:
                    # Normalize words to lowercase for consistent counting
                    normalized_word = word.lower()
                    word_count[normalized_word][0] += 1  # Increment word count
                    word_count[normalized_word][1].add(identifier)  # Add identifier

    # Create a sorted list from the dictionary
    sorted_word_counts = [
        (count_info[0], word, list(count_info[1])) 
        for word, count_info in word_count.items()
    ]
    sorted_word_counts.sort(key=lambda x: x[0], reverse=True)  # Sort by count (descending)

    return sorted_word_counts

def count_speaker_occurrences(identifiers):
    speaker_count = defaultdict(int)  # Default dictionary to hold counts of each speaker

    # Count occurrences of each speaker
    for identifier in identifiers:
        speaker_number = identifier.split('-')[0]  # Extract speaker number
        speaker_count[speaker_number] += 1  # Increment the count for the speaker

    # Convert to a regular dictionary for better readability
    return dict(speaker_count)

In [3]:
# file_paths = [r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\198\19-198.trans.txt', 
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\227\19-227.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\495\26-495.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\496\26-496.trans.txt']

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt"]

#file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt"]
file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\1069\133699\1069-133699.trans.txt",
              r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\1069\133709\1069-133709.trans.txt"]

result = count_word_occurrences(file_paths)

In [None]:
result = count_word_occurrences(file_paths)
for word in result:
    print(word)

In [7]:
for word in result:
    if word[1] == 'which':
        print(word[1])
        print(word[2])
        result_speaker = count_speaker_occurrences(word[2])
        print(result_speaker)

which
['87-121553-0025', '87-121553-0092', '87-121553-0014', '87-121553-0007', '87-121553-0101', '87-121553-0067', '87-121553-0089', '87-121553-0023', '87-121553-0004', '87-121553-0057', '87-121553-0049', '87-121553-0070', '87-121553-0077', '87-121553-0061', '87-121553-0106', '87-121553-0031', '87-121553-0028', '87-121553-0075', '87-121553-0036', '87-121553-0071', '87-121553-0006', '87-121553-0029', '87-121553-0048', '87-121553-0012', '87-121553-0053', '87-121553-0058', '87-121553-0026', '87-121553-0035', '87-121553-0017', '87-121553-0060', '87-121553-0000', '87-121553-0084', '87-121553-0016', '87-121553-0104']
{'87': 34}


In [10]:
def count_string_in_subfolders(root_folder, target_string):
    result = []

    # Walk through the directory structure
    for subfolder in os.listdir(root_folder):
        subfolder_path = os.path.join(root_folder, subfolder)
        
        if os.path.isdir(subfolder_path):
            total_count = 0
            subsubfolders_with_string = []

            # Walk through the subsubfolders within each subfolder
            for subsubfolder in os.listdir(subfolder_path):
                subsubfolder_path = os.path.join(subfolder_path, subsubfolder)

                if os.path.isdir(subsubfolder_path):
                    subsubfolder_count = 0

                    # Look for the text file in each subsubfolder
                    for file_name in os.listdir(subsubfolder_path):
                        if file_name.endswith(".txt"):
                            file_path = os.path.join(subsubfolder_path, file_name)

                            try:
                                with open(file_path, 'r') as file:
                                    content = file.read()
                                    subsubfolder_count += content.count(target_string)
                            except Exception as e:
                                print(f"Error reading {file_path}: {e}")

                    if subsubfolder_count > 0:
                        total_count += subsubfolder_count
                        subsubfolders_with_string.append(subsubfolder)

            # Append the result as a tuple (count, subfolder name, subsubfolders list)
            if total_count > 0:
                result.append((total_count, subfolder, subsubfolders_with_string))
    
    # Sort the result by the count in descending order (greatest to least)
    result.sort(key=lambda x: x[0], reverse=True)

    return result

In [11]:
root = r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100"
word_count = count_string_in_subfolders(root, "THAT")
# 131231
# 131232
print(word_count)

[(120, '87', ['121553']), (117, '3240', ['131231', '131232']), (112, '4297', ['13006', '13009']), (109, '7800', ['283478', '283492', '283493']), (104, '311', ['124404']), (102, '2989', ['138028', '138035']), (100, '5463', ['39173', '39174']), (93, '201', ['122255', '127786']), (92, '460', ['172357', '172359']), (87, '3723', ['171115', '171631']), (84, '3242', ['67153', '67168', '8112']), (84, '481', ['123719', '123720']), (83, '4406', ['16882', '16883']), (83, '5789', ['57158', '57195', '70653']), (83, '5867', ['48852']), (81, '103', ['1240', '1241']), (81, '2514', ['149482']), (81, '3214', ['167602', '167606', '167607']), (80, '1069', ['133699', '133709']), (80, '1867', ['148436', '154071', '154075']), (78, '374', ['180298', '180299']), (76, '26', ['495', '496']), (76, '5393', ['19218', '19219']), (76, '8465', ['246940', '246942', '246943', '246947']), (75, '3879', ['173592', '174923']), (74, '5678', ['43301', '43302', '43303']), (73, '2518', ['154825', '154826']), (73, '307', ['12753