# Analyze word and phone counts

In [2]:
def count_words_phones(alignments, words_counts, phones_counts):
    """
    Tally word and phone labels from one set of alignments.

    `alignments` is indexed by key; each value is iterated as
    (xmin, xmax, text) triples. Texts under the 'words' key are added to
    `words_counts` and texts under the 'phones' key to `phones_counts`;
    entries under any other key are unpacked but not counted. Both
    dictionaries are updated in place and returned as
    (words_counts, phones_counts).
    """
    for section in alignments:
        for _xmin, _xmax, label in alignments[section]:
            if section == 'phones':
                phones_counts[label] = phones_counts.get(label, 0) + 1
            if section == 'words':
                words_counts[label] = words_counts.get(label, 0) + 1
    return words_counts, phones_counts

In [3]:
import os
from utils_alignments import parse_textgrid

alignments_dir = '../datasets/alignments'

# Running tallies shared across every TextGrid file found below.
words_counts, phones_counts = {}, {}

for root, dirs, files in os.walk(alignments_dir):
    for file in files:
        # Only TextGrid files carry alignments; skip everything else.
        if not file.endswith(".TextGrid"):
            continue
        textgrid_path = os.path.join(root, file)
        alignments = parse_textgrid(textgrid_path)
        words_counts, phones_counts = count_words_phones(
            alignments, words_counts, phones_counts
        )


In [9]:
import json
file_name = 'words_counts.json'
file_name1 = 'phones_counts.json'

# Write each tally to its own indented JSON file in the working directory.
for out_path, tally in ((file_name, words_counts), (file_name1, phones_counts)):
    with open(out_path, 'w') as file:
        json.dump(tally, file, indent=4)

In [5]:
# Show the phone tallies accumulated while walking the TextGrid files.
phones_counts

{'sil': 5082,
 'AY1': 6080,
 'W': 8340,
 'UH1': 1774,
 'D': 17977,
 'AH0': 29414,
 'N': 26333,
 'T': 25466,
 'G': 3260,
 'IH1': 9551,
 'V': 7719,
 'HH': 8071,
 'AE1': 9801,
 'F': 6836,
 'P': 6936,
 'EH2': 498,
 'IY0': 7108,
 'R': 15513,
 'ER0': 8352,
 'DH': 11685,
 'AH1': 8134,
 'OW1': 4051,
 'L': 14913,
 'AA1': 4739,
 'sp': 16887,
 'M': 10784,
 'AY2': 351,
 'K': 9849,
 'EH1': 10214,
 'IH0': 13416,
 'S': 17463,
 'IY1': 6738,
 'EY1': 5407,
 'Z': 10575,
 'UW1': 4305,
 'AA2': 207,
 'B': 6440,
 'OY1': 366,
 'SH': 2912,
 'ER1': 2142,
 'AO1': 4942,
 'CH': 2158,
 'TH': 2417,
 'NG': 3880,
 'JH': 1672,
 'Y': 2554,
 'UW0': 382,
 'AW1': 2131,
 'AH2': 172,
 'OW2': 198,
 'IH2': 403,
 'spn': 356,
 'OW0': 510,
 'EH0': 253,
 'EY2': 320,
 'AE0': 116,
 'ZH': 186,
 'UW2': 134,
 'AY0': 109,
 'AE2': 265,
 'AO2': 168,
 'AA0': 131,
 'UH2': 40,
 'EY0': 46,
 'AW2': 79,
 'AO0': 87,
 'IY2': 173,
 'ER2': 32,
 'OY2': 8,
 'UH0': 12,
 'AW0': 9}

In [12]:
# Use the explicit %pip magic so the install targets the running kernel's
# environment, and pin the version this notebook was executed with.
%pip install numpy==1.26.4

Collecting numpy
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m84.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


In [13]:
import numpy as np
def load_glove_embeddings(file_path):
    """
    Read a whitespace-delimited embeddings text file into a dictionary.

    Each non-blank line is split on whitespace; the first field is the word
    and the remaining fields are parsed as float32 values.

    Args:
        file_path: Path to a UTF-8 text file with one entry per line.

    Returns:
        Dict mapping each word to a 1-D float32 numpy array. If a word occurs
        on several lines, the last occurrence wins.

    Raises:
        ValueError: If a field after the word cannot be parsed as a float.
    """
    embeddings_dict = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line has no word field; skip it
            # rather than failing with IndexError on values[0].
            if not values:
                continue
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict


def get_embeddings_glove(input_text, glove_embeddings, repeats=12):
    """
    Look up GloVe vectors for the whitespace-separated words of a text.

    Args:
        input_text: String to split on whitespace; words are looked up exactly
            as written (no lower-casing or punctuation stripping).
        glove_embeddings: Mapping from word to vector, e.g. the dict returned
            by `load_glove_embeddings`.
        repeats: How many times each word's vector is repeated in its list.
            Defaults to 12, the value previously hard-coded here.
            NOTE(review): 12 presumably mirrors a layer count used
            elsewhere - confirm.

    Returns:
        Dict mapping each word found in `glove_embeddings` to a list holding
        `repeats` references to the same vector object (not copies). Words
        without an embedding are left out; a word that occurs several times
        in the text appears once.
    """
    embeddings = {}
    for word in input_text.split():
        vector = glove_embeddings.get(word)
        # Skip out-of-vocabulary words up front instead of building a
        # placeholder list and filtering it out afterwards.
        if vector is None:
            continue
        embeddings[word] = [vector] * repeats
    return embeddings

In [14]:
# Path to the GloVe vectors file, relative to the working directory.
glove_file_path = '../datasets/glove.42B.300d.txt'

# Read the whole file into a word -> float32 vector dictionary.
glove_embeddings = load_glove_embeddings(glove_file_path)
# Look up a sample phrase; words missing from the table are dropped from the result.
input_text = 'hi how are you'
embeddings_audio = get_embeddings_glove(input_text, glove_embeddings)
    


In [32]:
# View over the words (keys) of the loaded GloVe table.
keys = glove_embeddings.keys()

In [36]:
# Number of distinct words seen in the alignments (dict keys are already unique).
len(words_counts)

11715

In [37]:
# Words from the alignments that have no entry in the GloVe table.
{word for word in words_counts if word not in glove_embeddings}

{'affrightened',
 "ain't",
 "alexander's",
 "ann's",
 "anne's",
 "another's",
 "antonia's",
 "apostle's",
 "aren't",
 "argyle's",
 "aunt's",
 "author's",
 "baby's",
 "balaam's",
 "bear's",
 "beggar's",
 'blacknesses',
 "blanco's",
 'blemmyes',
 "boy's",
 "brewer's",
 "bright's",
 "brother's",
 "brown's",
 "bubble's",
 'bunnit',
 "captain's",
 "captive's",
 "carpaccio's",
 "catherine's",
 "charlie's",
 "chaucer's",
 "child's",
 "children's",
 "christ's",
 "chunky's",
 "church's",
 "clergyman's",
 "commandant's",
 "connell's",
 "consid'ble",
 'constrainedly',
 "consumer's",
 "cook's",
 "couldn't",
 "country's",
 "court's",
 "cousin's",
 'creeters',
 "culprit's",
 "cumberland's",
 "customer's",
 "cynthia's",
 "dante's",
 "daren't",
 "darwin's",
 "david's",
 "dawn's",
 "day's",
 "delia's",
 "dent's",
 "detective's",
 "dinah's",
 'disburdened',
 'dorriforth',
 "dragon's",
 "dramatist's",
 "druggist's",
 "dummy's",
 'egoisms',
 "either's",
 "elmo's",
 "else's",
 "emperor's",
 "enemy's",
 'en

In [38]:
# How many aligned words are missing from the GloVe table.
sum(1 for word in words_counts if word not in glove_embeddings)

316