# Analyze word and phone counts

In [2]:
def count_words_phones(alignments, words_counts, phones_counts):
    """
    Tally word and phone labels from one set of alignments.

    `alignments` is indexed by tier name; each tier is iterated as a sequence
    of ``(xmin, xmax, text)`` triples. Labels from the ``'words'`` tier are
    added to `words_counts`, labels from the ``'phones'`` tier are added to
    `phones_counts`. Entries of any other tier are unpacked but not counted.

    Both dictionaries are updated in place, so the function can be called
    repeatedly to accumulate totals over many files.

    Returns the tuple ``(words_counts, phones_counts)`` (the same objects that
    were passed in).
    """
    # Route each counted tier to the dictionary that accumulates its labels.
    tallies = {'phones': phones_counts, 'words': words_counts}

    for tier in alignments:
        for _start, _end, label in alignments[tier]:
            if tier not in tallies:
                continue
            bucket = tallies[tier]
            bucket[label] = bucket.get(label, 0) + 1

    return words_counts, phones_counts

In [3]:
import os
from alignments.utils_alignments import parse_textgrid

alignments_dir = '../datasets/alignments'

words_counts = {}
phones_counts = {}

# Lazily enumerate every *.TextGrid file below the alignments directory,
# in the order os.walk yields them.
textgrid_paths = (
    os.path.join(dirpath, name)
    for dirpath, _dirnames, names in os.walk(alignments_dir)
    for name in names
    if name.endswith(".TextGrid")
)

# Parse each file and fold its word/phone labels into the running tallies.
for textgrid_path in textgrid_paths:
    alignments = parse_textgrid(textgrid_path)
    words_counts, phones_counts = count_words_phones(alignments, words_counts, phones_counts)


In [4]:
import json
file_name = 'words_counts.json'
file_name1 = 'phones_counts.json'

# Write each tally to its own pretty-printed JSON file in the working directory
# (words first, then phones).
for out_path, counts in ((file_name, words_counts), (file_name1, phones_counts)):
    with open(out_path, 'w') as out_file:
        json.dump(counts, out_file, indent=4)

In [5]:
# Display the per-label phone tallies accumulated from the TextGrid alignments.
phones_counts

{'sil': 5082,
 'AY1': 6080,
 'W': 8340,
 'UH1': 1774,
 'D': 17977,
 'AH0': 29414,
 'N': 26333,
 'T': 25466,
 'G': 3260,
 'IH1': 9551,
 'V': 7719,
 'HH': 8071,
 'AE1': 9801,
 'F': 6836,
 'P': 6936,
 'EH2': 498,
 'IY0': 7108,
 'R': 15513,
 'ER0': 8352,
 'DH': 11685,
 'AH1': 8134,
 'OW1': 4051,
 'L': 14913,
 'AA1': 4739,
 'sp': 16887,
 'M': 10784,
 'AY2': 351,
 'K': 9849,
 'EH1': 10214,
 'IH0': 13416,
 'S': 17463,
 'IY1': 6738,
 'EY1': 5407,
 'Z': 10575,
 'UW1': 4305,
 'AA2': 207,
 'B': 6440,
 'OY1': 366,
 'SH': 2912,
 'ER1': 2142,
 'AO1': 4942,
 'CH': 2158,
 'TH': 2417,
 'NG': 3880,
 'JH': 1672,
 'Y': 2554,
 'UW0': 382,
 'AW1': 2131,
 'AH2': 172,
 'OW2': 198,
 'IH2': 403,
 'spn': 356,
 'OW0': 510,
 'EH0': 253,
 'EY2': 320,
 'AE0': 116,
 'ZH': 186,
 'UW2': 134,
 'AY0': 109,
 'AE2': 265,
 'AO2': 168,
 'AA0': 131,
 'UH2': 40,
 'EY0': 46,
 'AW2': 79,
 'AO0': 87,
 'IY2': 173,
 'ER2': 32,
 'OY2': 8,
 'UH0': 12,
 'AW0': 9}

## Analyze unique/different tokens for GloVe, BERT, and wav2vec2

In [4]:
from tqdm import tqdm
import os 
import json

directory = '../experiments'
model1 = 'wav2vec2'
model2 = 'glove'
model3 = 'bert-base-uncased'

# Per-model vocabulary: every word key seen for that model across all
# processed audio files.
bert_keys = set()
glove_keys = set()
wav2vec2_keys = set()

# The wav2vec2 directory drives the iteration; the matching GloVe and BERT
# files are located through the identifier that follows the last '_' in the
# wav2vec2 file name.
files = sorted(os.listdir(os.path.join(directory, model1)))
for filename in tqdm(files):
    identifier = filename.split('_')[-1]

    model1_path = os.path.join(directory, model1, filename)
    model2_path = os.path.join(directory, model2, f'embeddings_words_{identifier}')
    model3_path = os.path.join(directory, model3, f'embeddings_words_{identifier}')

    # All three files are opened below, so all three must exist. Checking only
    # the GloVe file would raise FileNotFoundError when the BERT file is missing.
    if not all(os.path.isfile(path) for path in (model1_path, model2_path, model3_path)):
        continue

    with open(model1_path, 'r') as model1_file, open(model2_path, 'r') as model2_file, open(model3_path, 'r') as model3_file:
        model1_data = json.load(model1_file)
        model2_data = json.load(model2_file)
        model3_data = json.load(model3_file)

    # Each JSON document is indexed by the identifier without its extension.
    # Indexing [0] instead of unpacking into `_` keeps IPython's last-output
    # variable intact.
    audio = os.path.splitext(identifier)[0]

    # Accumulate (not intersect) the keys of each model; the intersection and
    # union are computed afterwards from these three sets.
    bert_keys.update(model3_data[audio].keys())
    glove_keys.update(model2_data[audio].keys())
    wav2vec2_keys.update(model1_data[audio].keys())

100%|██████████| 2703/2703 [07:57<00:00,  5.66it/s]


In [13]:
# Word keys present in all three vocabularies (BERT, GloVe, wav2vec2).
common_keys = bert_keys.intersection(glove_keys, wav2vec2_keys)
len(common_keys)


8001

In [12]:
# Word keys present in at least one of the three vocabularies.
all_keys = bert_keys.union(glove_keys, wav2vec2_keys)
len(all_keys)

8372

In [10]:
# Keys missing from at least one model's vocabulary (union minus intersection).
all_keys - common_keys

{"'",
 '<unk>',
 'ain',
 "ain't",
 "alexander's",
 'ambrosch',
 "ann's",
 "antonia's",
 "apostle's",
 'aren',
 "aren't",
 "aunt's",
 "author's",
 'avrigny',
 "baby's",
 'balvastro',
 'bambeday',
 'beenie',
 'bennydeck',
 'bergez',
 'bhunda',
 'birdikins',
 'blanco',
 "blanco's",
 'blemmyes',
 'boolooroo',
 "boy's",
 'bozzle',
 'brandd',
 'brau',
 'breadhouse',
 'brewer',
 "brewer's",
 "bright's",
 "brother's",
 "brown's",
 'burgoynes',
 "can't",
 'canyou',
 "cap'n",
 "captain's",
 "captive's",
 "catherine's",
 'chaba',
 'charlie',
 "charlie's",
 "child's",
 "chunky's",
 "church's",
 'collander',
 "commandant's",
 'congal',
 "connell's",
 "cook's",
 'corncakes',
 'couldn',
 "couldn't",
 'creeters',
 'culprit',
 "culprit's",
 'daguerreotypist',
 "dante's",
 'daren',
 "daren't",
 'darfhulva',
 'darwin',
 "darwin's",
 "david's",
 "day's",
 'delaunay',
 'delectasti',
 'delia',
 "delia's",
 'dent',
 "dent's",
 'derivatively',
 "detective's",
 'dhourra',
 'didn',
 "didn't",
 'docetes',
 'does

In [11]:
common = 'common_keys.json'

# A set of strings iterates in an order that depends on per-process hash
# randomization, so dumping list(common_keys) would produce a differently
# ordered file on every kernel restart. Sorting makes the output reproducible.
with open(common, 'w') as file:
    json.dump(sorted(common_keys), file, indent=4)


In [5]:
import os
from tqdm import tqdm
import json

model1 = 'wav2vec2'
directory = '../experiments'
# Flat list of [word, identifier] pairs, in file order and then alphabetical
# word order within each file.
words_order = []
files = sorted(os.listdir(os.path.join(directory, model1)))
model2 = 'bert-base-uncased'
model3 = 'glove'

for filename in tqdm(files):
    # The identifier is whatever follows the last '_' in the wav2vec2 file name;
    # it is reused to locate the matching BERT and GloVe files.
    identifier = filename.split('_')[-1]

    model1_path = os.path.join(directory, model1, filename)
    model2_path = os.path.join(directory, model2, f'embeddings_words_{identifier}')
    model3_path = os.path.join(directory, model3, f'embeddings_words_{identifier}')

    # All three files are opened below, so all three must exist. Checking only
    # model2_path would raise FileNotFoundError when the GloVe file is missing.
    if not all(os.path.isfile(path) for path in (model1_path, model2_path, model3_path)):
        continue

    with open(model1_path, 'r') as model1_file, open(model2_path, 'r') as model2_file, open(model3_path, 'r') as model3_file:
        model1_data = json.load(model1_file)
        model2_data = json.load(model2_file)
        model3_data = json.load(model3_file)

    # Each JSON document is indexed by the identifier without its extension.
    # Indexing [0] instead of unpacking into `_` keeps IPython's last-output
    # variable intact.
    audio = os.path.splitext(identifier)[0]

    # Words available in all three models for this file. A dedicated name is
    # used so the notebook-level `common_keys` set (global intersection) is not
    # overwritten by a per-file list.
    shared_words = set(model1_data[audio].keys()) & set(model2_data[audio].keys()) & set(model3_data[audio].keys())

    # NOTE(review): the stored identifier is the raw file-name tail (extension
    # included, if any), not `audio` - confirm this is what consumers expect.
    words_order.extend([word, identifier] for word in sorted(shared_words))

with open('words_in_order_audio.json', 'w') as f:
    json.dump(words_order, f)

  0%|          | 0/2703 [00:00<?, ?it/s]

100%|██████████| 2703/2703 [14:02<00:00,  3.21it/s]
