In [1]:
import json
import pathlib

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# JSON file whose 'mappings' keys are later used as the accepted IPA characters.
mapping_path = pathlib.Path('data', 'mappings.json')

# Dataset identifiers handed to load_dataset; the order fixes the index of each
# dataset in every per-dataset list built further down.
dataset_names = [
    'iggy12345/ru-reviews-classification-ipa',
    'iggy12345/allegro-reviews-ipa',
]

# Inventory Analysis
Now I want to see which phonemes occur in each language and which are shared between the two.

In [3]:
# Load every dataset named in dataset_names, keeping the same order.
datasets = [load_dataset(name) for name in dataset_names]

In [4]:
def collect_characters(ds, split: str, current_set: set) -> None:
    """Add every character found in one split's phoneme text to ``current_set``.

    Args:
        ds: Mapping from split name to an iterable of rows; each row must
            provide an iterable of characters under the ``'text-phoneme'`` key.
        split: Name of the split to scan (looked up as ``ds[split]``).
        current_set: Set that is updated in place with the characters seen.

    Returns:
        None. The result is accumulated in ``current_set``.
    """
    for row in tqdm(ds[split]):
        # set.update consumes the text one character at a time, so no explicit
        # inner loop (or unused enumerate index) is needed.
        current_set.update(row['text-phoneme'])

# One accumulator per loaded dataset; sizing the list from `datasets` keeps it
# in step with dataset_names instead of hard-coding exactly two entries.
character_sets = [set() for _ in datasets]
for dataset, character_set in zip(datasets, character_sets):
    # Pool the train and validation splits into the same per-dataset set.
    for split in ('train', 'validation'):
        collect_characters(dataset, split, character_set)

# Show the raw character inventory of each dataset (set ordering is arbitrary).
for character_set in character_sets:
    print(character_set)


100%|██████████| 45000/45000 [00:01<00:00, 32582.43it/s]
100%|██████████| 15000/15000 [00:00<00:00, 31751.51it/s]
100%|██████████| 9577/9577 [00:00<00:00, 21350.86it/s]
100%|██████████| 1002/1002 [00:00<00:00, 20931.41it/s]

{'ʌ', '!', 'ж', 'т', 'п', '=', 'щ', 'Z', 'j', 'J', '-', 'о', '。', '➖', '@', '´', 'А', '}', 't', 'М', '）', 'K', 'ı', '∀', 'h', '$', '…', 'ɵ', 'ŋ', 'x', '»', 'ф', 'ú', 'З', 'ɪ', 'ю', 'O', 'Д', 'ヽ', 'W', 'И', 'v', '1', '%', 'О', 'é', 'ﾉ', 'к', 'Y', 'П', 'ш', 'ñ', ':', 'ɭ', 'Й', '♡', '（', 'Е', '‘', 'T', 'm', 'I', '#', 'ɡ', 'ё', 'F', '★', 'ɕ', 'C', 'в', '❣', 'ɛ', '/', '•', 'Ц', '☺', 'n', '_', 'ы', 'Л', '–', 'Б', '7', 'H', '“', 'ä', 'Х', 'ц', '？', 'ɔ', '№', 'Ы', 's', '◡', 'p', 'u', 'b', 'ə', '₽', 'м', 'f', 'S', 'U', '，', 'с', '％', 'Щ', 'q', '°', '4', 'B', 'í', 'й', 'ʃ', '?', '*', 'c', 'z', '"', '^', 'Ф', ']', '❤', '❌', '+', '️', 'К', 'y', 'я', '|', '≈', '➕', '(', '’', '́', 'ʲ', 'N', '✔', 'р', 'ʒ', 'R', '℅', ' ', 'ß', '2', '✌', 'В', 'н', '\n', 'ş', 'E', ';', 'g', '8', '0', '©', 'ó', '⭐', '❄', 'е', 'д', 'Я', '✩', 'ˈ', 'ʑ', 'э', 'х', '❗', '×', 'X', '☝', '«', 'Г', 'ω', 'ч', '－', 'd', '3', '\\', 'л', 'a', '‼', 'o', '6', '5', '！', 'У', 'k', '⛔', 'D', 'Ш', ',', '¡', 'á', '☹', '✨', 'V', 'С', 'Ж', 'P




Let's filter the character sets to get rid of these extra characters.

In [11]:
# Read the mapping table as UTF-8 explicitly: the platform default encoding is
# not guaranteed to be UTF-8, and the table's keys are later compared against
# non-ASCII IPA characters.
with open(mapping_path, 'r', encoding='utf-8') as f:
    mappings = json.load(f)

In [14]:
# Iterating a dict yields its keys, so the keys of the 'mappings' table become
# the set of characters accepted as real IPA symbols.
real_ipa_chars = set(mappings['mappings'])

In [15]:
# Keep only the characters that are known IPA symbols, one set per dataset.
character_sets = [found & real_ipa_chars for found in character_sets]

In [16]:
# Print the two filtered inventories, one per line, in dataset order.
print(character_sets[0], character_sets[1], sep='\n')

{'ʌ', 'ʒ', 'j', 't', 'ɔ', 'h', 's', 'ɵ', 'ŋ', 'x', 'p', 'u', 'b', 'ə', 'ɪ', 'f', 'ç', 'ʑ', 'q', 'ɑ', 'v', 'w', 'i', 'ɭ', 'ʃ', 'c', 'z', 'd', 'a', 'm', 'ɡ', 'o', 'l', 'ɕ', 'e', 'y', 'k', 'ɛ', 'r', 'n'}
{'ɲ', 'ʒ', 'ʂ', 'j', 't', 'ɔ', 'h', 's', 'ɨ', 'ŋ', 'p', 'x', 'u', 'b', 'f', 'ɹ', 'ç', 'ʑ', 'q', 'v', 'w', 'i', 'ʃ', 'z', 'c', 'd', 'a', 'm', 'ɡ', 'o', 'l', 'ɕ', 'e', 'y', 'k', 'ɛ', 'ɣ', 'r', 'n'}


## Disjoint Phonemes

In [17]:
# Characters that appear in only one of the two inventories (index 0 is the
# 'ru-reviews' dataset, index 1 the 'allegro-reviews' dataset).
russian_phonemes = character_sets[0].difference(character_sets[1])
polish_phonemes = character_sets[1].difference(character_sets[0])

In [18]:
# Characters present in character_sets[0] but absent from character_sets[1].
print(russian_phonemes)

{'ʌ', 'ɑ', 'ɵ', 'ə', 'ɭ', 'ɪ'}


In [19]:
# Characters present in character_sets[1] but absent from character_sets[0].
print(polish_phonemes)

{'ɨ', 'ɲ', 'ʂ', 'ɣ', 'ɹ'}


## Overlapping Phonemes

In [20]:
# Characters common to both filtered inventories.
shared_phonemes = character_sets[0].intersection(character_sets[1])

In [21]:
# Characters found in both character_sets[0] and character_sets[1].
print(shared_phonemes)

{'ʒ', 'j', 't', 'ɔ', 'h', 's', 'ŋ', 'p', 'x', 'u', 'b', 'f', 'ç', 'ʑ', 'q', 'v', 'w', 'i', 'ʃ', 'z', 'c', 'd', 'a', 'm', 'ɡ', 'o', 'l', 'ɕ', 'e', 'y', 'k', 'ɛ', 'r', 'n'}
