In [1]:
import json
import pathlib

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# JSON file whose 'mappings' keys are later treated as the valid IPA characters.
mapping_path = pathlib.Path('data', 'mappings.json')

# Dataset identifiers handed to `load_dataset`. Order matters: later cells
# treat index 0 as Russian and index 1 as Polish.
dataset_names = [
    'iggy12345/ru-reviews-classification-ipa',
    'iggy12345/allegro-reviews-ipa',
]

# Inventory Analysis
Now I want to see which phonemes occur in each language and which are shared between them.

In [3]:
# Load every dataset up front; entries keep the order of `dataset_names`.
datasets = [load_dataset(name) for name in dataset_names]

In [4]:
def collect_characters(ds, split: str, current_set: dict, column: str = 'text-phoneme') -> None:
    """Tally how often each character occurs in one split of a dataset.

    Args:
        ds: Object indexable by split name; iterating ``ds[split]`` must
            yield rows that can be indexed by ``column``.
        split: Name of the split to scan (e.g. ``'train'``).
        current_set: Mapping of character -> occurrence count. It is updated
            in place, so repeated calls with the same dict accumulate totals
            across splits.
        column: Row field holding the text to scan. Defaults to
            ``'text-phoneme'``.

    Returns:
        None. The counts are written into ``current_set``.
    """
    for row in tqdm(ds[split]):
        for c in row[column]:
            # `.get` covers the first sighting of a character without a branch
            # and keeps first-seen insertion order in the dict.
            current_set[c] = current_set.get(c, 0) + 1

# One running character -> count tally per loaded dataset. The list is sized
# from `datasets` instead of being hard-coded to two entries, so adding a
# dataset name cannot raise an IndexError here.
character_sets = [{} for _ in datasets]
for tally, dataset in zip(character_sets, datasets):
    # Both splits feed the same tally, giving per-dataset totals.
    for split in ('train', 'validation'):
        collect_characters(dataset, split, tally)

print(character_sets[0])
print(character_sets[1])


100%|██████████| 45000/45000 [00:01<00:00, 28396.66it/s]
100%|██████████| 15000/15000 [00:00<00:00, 28434.01it/s]
100%|██████████| 9577/9577 [00:00<00:00, 16217.64it/s]
100%|██████████| 1002/1002 [00:00<00:00, 15766.94it/s]

{'f': 57312, 's': 343919, 'ʲ': 820597, 'ˈ': 1087370, 'ɵ': 9276, ' ': 1235619, 'p': 214322, 'r': 342261, 'i': 437693, 'ʃ': 183779, 'ɭ': 276356, 'o': 267394, 'a': 280427, 'b': 80519, 'ʌ': 557284, '.': 140602, 't': 553372, 'k': 272929, 'n': 423682, 'm': 160677, 'v': 236206, 'e': 248997, 'j': 220634, 'ʒ': 48046, 'y': 150556, 'd': 174949, 'ɑ': 335726, '\n': 8004, 'u': 182348, 'T': 559, 'h': 3328, 'c': 1975, 'l': 5247, 'H': 86, 'w': 933, 'g': 1191, 'I': 614, 'q': 200, 'z': 103608, ',': 127128, '!': 41334, '"': 8940, 'ɛ': 76100, 'x': 49753, 'ɕ': 18670, 'ɪ': 140816, '3': 5894, '1': 15703, 'ɡ': 45729, '(': 11202, 'ʑ': 10388, '4': 14406, '6': 8210, 'ˌ': 14666, '5': 8455, '0': 13395, '-': 5822, '2': 11178, '8': 6366, ')': 11158, '^': 1682, '/': 1328, 'A': 327, 'Х': 92, 'L': 5034, 'О': 339, 'Г': 104, 'Б': 107, 'X': 3282, 'с': 1701, 'м': 1331, ':': 2263, '\\': 37, '7': 5442, 'C': 177, 'D': 280, 'E': 272, 'K': 86, '9': 4481, 'а': 1559, 'е': 1240, 'н': 715, 'я': 318, 'S': 2145, '%': 573, 'д': 464, 'л




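Let's filter the character tallies to get rid of these extra characters (digits, punctuation, Latin capitals, Cyrillic letters) and keep only real IPA symbols.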

In [5]:
# Read the mapping file explicitly as UTF-8. Without `encoding`, `open` falls
# back to the platform's default encoding, which is not UTF-8 everywhere and
# would mis-decode (or fail on) non-ASCII IPA symbols stored in the JSON.
with open(mapping_path, 'r', encoding='utf-8') as f:
    mappings = json.load(f)

In [6]:
# Characters accepted as genuine IPA symbols: the keys of the 'mappings'
# entry (iterating a dict yields its keys).
real_ipa_chars = set(mappings['mappings'])

In [7]:
# Per-dataset occurrence counts restricted to recognised IPA characters.
character_supports = [
    {char: count for char, count in tally.items() if char in real_ipa_chars}
    for tally in character_sets
]
# NOTE: this rebinds `character_sets` from a list of count dicts to a list of
# plain sets, so re-running this cell without re-running the tally cell fails.
character_sets = [set(supports) for supports in character_supports]

In [8]:
# Show the filtered inventory of the first two datasets, one per line.
for dataset_index in (0, 1):
    print(character_sets[dataset_index])

{'h', 'l', 'ɛ', 'p', 'r', 'm', 'v', 'j', 'y', 'z', 'n', 'b', 'ç', 'ɵ', 'ɕ', 'ɪ', 'i', 's', 'c', 't', 'ɡ', 'ɭ', 'f', 'ŋ', 'ɔ', 'e', 'u', 'ʌ', 'x', 'ʃ', 'ʑ', 'ɑ', 'a', 'o', 'w', 'ʒ', 'q', 'd', 'k', 'ə'}
{'l', 'h', 'ɹ', 'ɛ', 'ʂ', 'p', 'r', 'm', 'v', 'j', 'y', 'z', 'n', 'b', 'ç', 'ɕ', 'i', 'ɣ', 's', 'ɡ', 't', 'ŋ', 'c', 'f', 'ɲ', 'ɔ', 'e', 'u', 'x', 'ʃ', 'ʑ', 'a', 'o', 'ɨ', 'w', 'ʒ', 'q', 'd', 'k'}


## Disjoint phonemes

In [9]:
# Phonemes that appear in only one of the two inventories
# (index 0 is the Russian dataset, index 1 the Polish one).
russian_phonemes = character_sets[0].difference(character_sets[1])
polish_phonemes = character_sets[1].difference(character_sets[0])

In [24]:
print(russian_phonemes)
print(len(russian_phonemes))
# Emit one `\ipa{...} & count \\` row per Russian-only phoneme, most frequent
# first (the format looks like a LaTeX table row).
# The backslash is written as `\\ipa` because `\i` is not a valid escape
# sequence: Python 3.12+ warns about it and a future version will reject it.
# The printed text is unchanged.
for c in sorted(russian_phonemes, key=lambda c: character_supports[0][c], reverse=True):
    print(f'\\ipa{{{c}}} & {character_supports[0][c]:,d} \\\\')

{'ɑ', 'ʌ', 'ɵ', 'ɪ', 'ə', 'ɭ'}
6
\ipa{ʌ} & 557,284 \\
\ipa{ɑ} & 335,726 \\
\ipa{ɭ} & 276,356 \\
\ipa{ɪ} & 140,816 \\
\ipa{ɵ} & 9,276 \\
\ipa{ə} & 27 \\


In [25]:
print(polish_phonemes)
print(len(polish_phonemes))
# Emit one `\ipa{...} & count \\` row per Polish-only phoneme, most frequent
# first (the format looks like a LaTeX table row).
# The backslash is written as `\\ipa` because `\i` is not a valid escape
# sequence: Python 3.12+ warns about it and a future version will reject it.
# The printed text is unchanged.
for c in sorted(polish_phonemes, key=lambda c: character_supports[1][c], reverse=True):
    print(f'\\ipa{{{c}}} & {character_supports[1][c]:,d} \\\\')

{'ɹ', 'ɨ', 'ʂ', 'ɣ', 'ɲ'}
5
\ipa{ɨ} & 176,323 \\
\ipa{ɲ} & 107,841 \\
\ipa{ɣ} & 137 \\
\ipa{ɹ} & 14 \\
\ipa{ʂ} & 2 \\


## Overlapping Phonemes

In [26]:
# Phonemes present in both inventories.
shared_phonemes = character_sets[0].intersection(character_sets[1])
# Combined occurrence count of each shared phoneme across the two datasets;
# a phoneme missing from either count table contributes 0 for that side.
shared_supports = {
    c: character_supports[0].get(c, 0) + character_supports[1].get(c, 0)
    for c in shared_phonemes
}

In [27]:
print(len(shared_phonemes))
# Emit one `\ipa{...} & count \\` row per shared phoneme, ordered by combined
# count, highest first (the format looks like a LaTeX table row).
# The backslash is written as `\\ipa` because `\i` is not a valid escape
# sequence: Python 3.12+ warns about it and a future version will reject it.
# The printed text is unchanged.
for c in sorted(shared_phonemes, key=lambda c: shared_supports[c], reverse=True):
    print(f'\\ipa{{{c}}} & {shared_supports[c]:,d} \\\\')

34
\ipa{t} & 878,504 \\
\ipa{a} & 668,252 \\
\ipa{n} & 578,842 \\
\ipa{i} & 567,808 \\
\ipa{s} & 508,215 \\
\ipa{ɛ} & 488,556 \\
\ipa{r} & 469,087 \\
\ipa{k} & 425,868 \\
\ipa{v} & 363,888 \\
\ipa{ɔ} & 351,778 \\
\ipa{p} & 344,045 \\
\ipa{j} & 336,394 \\
\ipa{u} & 308,375 \\
\ipa{d} & 305,238 \\
\ipa{ʃ} & 290,236 \\
\ipa{m} & 287,712 \\
\ipa{o} & 267,668 \\
\ipa{e} & 249,277 \\
\ipa{z} & 170,217 \\
\ipa{y} & 150,635 \\
\ipa{b} & 145,341 \\
\ipa{ɕ} & 116,897 \\
\ipa{f} & 102,456 \\
\ipa{l} & 100,836 \\
\ipa{ɡ} & 96,677 \\
\ipa{ʒ} & 89,503 \\
\ipa{x} & 79,724 \\
\ipa{w} & 73,836 \\
\ipa{ʑ} & 41,256 \\
\ipa{ŋ} & 9,850 \\
\ipa{h} & 3,745 \\
\ipa{c} & 2,234 \\
\ipa{ç} & 700 \\
\ipa{q} & 207 \\
