In [1]:
from copy import deepcopy
import json
import pathlib

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mapping_path = pathlib.Path('data') / 'mappings.json'

dataset_names = [
    'iggy12345/ru-reviews-classification-ipa',
    'iggy12345/allegro-reviews-ipa'
]

# Inventory Analysis
Now I want to see which phonemes are in each language and which are shared

In [3]:
datasets = list(map(load_dataset, dataset_names))

Generating train split: 100%|██████████| 45000/45000 [00:00<00:00, 201562.45 examples/s]
Generating validation split: 100%|██████████| 15000/15000 [00:00<00:00, 1305958.69 examples/s]
Generating test split: 100%|██████████| 15000/15000 [00:00<00:00, 1401027.92 examples/s]
Generating train split: 100%|██████████| 9577/9577 [00:00<00:00, 363650.64 examples/s]
Generating test split: 100%|██████████| 1006/1006 [00:00<00:00, 190357.75 examples/s]
Generating validation split: 100%|██████████| 1002/1002 [00:00<00:00, 208384.20 examples/s]


In [9]:
def collect_characters(ds, split: str, current_set: dict):
    for row in tqdm(ds[split]):
        for ci, c in enumerate(row['text']):
            if c not in current_set:
                current_set[c] = 1
            else:
                current_set[c] += 1

character_sets = [
    {},
    {},
]
for di, dataset in enumerate(datasets):
    collect_characters(dataset, 'train', character_sets[di])
    collect_characters(dataset, 'validation', character_sets[di])

print(character_sets[0])
print(character_sets[1])


100%|██████████| 45000/45000 [00:01<00:00, 26522.68it/s]
100%|██████████| 15000/15000 [00:00<00:00, 26044.50it/s]
100%|██████████| 9577/9577 [00:00<00:00, 16225.51it/s]
100%|██████████| 1002/1002 [00:00<00:00, 16114.31it/s]

{'в': 249958, 'с': 277455, 'ё': 14720, ' ': 1186632, 'п': 185282, 'р': 331208, 'и': 343567, 'ш': 78754, 'л': 274678, 'о': 680607, 'а': 607920, 'б': 85880, '.': 140602, 'т': 408315, 'ь': 127269, 'к': 258072, 'н': 401294, 'е': 526318, 'м': 151832, 'г': 67522, 'ч': 103169, 'я': 109476, 'ж': 45853, 'д': 165773, '\n': 8004, 'у': 145924, 'T': 559, 'h': 3328, 'e': 9084, 'c': 1975, 'o': 5361, 'l': 5247, 'r': 4438, 'f': 1083, 't': 6789, 'd': 2899, 's': 6281, 'i': 4919, 'n': 3885, 'a': 4659, 'm': 1912, 'p': 1659, 'u': 1948, 'H': 86, 'w': 932, 'g': 1191, 'I': 614, 'k': 805, 'q': 200, 'y': 1537, 'З': 7477, 'з': 123593, 'ы': 97476, 'й': 57350, 'ц': 35261, ',': 127128, '!': 41334, 'П': 17553, '"': 1910, 'А': 6181, 'х': 42547, 'Н': 19284, 'щ': 17765, 'О': 15891, 'ю': 30975, 'ф': 14673, '3': 5894, '1': 15703, 'Ч': 1417, 'К': 11130, '-': 18040, '(': 11202, 'э': 13699, 'Р': 8278, '4': 14406, '6': 8210, 'Т': 11111, 'Э': 1357, 'Б': 4471, '5': 8455, '0': 13395, '2': 11178, 'x': 2768, 'С': 11211, 'М': 8287,




Lets filter datasets to get rid of these extra characters

In [10]:
punctuation_characters = {
    ' ', '\n',
    '.', ',', '!', '?', ':', ';',
    '-', '–', '—', '…',
    '(', ')', '[', ']', '{', '}',
    '"', "'", '‘', '’', '“', '”', '«', '»',
    '/', '\\', '|',
    '+', '=', '*', '^', '<', '>', '×', '÷', '≈',
    '₽', '%', '№', '°',
    '。', '，', '！', '？', '－',
    '_', '~', '`'
} | {
    ' ',
    '.', ',', '!', '?', ':', ';',
    '-', '–', '—',
    '(', ')', '[', ']', '{', '}',
    '"', "'", '„', '”',
    '/', '\\', '|',
    '+', '=', '*', '^',
    '%', '$',
    '_', '@', '#'
}
russian_orthography = set("АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ")
polish_orthography = set("AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż")

In [11]:
unfiltered_character_sets = deepcopy(character_sets)
character_sets = [
    {k: v for k, v in unfiltered_character_sets[0].items() if (k in punctuation_characters or k in russian_orthography)},
    {k: v for k, v in unfiltered_character_sets[1].items() if (k in punctuation_characters or k in polish_orthography)},
]

In [12]:
# with open(mapping_path, 'r') as f:
#     mappings = json.load(f)

In [13]:
# real_ipa_chars = set(mappings['mappings'].keys())

In [14]:
character_supports = character_sets
character_sets = [set(sup.keys()) for sup in character_supports]

In [15]:
print(character_sets[0])
print(character_sets[1])

{'@', '~', 'О', "'", 'Ы', 'Ч', '`', 'Р', 'У', 'И', ':', '>', '*', '/', '_', 'Г', '«', '№', '!', '－', 'Ш', '}', '，', '=', 'С', 'Е', '+', 'М', 'Т', '$', '—', '.', '<', 'Й', '’', 'П', 'Ё', 'З', '₽', ',', '%', '？', ' ', '≈', 'Х', '！', '…', 'Щ', ']', ';', '?', 'Ц', 'К', 'Л', '\\', '°', '×', 'Ж', 'Ф', '#', 'Б', '|', 'Д', '”', '(', 'А', '。', ')', '^', 'Н', 'Э', 'В', '\n', '-', 'Ь', '–', '÷', '‘', '"', '»', '“', 'Ъ', 'Я', '[', 'Ю'}
{'c', '@', 'n', 'N', "'", 'j', 'R', 'Ź', 'Ó', 'Ę', 'm', 'b', ':', 'Ł', 'L', 'w', '*', '/', '_', 'l', '!', 'r', 'Ż', 'ź', 'D', 'M', 'h', 's', 'y', 'ń', 'P', 'ę', '}', 'Y', 'f', '=', '+', 'Ą', '$', 'Z', 'Ć', 'k', '—', '.', 't', 'A', 'Ń', 'F', ',', 'J', 'p', 'I', '%', 'o', 'T', 'ł', ' ', 'G', 'Ś', 'd', 'S', ']', ';', 'a', '?', 'K', 'g', 'O', '\\', 'ż', 'ć', '#', '|', '”', 'ś', '(', 'B', 'e', 'i', ')', 'E', '^', 'C', 'W', '-', 'u', '–', '„', 'U', 'ą', 'ó', 'z', '"', 'H', '{', '['}


## Disjoint phonemes

In [16]:
russian_phonemes = character_sets[0] - character_sets[1]
polish_phonemes = character_sets[1] - character_sets[0]

In [17]:
print(russian_phonemes)
print(len(russian_phonemes))
for ci, c in enumerate(sorted(russian_phonemes, key=lambda c: character_supports[0][c], reverse=True)):
    print(f'{ci+1}. & \ipa{{{c}}} & {character_supports[0][c]:,d} \\\\')

{'~', 'О', 'Д', 'Ы', '？', 'Ч', 'Ш', 'А', '。', '`', '≈', '，', 'Р', 'У', 'Х', 'С', '！', '…', 'Е', 'И', 'Н', 'Щ', 'Э', 'М', 'В', '\n', 'Ь', 'Т', '>', 'Ц', 'К', '÷', '<', 'Й', 'Л', 'Г', '°', '«', '№', '’', 'П', '×', '‘', '»', 'Ж', '“', 'Ё', 'З', 'Ф', 'Ъ', 'Я', 'Б', '₽', '－', 'Ю'}
55
1. & \ipa{Н} & 19,284 \\
2. & \ipa{П} & 17,553 \\
3. & \ipa{О} & 15,891 \\
4. & \ipa{С} & 11,211 \\
5. & \ipa{К} & 11,130 \\
6. & \ipa{Т} & 11,111 \\
7. & \ipa{В} & 10,212 \\
8. & \ipa{М} & 8,287 \\
9. & \ipa{Р} & 8,278 \\
10. & \ipa{Д} & 8,076 \\
11. & \ipa{
} & 8,004 \\
12. & \ipa{З} & 7,477 \\
13. & \ipa{А} & 6,181 \\
14. & \ipa{Е} & 5,741 \\
15. & \ipa{Б} & 4,471 \\
16. & \ipa{И} & 4,244 \\
17. & \ipa{У} & 3,079 \\
18. & \ipa{Я} & 2,756 \\
19. & \ipa{Л} & 2,608 \\
20. & \ipa{Х} & 2,519 \\
21. & \ipa{Ш} & 2,112 \\
22. & \ipa{Ц} & 1,697 \\
23. & \ipa{Г} & 1,522 \\
24. & \ipa{Ч} & 1,417 \\
25. & \ipa{Э} & 1,357 \\
26. & \ipa{Ж} & 1,286 \\
27. & \ipa{Ю} & 940 \\
28. & \ipa{Ь} & 865 \\
29. & \ipa{Ф} & 770 \\
30.

In [18]:
print(polish_phonemes)
print(len(polish_phonemes))
for ci, c in enumerate(sorted(polish_phonemes, key=lambda c: character_supports[1][c], reverse=True)):
    print(f'{ci+1}. & {c} & {character_supports[1][c]:,d} \\\\')

{'c', 'J', 'D', 'M', 'n', 'h', 'p', 'N', 's', 'I', 'y', 'ń', 'o', 'P', 'T', 'ł', 'ę', 'ś', 'B', 'e', 'j', 'R', 'i', 'Ź', 'G', 'Ś', 'Y', 'E', 'Ó', 'f', 'Ę', 'd', 'S', 'm', 'Ą', 'C', 'W', 'F', 'b', 'Ł', 'u', 'L', 'Z', '„', 'w', 'a', 'Ć', 'k', 'K', 'U', 'g', 'O', 'ą', 'ó', 'ż', 'l', 'z', 'H', 'ć', 'r', 't', 'A', 'Ń', '{', 'Ż', 'ź'}
66
1. & a & 379,008 \\
2. & e & 347,904 \\
3. & i & 310,633 \\
4. & o & 303,627 \\
5. & n & 225,317 \\
6. & z & 207,111 \\
7. & t & 167,521 \\
8. & r & 163,839 \\
9. & s & 151,341 \\
10. & w & 149,106 \\
11. & y & 144,723 \\
12. & k & 144,693 \\
13. & d & 140,266 \\
14. & c & 136,274 \\
15. & m & 119,708 \\
16. & p & 115,034 \\
17. & u & 101,905 \\
18. & l & 93,154 \\
19. & j & 91,254 \\
20. & ł & 68,754 \\
21. & b & 65,315 \\
22. & g & 48,933 \\
23. & ę & 47,559 \\
24. & ż & 38,306 \\
25. & ą & 36,188 \\
26. & h & 31,129 \\
27. & ś & 26,612 \\
28. & ć & 23,930 \\
29. & ó & 21,592 \\
30. & f & 15,770 \\
31. & P & 10,367 \\
32. & O & 8,737 \\
33. & N & 8,353 \\


## Overlapping Phonemes

In [38]:
shared_phonemes = character_sets[0] & character_sets[1]
shared_supports = {
    c: (character_supports[0][c] if c in character_supports[0] else 0) + (character_supports[1][c] if c in character_supports[1] else 0)
    for c in shared_phonemes
}

In [39]:
print(len(shared_phonemes))
for ci, c in enumerate(sorted(shared_phonemes, key=lambda c: shared_supports[c], reverse=True)):
    print(f'{ci+1}. & {c} & {shared_supports[c]:,d} \\\\')

95
1. &   & 2,021,030 \\
2. & a & 383,667 \\
3. & e & 356,988 \\
4. & i & 315,552 \\
5. & o & 308,988 \\
6. & n & 229,202 \\
7. & z & 207,384 \\
8. & . & 198,324 \\
9. & t & 174,310 \\
10. & , & 170,092 \\
11. & r & 168,277 \\
12. & s & 157,622 \\
13. & w & 150,038 \\
14. & y & 146,260 \\
15. & k & 145,498 \\
16. & d & 143,165 \\
17. & c & 138,249 \\
18. & m & 121,620 \\
19. & p & 116,693 \\
20. & u & 103,853 \\
21. & l & 98,401 \\
22. & j & 91,347 \\
23. & b & 66,175 \\
24. & ! & 52,275 \\
25. & g & 50,124 \\
26. & h & 34,457 \\
27. & - & 26,341 \\
28. & ó & 21,598 \\
29. & 0 & 20,616 \\
30. & 1 & 20,179 \\
31. & ) & 18,489 \\
32. & ( & 17,478 \\
33. & f & 16,853 \\
34. & 4 & 16,361 \\
35. & 2 & 15,238 \\
36. & " & 14,264 \\
37. & 5 & 11,644 \\
38. & P & 10,554 \\
39. & S & 9,497 \\
40. & 6 & 9,374 \\
41. & 3 & 8,919 \\
42. & O & 8,893 \\
43. & N & 8,476 \\
44. & 8 & 7,558 \\
45. & L & 7,447 \\
46. & A & 7,021 \\
47. & M & 6,630 \\
48. & D & 6,533 \\
49. & 7 & 6,477 \\
50. & W & 6,382