In [11]:
import json
import pathlib

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

In [12]:
# JSON mapping file, resolved relative to the current working directory.
mapping_path = pathlib.Path('data', 'mappings.json')

# Dataset identifiers handed to `load_dataset`. Order matters: later cells
# treat index 0 as the Russian corpus and index 1 as the Polish corpus.
dataset_names = [
    'iggy12345/ru-reviews-classification-ipa',
    'iggy12345/allegro-reviews-ipa',
]

# Inventory Analysis
Now I want to see which phonemes occur in each language and which are shared between the two.

In [13]:
# Load every dataset named in `dataset_names`, preserving their order.
datasets = [load_dataset(name) for name in dataset_names]

In [14]:
def collect_characters(ds, split: str, current_set: dict, column: str = 'text') -> None:
    """Tally how often each character occurs in one split of a dataset.

    Args:
        ds: Mapping from split name to an iterable of rows; each row must
            support ``row[column]`` and yield a string.
        split: Key of the split to scan (e.g. ``'train'``).
        current_set: Character -> count dict. It is updated in place so that
            several splits can accumulate into the same tally.
        column: Row field whose characters are counted. Defaults to
            ``'text'``, the field this function has always read.

    Returns:
        None. The result is the in-place mutation of ``current_set``.
    """
    for row in tqdm(ds[split]):
        for c in row[column]:
            # dict.get keeps first-seen insertion order and avoids a separate
            # membership test for characters not counted yet.
            current_set[c] = current_set.get(c, 0) + 1

# One character -> count dict per loaded dataset. Sizing the list from
# `datasets` (instead of a hard-coded pair of dicts) keeps this cell working
# when another dataset name is added to the configuration.
character_sets = [{} for _ in datasets]
for di, dataset in enumerate(datasets):
    # Both splits accumulate into the same per-dataset tally.
    for split in ('train', 'validation'):
        collect_characters(dataset, split, character_sets[di])

for counts in character_sets:
    print(counts)


100%|██████████| 45000/45000 [00:01<00:00, 30538.93it/s]
100%|██████████| 15000/15000 [00:00<00:00, 32560.70it/s]
100%|██████████| 9577/9577 [00:00<00:00, 20904.90it/s]
100%|██████████| 1002/1002 [00:00<00:00, 20824.07it/s]

{'в': 249958, 'с': 277455, 'ё': 14720, ' ': 1186632, 'п': 185282, 'р': 331208, 'и': 343567, 'ш': 78754, 'л': 274678, 'о': 680607, 'а': 607920, 'б': 85880, '.': 140602, 'т': 408315, 'ь': 127269, 'к': 258072, 'н': 401294, 'е': 526318, 'м': 151832, 'г': 67522, 'ч': 103169, 'я': 109476, 'ж': 45853, 'д': 165773, '\n': 8004, 'у': 145924, 'T': 559, 'h': 3328, 'e': 9084, 'c': 1975, 'o': 5361, 'l': 5247, 'r': 4438, 'f': 1083, 't': 6789, 'd': 2899, 's': 6281, 'i': 4919, 'n': 3885, 'a': 4659, 'm': 1912, 'p': 1659, 'u': 1948, 'H': 86, 'w': 932, 'g': 1191, 'I': 614, 'k': 805, 'q': 200, 'y': 1537, 'З': 7477, 'з': 123593, 'ы': 97476, 'й': 57350, 'ц': 35261, ',': 127128, '!': 41334, 'П': 17553, '"': 1910, 'А': 6181, 'х': 42547, 'Н': 19284, 'щ': 17765, 'О': 15891, 'ю': 30975, 'ф': 14673, '3': 5894, '1': 15703, 'Ч': 1417, 'К': 11130, '-': 18040, '(': 11202, 'э': 13699, 'Р': 8278, '4': 14406, '6': 8210, 'Т': 11111, 'Э': 1357, 'Б': 4471, '5': 8455, '0': 13395, '2': 11178, 'x': 2768, 'С': 11211, 'М': 8287,




Let's filter the datasets to get rid of these extra characters.

In [15]:
# Disabled cell: loading the mapping file is commented out, so `mappings`
# is never defined when this notebook runs.
# with open(mapping_path, 'r') as f:
#     mappings = json.load(f)

In [16]:
# Disabled cell: would collect the keys of mappings['mappings'] into a set;
# it is commented out, so `real_ipa_chars` is never defined.
# real_ipa_chars = set(mappings['mappings'].keys())

In [17]:
# Keep the per-character counts under `character_supports` and reduce
# `character_sets` to plain sets for the set algebra in the following cells.
# The isinstance guard makes this cell safe to re-run: on a second run
# `character_sets` already holds sets, and rebinding `character_supports`
# to them would discard the counts and then fail on `.keys()`.
if all(isinstance(chars, dict) for chars in character_sets):
    character_supports = character_sets
character_sets = [set(support) for support in character_supports]

In [18]:
# Show the character inventory of the first and then the second dataset.
for dataset_index in (0, 1):
    print(character_sets[dataset_index])

{'➕', 'D', 'і', 'є', '★', '3', '(', 'Ç', '÷', 'ф', 'M', '｀', 'é', 't', 'ё', 'b', 'n', 'ş', '➖', 'з', '❣', 'I', '^', 'g', 'П', 'х', '\\', 'V', '4', '☆', 'ц', 'í', 'ы', 'R', 'ω', '‼', 'f', 'ﾉ', 'Ё', '9', 'w', 'Л', 'z', 'п', '⛔', '}', '?', '!', '№', '✩', 'у', 'Д', 'й', 'Ы', 'Н', '0', '×', '✔', 's', 'а', '>', '（', '«', 'Q', '₽', '/', 'l', ' ', '-', 'Я', 'C', '+', 'j', '％', 'E', 'ъ', 'A', 'ú', '℅', 'я', 'Ъ', 'ı', 'м', '́', 'җ', '，', ']', 'K', 'Y', '“', 'У', '•', 'с', 'q', 'З', 'щ', 'S', 'ж', "'", '）', 'Z', 'ß', 'С', '—', '*', 'о', 'л', '`', 'a', 'ö', 'r', '≈', 'Э', 'ñ', 'Г', '_', 'F', '❗', '∀', 'К', 'O', 'i', 'р', 'Е', '|', '\n', 'x', 'ў', 'B', 'Ю', '⚘', 'Ш', '&', '1', 'н', 'в', 'И', '…', 'e', '❤', 'б', 'Ч', ';', 'ó', 'ü', '⭐', '◡', '=', 'u', 'p', '？', 'm', 'ч', '<', 'Ж', 'ю', 'Т', '。', ':', 'Й', 'P', 'k', 'T', 'Р', 'á', 'c', '–', '☺', 'Ь', 'N', '♥', 'э', '%', '@', '[', '’', 'ヽ', 'ä', '°', 'h', 'W', 'и', 'H', 'J', '6', 'v', '♡', 'М', 'o', 'е', 'X', 'L', ',', 'т', 'А', '8', '$', '”', 'y', '️

## Disjoint Phonemes

In [19]:
# Characters seen in only one of the two corpora (index 0 vs. index 1).
russian_phonemes = character_sets[0].difference(character_sets[1])
polish_phonemes = character_sets[1].difference(character_sets[0])

In [40]:
print(russian_phonemes)
print(len(russian_phonemes))
# Emit one LaTeX table row per Russian-only character, most frequent first.
# The macro's backslash is written as '\\ipa': a bare '\i' in a non-raw string
# is an invalid escape sequence (SyntaxWarning on Python 3.12+). The printed
# text is unchanged.
for ci, c in enumerate(sorted(russian_phonemes, key=lambda c: character_supports[0][c], reverse=True)):
    print(f'{ci+1}. & \\ipa{{{c}}} & {character_supports[0][c]:,d} \\\\')

{'➕', 'і', 'є', '★', 'Ç', '÷', 'ф', '｀', 'é', 'ё', 'ş', '➖', 'з', '❣', 'П', 'х', '☆', 'ц', 'í', 'ы', 'ω', '‼', 'ﾉ', 'Ё', '⛔', 'Л', 'п', '№', '✩', 'у', 'Д', 'й', 'Ы', 'Н', '×', '✔', 'а', '>', '（', '«', '₽', 'Я', '％', 'ъ', 'ú', '℅', 'я', 'Ъ', 'ı', 'м', '́', 'җ', '，', '“', 'У', '•', 'с', 'З', 'щ', 'ж', '）', 'ß', 'С', 'о', 'л', '`', 'ö', '≈', 'Э', 'ñ', 'Г', '❗', '∀', 'К', 'р', 'Е', '\n', 'ў', 'Ю', '⚘', 'Ш', 'н', 'в', 'И', '…', '❤', 'б', 'Ч', 'ü', '⭐', '◡', '？', 'ч', '<', 'Ж', 'ю', 'Т', '。', 'Й', 'Р', 'á', '☺', 'Ь', '♥', 'э', 'ヽ', '’', '°', 'и', '♡', 'М', 'е', 'т', 'А', 'ë', '️', 'Ф', '´', 'ь', '~', 'Б', '☹', 'Щ', 'О', 'д', '✌', '‘', 'Х', '❌', '»', '¡', '－', 'к', 'В', 'Ц', '❄', 'ç', 'ш', 'г', '©', '☝', '！', '✨'}
143
1. & \ipa{о} & 680,607 \\
2. & \ipa{а} & 607,920 \\
3. & \ipa{е} & 526,318 \\
4. & \ipa{т} & 408,315 \\
5. & \ipa{н} & 401,294 \\
6. & \ipa{и} & 343,567 \\
7. & \ipa{р} & 331,208 \\
8. & \ipa{с} & 277,455 \\
9. & \ipa{л} & 274,678 \\
10. & \ipa{к} & 258,072 \\
11. & \ipa{в} & 24

In [37]:
print(polish_phonemes)
print(len(polish_phonemes))
# One LaTeX table row per Polish-only character, ordered by descending count.
polish_ranked = sorted(polish_phonemes, key=lambda ch: character_supports[1][ch], reverse=True)
for rank, ch in enumerate(polish_ranked, start=1):
    print(f'{rank}. & {ch} & {character_supports[1][ch]:,d} \\\\')

{'ń', 'ź', 'ś', 'Ż', 'Ó', 'ą', 'Ą', 'ż', 'Ę', 'Ź', 'Ć', '„', 'ł', 'ć', '{', 'ę', 'Ń', 'Ś', 'Ł'}
19
1. & ł & 68,754 \\
2. & ę & 47,559 \\
3. & ż & 38,306 \\
4. & ą & 36,188 \\
5. & ś & 26,612 \\
6. & ć & 23,930 \\
7. & ń & 3,640 \\
8. & ź & 2,643 \\
9. & Ł & 822 \\
10. & Ś & 560 \\
11. & Ż & 377 \\
12. & Ę & 316 \\
13. & Ą & 198 \\
14. & Ó & 127 \\
15. & Ć & 111 \\
16. & „ & 36 \\
17. & Ź & 28 \\
18. & Ń & 24 \\
19. & { & 4 \\


## Overlapping Phonemes

In [38]:
# Characters present in both corpora, with their counts summed across the two.
shared_phonemes = character_sets[0].intersection(character_sets[1])
shared_supports = {
    ch: character_supports[0].get(ch, 0) + character_supports[1].get(ch, 0)
    for ch in shared_phonemes
}

In [39]:
print(len(shared_phonemes))
# One LaTeX table row per shared character, ordered by descending combined count.
shared_ranked = sorted(shared_phonemes, key=lambda ch: shared_supports[ch], reverse=True)
for rank, ch in enumerate(shared_ranked, start=1):
    print(f'{rank}. & {ch} & {shared_supports[ch]:,d} \\\\')

95
1. &   & 2,021,030 \\
2. & a & 383,667 \\
3. & e & 356,988 \\
4. & i & 315,552 \\
5. & o & 308,988 \\
6. & n & 229,202 \\
7. & z & 207,384 \\
8. & . & 198,324 \\
9. & t & 174,310 \\
10. & , & 170,092 \\
11. & r & 168,277 \\
12. & s & 157,622 \\
13. & w & 150,038 \\
14. & y & 146,260 \\
15. & k & 145,498 \\
16. & d & 143,165 \\
17. & c & 138,249 \\
18. & m & 121,620 \\
19. & p & 116,693 \\
20. & u & 103,853 \\
21. & l & 98,401 \\
22. & j & 91,347 \\
23. & b & 66,175 \\
24. & ! & 52,275 \\
25. & g & 50,124 \\
26. & h & 34,457 \\
27. & - & 26,341 \\
28. & ó & 21,598 \\
29. & 0 & 20,616 \\
30. & 1 & 20,179 \\
31. & ) & 18,489 \\
32. & ( & 17,478 \\
33. & f & 16,853 \\
34. & 4 & 16,361 \\
35. & 2 & 15,238 \\
36. & " & 14,264 \\
37. & 5 & 11,644 \\
38. & P & 10,554 \\
39. & S & 9,497 \\
40. & 6 & 9,374 \\
41. & 3 & 8,919 \\
42. & O & 8,893 \\
43. & N & 8,476 \\
44. & 8 & 7,558 \\
45. & L & 7,447 \\
46. & A & 7,021 \\
47. & M & 6,630 \\
48. & D & 6,533 \\
49. & 7 & 6,477 \\
50. & W & 6,382