In [1]:
import json
import pathlib

from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Input: spreadsheet holding the feature table that is read with pandas below.
database_path = pathlib.Path('references', 'Features.xlsx')
# Output: JSON file that receives the generated `result` dictionary.
output_path = pathlib.Path('data', 'mappings.json')

# Dataset identifiers handed to `load_dataset`, in scan order.
dataset_names = [
    'iggy12345/ru-reviews-classification-ipa',
    'iggy12345/allegro-reviews-ipa',
]

# Dataset Analysis
Because a single IPA symbol can have several Unicode representations, I want to find exactly which code points actually appear in the datasets we'll be working with.

In [3]:
# Load each dataset named in `dataset_names`, preserving their order.
datasets = [load_dataset(name) for name in dataset_names]

In [4]:
# Accumulates every distinct character seen in the scanned phoneme text.
characters = set()

def collect_characters(ds, split: str) -> None:
    """Add every character found in one split of a dataset to ``characters``.

    Args:
        ds: Object indexable by split name; ``ds[split]`` must be iterable
            and yield rows that expose a ``'text-phoneme'`` entry
            (presumably a string of IPA text -- confirm against the dataset).
        split: Name of the split to scan, e.g. ``'train'``.

    Returns:
        None. The module-level ``characters`` set is updated in place.
    """
    for row in tqdm(ds[split]):
        # set.update iterates its argument element by element, which is
        # exactly the per-character insertion needed here; no index is used.
        characters.update(row['text-phoneme'])

# Scan the train and validation splits of every loaded dataset, in that order.
for dataset in datasets:
    for split_name in ('train', 'validation'):
        collect_characters(dataset, split_name)

# Show the full set of characters that were collected.
print(characters)


100%|██████████| 45000/45000 [00:01<00:00, 34685.30it/s]
100%|██████████| 15000/15000 [00:00<00:00, 35303.89it/s]
100%|██████████| 9577/9577 [00:00<00:00, 22156.07it/s]
100%|██████████| 1002/1002 [00:00<00:00, 22761.92it/s]

{'?', 'в', '️', 'ë', '2', ',', 'с', 'ˌ', 'з', 'ɪ', 'р', 'N', 'А', 'Ó', '，', 'v', '&', 'ф', '❗', 'я', 'f', 's', '+', 'Е', '∀', 'ą', '。', 'ɛ', 'y', 'c', '“', '☆', 'п', 'Р', 'g', 'á', '℅', '¡', '◡', 'M', 'ę', 'ń', 'P', '*', '%', '!', 'н', ';', '~', 'z', 'ɨ', '！', 'д', 'Ч', 'ɹ', '{', 'О', 'Х', '❣', '‼', ' ', 'ʃ', '•', 'ś', '➖', 'ы', '✔', 'ç', ':', 'ŋ', 'U', 'Ж', 'Ł', 'ɔ', '„', '？', 'м', '6', '[', 'ə', 'Ь', '➕', 'ɡ', '✌', '`', '☝', 'Ы', '☹', '❌', 'ё', 'l', 'Л', 'ч', 'ц', '@', '_', 'K', 'Ц', '–', '№', 'ɑ', 'Б', 'К', '\n', 'í', '́', 'э', 'Я', 'j', '”', '－', 'о', 'щ', 'Z', 'ヽ', 'ş', 'ʒ', '^', 'L', 'и', 'Ф', 'И', '’', ')', 'у', 'В', 'З', '8', 'q', '×', '°', 'ю', 'n', 'ä', '(', '©', 'r', '％', 'D', 'Т', 'Q', '1', 'Д', 'ż', 'х', 'ı', 'ﾉ', 'ж', '|', 'ú', 'O', 'С', '$', '❄', 'М', 'ɣ', 'T', '✨', '—', 'ö', 'G', 'F', 'б', '<', 'ω', 'к', '₽', '-', '´', 'a', '（', '‘', '≈', 'ʂ', '7', '«', '⛔', 'г', '0', 'o', 'é', 'e', '3', '）', 'x', '⭐', 'й', '>', 'C', 'H', 'S', 't', 'Y', "'", 'Н', 'm', '♡', '4', 'd', 'Й'




In [5]:
# Load the feature table; its first column (auto-named 'Unnamed: 0' by pandas)
# holds the symbol, the remaining columns hold one feature each.
df = pd.read_excel(database_path)
# Preview the first five rows as plain text.
print(df.head(n=5))

  Unnamed: 0 syllabic stress long consonantal sonorant continuant  \
0          ɒ        +      -    -           -        +          +   
1          ɑ        +      -    -           -        +          +   
2          ɶ        +      -    -           -        +          +   
3          a        +      -    -           -        +          +   
4          æ        +      -    -           -        +          +   

  delayed release approximant tap  ... anterior distributed strident lateral  \
0               0           +   -  ...        0           0        0       -   
1               0           +   -  ...        0           0        0       -   
2               0           +   -  ...        0           0        0       -   
3               0           +   -  ...        0           0        0       -   
4               0           +   -  ...        0           0        0       -   

  DORSAL high low front back tense  
0      +    -   +     -    +     0  
1      +    -   +     -    +  

In [6]:
# Feature names: every column except the leading symbol column.
df.columns[1:].tolist()

['syllabic',
 'stress',
 'long',
 'consonantal',
 'sonorant',
 'continuant',
 'delayed release',
 'approximant',
 'tap',
 'trill',
 'nasal',
 'voice',
 'spread gl',
 'constr gl',
 'LABIAL',
 'round',
 'labiodental',
 'CORONAL',
 'anterior',
 'distributed',
 'strident',
 'lateral',
 'DORSAL',
 'high',
 'low',
 'front',
 'back',
 'tense']

In [7]:
# Payload written to disk at the end of the notebook:
#   'mappings' -> symbol to list of signed feature ids
#   'features' -> feature name to numeric id
result = dict(mappings={}, features={})

In [8]:
# Number the feature columns from 1 upwards; ids start at 1 so that a
# negated id (used for '0' values later) never collides with a positive one.
for feature_id, col in enumerate(df.columns[1:], start=1):
    result['features'][col] = feature_id

In [9]:
# Symbols of the feature table that occur / do not occur as a single
# character in the scanned datasets.
found = []
not_found = []

for _, row in df.iterrows():
    # Drop the combining tie mark and surrounding whitespace: at least one
    # cell of the spreadsheet carries a trailing space, which would otherwise
    # end up inside the exported mapping key and never match dataset text.
    symbol = row['Unnamed: 0'].replace('͡', '').strip()

    # `characters` only holds single characters, so multi-character symbols
    # always land in `not_found` here.
    if symbol in characters:
        found.append(symbol)
    else:
        not_found.append(symbol)

    # Encode the row as a list of signed feature ids:
    #   '+' -> +id, '0' -> -id, anything else (e.g. '-') -> omitted.
    features = []
    for col, feature_id in result['features'].items():
        feat = str(row[col])
        if feat == '+':
            features.append(feature_id)
        elif feat == '0':
            features.append(-feature_id)

    # NOTE(review): if two rows reduce to the same symbol, the later row
    # silently replaces the earlier one -- confirm this is intended.
    result['mappings'][symbol] = features

In [10]:
# Feature-table symbols that also occur as a character in the datasets.
print(found)

['ɑ', 'a', 'ʌ', 'ɔ', 'o', 'ə', 'e', 'ɛ', 'ɵ', 'u', 'ɨ', 'y', 'i', 'ɪ', 'ɲ', 'ŋ', 'ɭ', 'r', 'n', 'm', 'l', 'q', 'ɕ', 'c', 'ç', 'ɣ', 'x', 'k', 'ɡ', 'ʑ', 'ʂ', 'ʒ', 'z', 'v', 't', 'ʃ', 's', 'p', 'f', 'd', 'b', 'w', 'j', 'ɹ', 'h']


In [11]:
# Feature-table symbols that were not matched by any dataset character.
print(not_found)

['ɒ', 'ɶ', 'æ', 'ɤ', 'ɘ', 'œ', 'ɞ', 'ø', 'ɯ', 'ʊ', 'ʉ', 'ʏ', 'ŋ+', 'ʟ', 'ɫ', 'ɴ', 'ʀ', 'ʎ', 'ŋ˗', 'ʟ', 'ʟ̠', 'ɳ', 'ʙ', 'ɺ', 'ɻ', 'ɽ', 'ɾ', 'ɱ', 'ʔ', 'ɣ+', 'x+', 'k+', 'ɡ+', 'k+x+', 'ɡ+ɣ+', 'ħ', 'ʕ', 'ʁ', 'χ', 'ɢ', 'ɉ', 'ʝ', 'dʑ', 'tɕ', 'ɣ̠ ', 'x̠', 'k̠', 'ɡ̠', 'ʈ', 'ɖ', 'ɬ', 'ʐ', 'ɸ', 'θ', 'ɮ', 'ð', 'β', 'dʒ', 'dz', 'dɮ', 'd̠ɮ̠', 'tʃ', 't̠ɬ̠', 'ts', 'tɬ', 't̪s̪', 't̪ɬ̪', 'd̪z̪', 'd̪ɮ̪', 'ʈʂ', 'ɖʐ', 'pf', 'bv', 'pɸ', 'bβ', 't̪θ', 'd̪ð', 'cç', 'ɉʝ', 'kx', 'k̠x̠', 'ɡɣ', 'ɡ̠̠ɣ̠', 'qχ', 'ɢʁ', 'ɧ', 'kp', 'gb', 'pt', 'bd', 'ɰ', 'ɰ̠', 'ɥ', 'ʋ', 'ʍ', 'ɦ']


In [12]:
# Every symbol that has a row in the feature table, matched or not.
available_symbols = set(found).union(not_found)
# Dataset characters for which the feature table has no entry at all.
not_fulfilled = [ch for ch in characters if ch not in available_symbols]
print(not_fulfilled)

['?', 'в', '️', 'ë', '2', ',', 'с', 'ˌ', 'з', 'р', 'N', 'А', 'Ó', '，', '&', 'ф', '❗', 'я', '+', 'Е', '∀', 'ą', '。', '“', '☆', 'п', 'Р', 'g', 'á', '℅', '¡', '◡', 'M', 'ę', 'ń', 'P', '*', '%', '!', 'н', ';', '~', '！', 'д', 'Ч', '{', 'О', 'Х', '❣', '‼', ' ', '•', 'ś', '➖', 'ы', '✔', ':', 'U', 'Ж', 'Ł', '„', '？', 'м', '6', '[', 'Ь', '➕', '✌', '`', '☝', 'Ы', '☹', '❌', 'ё', 'Л', 'ч', 'ц', '@', '_', 'K', 'Ц', '–', '№', 'Б', 'К', '\n', 'í', '́', 'э', 'Я', '”', '－', 'о', 'щ', 'Z', 'ヽ', 'ş', '^', 'L', 'и', 'Ф', 'И', '’', ')', 'у', 'В', 'З', '8', '×', '°', 'ю', 'ä', '(', '©', '％', 'D', 'Т', 'Q', '1', 'Д', 'ż', 'х', 'ı', 'ﾉ', 'ж', '|', 'ú', 'O', 'С', '$', '❄', 'М', 'T', '✨', '—', 'ö', 'G', 'F', 'б', '<', 'ω', 'к', '₽', '-', '´', '（', '‘', '≈', '7', '«', '⛔', 'г', '0', 'é', '3', '）', '⭐', 'й', '>', 'C', 'H', 'S', 'Y', "'", 'Н', '♡', '4', 'Й', 'J', '#', '❤', 'ß', 'Ç', 'ш', '"', '5', '☺', 'ü', '9', 'ó', '…', '★', 'B', 'П', '̃', 'ć', 'Ю', 'т', 'У', 'л', '»', 'W', ']', '/', 'ˈ', 'V', 'R', '.', 'ъ', 'ь'

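We'll still need some extra parsing to handle multi-character symbols (affricates such as `tʃ` and `dʒ`, as well as diphthongs), because the check above only matches single characters. Apart from that, the parsing looks correct, and the Unicode code points used in the feature table match the ones used in the datasets themselves.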

In [13]:
# Create the target directory if needed so the export also works on a fresh
# checkout where it does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False writes the IPA symbols verbatim, so the file has to be
# opened as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
# Windows) cannot represent them. Plain 'w' is enough; nothing is read back.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=4, ensure_ascii=False)