In [1]:
import json
import os
import time

from datasets import DatasetDict
from datasets import load_dataset, concatenate_datasets

from config import load_config
from util.parsing import get_features
from util.windowing import make_random_window

  from .autonotebook import tqdm as notebook_tqdm


## Get and Load Config and Datasets

In [2]:
# Load the run configuration. No extra config sources are passed here
# (NOTE(review): presumably load_config then falls back to defaults -- confirm
# against config.load_config).
configs = []
cfg = load_config(configs)

# The mapping file is keyed by IPA symbols (non-ASCII), so decode it explicitly
# as UTF-8 rather than relying on the platform's locale encoding, which is not
# UTF-8 by default on Windows.
with open(cfg['mappings'], 'r', encoding='utf-8') as fp:
    phoneme_mappings = json.load(fp)

print(phoneme_mappings)

{'mappings': {'ɒ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 25, 27, -28], 'ɑ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 25, 27, -28], 'ɶ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 25, 26, -28], 'a': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 25, -28], 'æ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 25, 26, -28], 'ʌ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 27], 'ɔ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 27], 'o': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 27, 28], 'ɤ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 27, 28], 'ɘ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 28], 'œ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 26], 'ə': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23], 'e': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 26, 28], 'ɞ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23], 'ø': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 26, 28], 'ɛ': [1, 5, 6, -7, 8, 12, -19, -20, -21, 23, 26], 'ɵ': [1, 5, 6, -7, 8, 12, 15, 16, -19, -20, -21, 23, 28], 'ɯ': [1, 5, 6, -7, 8, 1

In [3]:
# Load both source corpora from the dataset identifiers stored in the config.
# `rd` is the Russian dataset and `pd` the Polish one (note: `pd` is NOT pandas
# in this notebook).
rd, pd = (
    load_dataset(cfg['datasets'][language])
    for language in ('russian', 'polish')
)

## Generate Windows

In [None]:
def preprocess(example):
    """Sample one random window from an example and locate it in both columns.

    A character window is drawn over the phonemic transcription
    (``example['text-phoneme']``) by ``make_random_window``.  The words touched
    by that window are then projected onto the orthographic sentence
    (``example['text']``) by counting spaces, which relies on both columns
    containing the same number of space-separated words.

    Design notes (carried over from the original task description):
        * Because the phonemizer is word-level, spaces can be used as window
          boundaries, which lets a window be expressed for both the phonetic
          and the orthographic column.
        * Each example needs: (1) the window start and stop characters,
          (2) the phonological features present in the span, and (3) a window
          for both the orthographic and the phonetic column.  Sharing a single
          window would require word-based instead of character-based offsets.
        * For dict-valued dataset columns see the `answers` and
          `answers-phoneme` columns of
          https://huggingface.co/datasets/iggy12345/sberquad-ipa

    Args:
        example: Mapping with string fields ``'text'`` and ``'text-phoneme'``.

    Returns:
        A dict with three keys:
            * ``'windows'``: ``{'start': [int], 'end': [int]}`` -- inclusive
              character offsets of the window's words in ``example['text']``.
            * ``'windows-phoneme'``: ``{'start': [int], 'end': [int]}`` -- the
              inclusive character offsets returned by ``make_random_window``.
            * ``'features'``: one-element list holding the ``get_features``
              result for the phoneme window.
        All lists are empty when the sentence is too short to be sampled.
    """
    sentence = example['text']
    phoneme_sentence = example['text-phoneme']

    window_settings = cfg['windows']
    min_window_length = window_settings['min_size']
    max_window_length = window_settings['max_size']
    window_length_decay = window_settings['size_decay']

    # Skip too short sentences
    if min(max_window_length, len(phoneme_sentence)) <= min_window_length:
        return {
            'windows': {'start': [], 'end': []},
            'windows-phoneme': {'start': [], 'end': []},
            'features': []
        }

    mappings = phoneme_mappings["mappings"]
    start_phoneme_char, end_phoneme_char = make_random_window(
        phoneme_sentence,
        max_window_length,
        min_window_length,
        window_length_decay,
        set(mappings.keys())
    )

    # The end offset is inclusive, hence the +1 when slicing.
    phoneme_sentence_window = phoneme_sentence[start_phoneme_char:end_phoneme_char + 1]
    feature_results = get_features(phoneme_sentence_window, mappings)

    # A space on the edge of the window belongs to no word.  Counting it as a
    # word separator would pull the neighbouring word into the orthographic
    # span, so the word indices are taken from the first and last non-space
    # characters of the window instead.
    window_length = len(phoneme_sentence_window)
    leading_spaces = window_length - len(phoneme_sentence_window.lstrip(' '))
    trailing_spaces = window_length - len(phoneme_sentence_window.rstrip(' '))
    if leading_spaces == window_length:
        # Empty or all-space window: nothing to trim against, use raw bounds.
        leading_spaces = trailing_spaces = 0

    first_char = start_phoneme_char + leading_spaces
    last_char = start_phoneme_char + window_length - 1 - trailing_spaces

    # The index of a word equals the number of spaces that precede it.
    start_word_idx = phoneme_sentence[:first_char].count(' ')
    end_word_idx = phoneme_sentence[:last_char + 1].count(' ')

    words = sentence.split(' ')
    window_words = words[start_word_idx:end_word_idx + 1]

    # Offset of a word = lengths of all previous words + one space per word.
    start_character_position = sum(len(w) for w in words[:start_word_idx]) + start_word_idx
    # Inclusive end = start + covered word lengths + inner spaces - 1.
    end_character_position = (
        start_character_position
        + sum(len(w) for w in window_words)
        + (end_word_idx - start_word_idx)
        - 1
    )

    return {
        'windows': {
            'start': [start_character_position],
            'end': [end_character_position]
        },
        'windows-phoneme': {
            'start': [start_phoneme_char],
            'end': [end_phoneme_char]
        },
        'features': [feature_results]
    }

In [5]:
keep_columns = ['text', 'text-phoneme']

# Default Hugging Face datasets cache below the current user's home directory,
# so the notebook is not tied to one machine's absolute path.
cache_dir = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface', 'datasets')
os.makedirs(cache_dir, exist_ok=True)


def _map_with_fresh_cache(dataset_dict, tag):
    """Run ``preprocess`` over every split of ``dataset_dict``.

    Args:
        dataset_dict: Dataset dictionary with 'train', 'validation' and 'test'
            splits; columns other than ``keep_columns`` are dropped.
        tag: Short label embedded in the cache file names (e.g. ``'rd'``).

    Returns:
        The mapped dataset dictionary, with the columns produced by
        ``preprocess`` added.
    """
    # A timestamp in the file name gives every run its own cache files, and
    # load_from_cache_file=False forces the map to be recomputed.
    timestamp = int(time.time())
    split_suffixes = {'train': 'train', 'validation': 'val', 'test': 'test'}
    return dataset_dict.map(
        preprocess,
        remove_columns=[c for c in dataset_dict['train'].column_names if c not in keep_columns],
        cache_file_names={
            split: os.path.join(cache_dir, f'cache_{tag}_{suffix}_{timestamp}.arrow')
            for split, suffix in split_suffixes.items()
        },
        load_from_cache_file=False
    )


rd_sampled = _map_with_fresh_cache(rd, 'rd')
pd_sampled = _map_with_fresh_cache(pd, 'pd')

Map: 100%|██████████| 45000/45000 [00:04<00:00, 11024.42 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 11424.17 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 11892.90 examples/s]
Map: 100%|██████████| 9577/9577 [00:00<00:00, 10705.10 examples/s]
Map: 100%|██████████| 1006/1006 [00:00<00:00, 9840.78 examples/s]
Map: 100%|██████████| 1002/1002 [00:00<00:00, 9717.68 examples/s]


## Concatenate Datasets

In [6]:
# Merge the Russian and Polish samples split by split.  The splits are taken
# from the Russian dataset, so the Polish dataset must provide the same ones.
# Building the DatasetDict in one expression keeps `combined_ds` a single type.
combined_ds = DatasetDict({
    split: concatenate_datasets([rd_sampled[split], pd_sampled[split]])
    for split in rd_sampled.keys()
})


### Check dataset

In [7]:
# Spot-check the first ten training rows: for every row that received a
# window, re-slice the phoneme text with the stored (inclusive) offsets and
# show it next to the stored features.
print("Checking first 10 examples:")
for i in range(10):
    ex = combined_ds['train'][i]
    if not ex['windows']['start']:
        # Row was skipped during preprocessing, so there is nothing to show.
        continue
    phoneme_span = ex['windows-phoneme']
    first, last = phoneme_span['start'][0], phoneme_span['end'][0]
    phon_window = ex['text-phoneme'][first:last + 1]
    print(f"Example {i}: phoneme_window='{phon_window}' -> features={ex['features'][0]}")

Checking first 10 examples:
Example 0: phoneme_window='otʃ' -> features=[1, 4, 5, 6, 7, 8, 12, 15, 16, 18, 20, 21, 23, 27, 28, -26, -25, -24, -19]
Example 1: phoneme_window='color of' -> features=[1, 4, 5, 6, 7, 8, 10, 12, 15, 16, 17, 18, 19, 20, 22, 23, 24, 26, 27, 28, -25, -21]
Example 2: phoneme_window='tsvʲˈet' -> features=[1, 4, 5, 6, 7, 8, 12, 15, 17, 18, 19, 21, 23, 26, 28, -27, -25, -24, -20]
Example 3: phoneme_window='nˈɑ' -> features=[1, 4, -28, 5, -26, -24, 6, 8, 11, 12, -21, -20, 18, 19, 23, -7, 27, 25]
Example 4: phoneme_window='erʲis' -> features=[1, 4, -27, 6, -25, 5, 8, 10, 7, 12, -20, 18, 19, 21, 23, 24, 26, 28]
Example 5: phoneme_window='iˈɑɭnyjɪ' -> features=[1, 4, 5, 6, 8, 11, 12, 15, 16, 18, 19, 22, 23, 24, 25, 26, 27, 28, -21, -20, -7]
Example 6: phoneme_window='ɑpka pʌɭʌv' -> features=[1, 4, 5, 6, 7, 8, 12, 15, 17, 18, 22, 23, 24, 25, 27, -28, -26, -21, -20, -19]
Example 7: phoneme_window='ɑmʌm d' -> features=[1, 4, 5, 6, 8, 11, 12, 15, 18, 19, 23, 25, 27, -28, -

## Push to Huggingface

In [8]:
# TODO: if you get here, you're done! Pushing the dataset to the cloud will be
# handled by the repository owner.

# Hub account and dataset name that together form the target repository id.
username = 'iggy12345'
target_dataset_name = 'rus-pol-edge-probing-phono-feats'

In [9]:
# Publish all splits to "<username>/<target_dataset_name>" on the Hub.
# The Hub rejects this call with 401 Unauthorized when the session is not
# authenticated with write access to that account.
combined_ds.push_to_hub('/'.join((username, target_dataset_name)))

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-691159ff-1cafa44377cd59732e6c56c3;8b5741d7-2f33-43e0-8fd5-a6447a2635bd)

Invalid username or password.