# Data Prep

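Load only the audiobook (`AB`) data for the three major Tibetan dialects (Ü-Tsang, Amdo and Kham), then build a train/test split per dialect and a mixed training set drawn from all three.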

## Load Data

In [1]:
from datasets import load_dataset

# Pull the `train` split of the openpecha STT corpus from the Hugging Face Hub.
utsang = load_dataset(
    path='openpecha/stt-training-data',
    split='train',
)

# Bare expression so the notebook renders the dataset summary (columns, row count).
utsang

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
    num_rows: 1362015
})

In [2]:
# Second corpus; its `dept` values are inspected in a later cell.
# NOTE(review): this dataset carries an extra `__index_level_0__` column compared
# with `utsang` -- presumably a leftover pandas index; confirm before relying on it.
others = load_dataset(
    path='ganga4364/stt_tibetan_dialects_data',
    split='train',
)

# Bare expression so the notebook renders the dataset summary (columns, row count).
others

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
    num_rows: 112870
})

In [3]:
# List the distinct `dept` codes. `Dataset.unique` deduplicates inside the Arrow
# backend, so the full 1.36M-row column is never copied into a Python list first.
# Wrapping in `set` keeps the same set display as before.
set(utsang.unique('dept'))

{'STT_AB',
 'STT_CS',
 'STT_HS',
 'STT_MV',
 'STT_NS',
 'STT_NW',
 'STT_PC',
 'STT_TT'}

In [4]:
# List the distinct `dept` codes. `Dataset.unique` deduplicates inside the Arrow
# backend instead of copying the whole column into a Python list first.
# Wrapping in `set` keeps the same set display as before.
set(others.unique('dept'))

{'STT_AM', 'STT_AM_AB', 'STT_KH', 'STT_KH_AB'}

## Filter for Only Audio Book Data

In [5]:
# Keep only the rows tagged as audiobook data in `utsang`.
# `input_columns` passes just the `dept` value to the predicate, so the other
# columns are not materialised for each of the ~1.36M rows being scanned.
utsang_ab = utsang.filter(lambda dept: dept == "STT_AB", input_columns=["dept"])

In [6]:
# Keep only the rows of `others` whose `dept` is the `STT_AM_AB` code.
# `input_columns` passes just the `dept` value to the predicate, so the other
# columns are not materialised for every row being scanned.
amdo_ab = others.filter(lambda dept: dept == "STT_AM_AB", input_columns=["dept"])

In [7]:
# Keep only the rows of `others` whose `dept` is the `STT_KH_AB` code.
# `input_columns` passes just the `dept` value to the predicate, so the other
# columns are not materialised for every row being scanned.
kham_ab = others.filter(lambda dept: dept == "STT_KH_AB", input_columns=["dept"])

## Generate Splits

In [8]:
SPLIT_SEED = 42        # drives both the subsample shuffle and the train/test split
SUBSET_SIZE = 15_000   # rows kept per dialect before splitting
TEST_FRACTION = 0.15   # share of each subset held out as the test split


def make_dialect_split(ds):
    """Return a reproducible train/test `DatasetDict` for one dialect.

    The dataset is shuffled with a fixed seed, truncated to its first
    `SUBSET_SIZE` rows, and then split into train/test with `TEST_FRACTION`
    held out for test.

    Both the shuffle and the split are seeded. Previously only the shuffle was,
    so the train/test membership changed on every run of the notebook.
    """
    subset = ds.shuffle(seed=SPLIT_SEED).select(range(SUBSET_SIZE))
    return subset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)


utsang_subset = make_dialect_split(utsang_ab)
amdo_subset = make_dialect_split(amdo_ab)
kham_subset = make_dialect_split(kham_ab)

## Create Mixed Data

In [9]:
from datasets import concatenate_datasets

MIXED_SEED = 42         # fixed so the mixed set is identical on every run
MIXED_PER_DIALECT = 5000  # training rows drawn from each dialect

# Draw the same number of rows from each dialect's *train* split only, so no
# test example can end up in the mixed training set. The shuffles are seeded;
# unseeded `.shuffle()` picked a different 5000 rows on every execution.
mixed = concatenate_datasets([
    dialect_split['train'].shuffle(seed=MIXED_SEED).select(range(MIXED_PER_DIALECT))
    for dialect_split in (utsang_subset, amdo_subset, kham_subset)
])

# Bare expression so the notebook renders the dataset summary (columns, row count).
mixed

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len', '__index_level_0__'],
    num_rows: 15000
})

## Save Data

In [10]:
# Write each dataset to its own directory, relative to the current working
# directory, in the same order as before: the three per-dialect splits, then
# the mixed training set.
for output_dir, dataset_to_save in (
    ('utsang-ds', utsang_subset),
    ('amdo-ds', amdo_subset),
    ('kham-ds', kham_subset),
    ('mixed-training-ds', mixed),
):
    dataset_to_save.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/12750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/15000 [00:00<?, ? examples/s]