# Data Prep

Load just the teachings (`STT_TT`) data for the three major dialects, drop rows with empty transcriptions, and split the result into train/dev/test sets.

## Load Data

In [1]:
from datasets import load_dataset

# Pull the 'train' split of the STT training corpus from the Hugging Face Hub.
repo_id = 'openpecha/stt-training-data'
ds = load_dataset(repo_id, split='train')

# Display the dataset summary (column names and row count).
ds

Dataset({
    features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
    num_rows: 1362015
})

In [2]:
# Show the distinct values of the `dept` column.
# `Dataset.unique` deduplicates in the Arrow backend, so the full column is
# never turned into a Python list; wrapping the result in `set` keeps the
# displayed output the same as before.
set(ds.unique('dept'))

{'STT_AB',
 'STT_CS',
 'STT_HS',
 'STT_MV',
 'STT_NS',
 'STT_NW',
 'STT_PC',
 'STT_TT'}

## Filter for Only Teachings Data With Non-Empty Transcriptions

In [3]:
def is_teaching_with_text(dept, uni):
    """Return True for rows to keep in the teachings subset.

    Args:
        dept: Value of the row's `dept` column.
        uni: Value of the row's `uni` (transcription) column; a missing
            value (None) is treated the same as an empty transcription.

    Returns:
        True when `dept` is "STT_TT" and `uni` contains at least one
        non-whitespace character, otherwise False.
    """
    return dept == "STT_TT" and bool(uni) and uni.strip() != ''


# `input_columns` hands only these two columns to the predicate, so the
# remaining columns are not decoded for every one of the ~1.36M rows.
ds_tt = ds.filter(is_teaching_with_text, input_columns=["dept", "uni"])

Filter:   0%|          | 0/1362015 [00:00<?, ? examples/s]

## Generate Splits

In [9]:
from datasets import DatasetDict

# Carve the filtered data into contiguous train / dev / test slices, in that
# order and without shuffling: roughly the last 10% of rows become the test
# set, the 5% before them the dev set, and everything earlier is training data.
total_len = len(ds_tt)
test_size = int(total_len * 0.10)
dev_size = int(total_len * 0.05)
train_size = total_len - (dev_size + test_size)

# Row indices at which the dev and test slices begin.
dev_start = train_size
test_start = dev_start + dev_size

train = ds_tt.select(range(dev_start))
dev = ds_tt.select(range(dev_start, test_start))
test = ds_tt.select(range(test_start, total_len))

ds_dict = DatasetDict({'train': train, 'dev': dev, 'test': test})

ds_dict


DatasetDict({
    train: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 379522
    })
    dev: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 22324
    })
    test: Dataset({
        features: ['file_name', 'uni', 'wylie', 'url', 'dept', 'grade', 'char_len', 'audio_len'],
        num_rows: 44649
    })
})

## Save Data

In [14]:
# Write all three splits to a local directory (relative to the working dir).
output_dir = 'tt-asr-ds'
ds_dict.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/379522 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/22324 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/44649 [00:00<?, ? examples/s]