**Bu notebookta, Hugging Face'te bulunan bir Türkçe şiir veri setini indirip tokenize ederek train.py için hazır hale getiriyoruz.**

In [1]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset 

In [2]:
# Look up the tokenizer that tiktoken associates with the "gpt-4o" model name.
# `enc` is the encoder used by process() to turn poem text into token ids.
enc = tiktoken.encoding_for_model("gpt-4o")

In [3]:
# Load the 'beratcmn/turkish-poems-cleaned' dataset through datasets.load_dataset.
dataset = load_dataset('beratcmn/turkish-poems-cleaned')

In [4]:
# Metadata columns that are dropped so only the 'poem' text remains.
unwanted_columns = ['title', 'rating', 'id', 'poet']

# Hold out 25% of the 'train' split (fixed seed for reproducibility) and
# strip the metadata columns from both resulting splits in one chain.
split_dataset = (
    dataset['train']
    .train_test_split(test_size=0.25, seed=1337)
    .remove_columns(unwanted_columns)
)

# Rename the held-out 'test' split to 'val'.
split_dataset['val'] = split_dataset.pop('test')
split_dataset

DatasetDict({
    train: Dataset({
        features: ['poem'],
        num_rows: 3720
    })
    val: Dataset({
        features: ['poem'],
        num_rows: 1241
    })
})

In [5]:
def process(example):
    """Tokenize a single dataset example.

    Encodes ``example['poem']`` with ``enc.encode_ordinary`` and appends
    ``enc.eot_token`` as the final token.

    Args:
        example: Mapping that contains the poem text under the key 'poem'.

    Returns:
        A dict with 'ids' (the token ids including the trailing
        end-of-text token) and 'len' (the number of ids).
    """
    token_ids = [*enc.encode_ordinary(example['poem']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}

In [6]:
# Run process() over every example of every split; the raw 'poem' column is
# removed so only the fields returned by process() are kept.
tokenized = split_dataset.map(process, remove_columns=['poem'], desc="tokenizing")

In [7]:
# Sanity check: show the token ids of the first example in the train split.
tokenized['train'][0]['ids']

[9146,
 77,
 400,
 5872,
 2484,
 90069,
 445,
 22215,
 198,
 9146,
 2484,
 90069,
 445,
 22215,
 57453,
 132929,
 412,
 13518,
 2144,
 67617,
 89,
 142256,
 270,
 3163,
 84000,
 198,
 64220,
 2730,
 10847,
 51931,
 572,
 1916,
 817,
 1916,
 159385,
 279,
 44,
 1220,
 4356,
 72,
 293,
 25173,
 448,
 13140,
 10709,
 198,
 2223,
 13140,
 10709,
 15587,
 11,
 13739,
 13739,
 1305,
 190439,
 61070,
 198,
 10162,
 573,
 1431,
 3742,
 859,
 7666,
 612,
 809,
 188523,
 10709,
 412,
 33,
 3966,
 10709,
 3314,
 28182,
 36855,
 177027,
 143325,
 158563,
 75,
 1842,
 279,
 41750,
 2717,
 11744,
 5781,
 24216,
 572,
 412,
 44,
 407,
 3403,
 4091,
 178281,
 28690,
 1678,
 4692,
 198,
 4599,
 149296,
 3087,
 6782,
 68534,
 27339,
 28314,
 2832,
 198,
 42,
 22245,
 61994,
 514,
 199999]

In [8]:
# Write each tokenized split to '<split>.bin' as one flat stream of token ids.
for split, dset in tokenized.items():
    # Total number of tokens in the split = size of the output array.
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    filename = f'{split}.bin'
    # uint32 is required: ids produced by this tokenizer exceed the uint16
    # range (the end-of-text id shown in the preview above is 199999).
    # NOTE(review): whatever reads these files must also use uint32 - confirm in train.py.
    dtype = np.uint32
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    # Never request more shards than there are rows: with contiguous
    # sharding the surplus shards would be empty and np.concatenate raises
    # ValueError on an empty sequence. For splits with >= 128 rows this is
    # still 128, so the written files are unchanged.
    total_batches = max(1, min(128, len(dset)))

    # Copy the split shard by shard so only one shard's ids are concatenated
    # in memory at a time; idx is the write offset into the memmap.
    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'{filename} oluşturuluyor...'):
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    # Push the memory-mapped contents to disk.
    arr.flush()

train.bin oluşturuluyor...: 100%|██████████| 128/128 [00:00<00:00, 857.77it/s]
val.bin oluşturuluyor...: 100%|██████████| 128/128 [00:00<00:00, 972.77it/s] 
