# imports

In [1]:
import sys
sys.path.append('../src/')
import json
import os
from tqdm import tqdm
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from sentencepiece import SentencePieceProcessor
from functools import partial

# paths

In [2]:
# Directory that is scanned for the `*.json` shards to tokenize
# (relative to the notebook's working directory).
data = '../llama.c-/data/TinyStories_all_data/'
# Alternative absolute location, kept disabled:
# data = '/home/cindy/learning/karpathy/llms/llama2.c/data/TinyStories_all_data/'

In [3]:
# SentencePiece model file that is loaded into `tokenizer` further down.
tokenizer_path = '../tokenizer.model'

In [4]:
# Destination folder for the generated `.bin` files; created here if it
# does not exist yet (no error when it already does).
out_data = '../instruct_dataset/'
os.makedirs(out_data, exist_ok=True)

# preprocessing functions

In [5]:
def replace_words(prompt, words_list):
    """Apply a sequence of literal substring substitutions to ``prompt``.

    Args:
        prompt: Text to rewrite.
        words_list: Iterable of ``(old, new)`` pairs. Pairs are applied in
            order, so a later pair sees the result of the earlier ones.

    Returns:
        The text after every occurrence of each ``old`` has been replaced
        by its ``new``.
    """
    rewritten = prompt
    for pair in words_list:
        old, new = pair
        rewritten = rewritten.replace(old, new)
    return rewritten
        

In [6]:
def preprocess_prompt(prompt, suffix=' Possible story:'):
    """Shorten a story-generation prompt and append ``suffix``.

    The long boilerplate instruction is collapsed to "Write a story.", the
    "simple words" reminder and any existing "Possible story:" trailer are
    removed, and "try to at some point use" is tightened to "try to use".

    Args:
        prompt: Raw instruction text.
        suffix: Text appended after the substitutions.

    Returns:
        The shortened prompt followed by ``suffix``.
    """
    # (verbose phrase, compact replacement) pairs, applied in this order.
    substitutions = (
        (
            "Write a short story (3-5 paragraphs) which only uses very simple words that a 3 year old child would understand.",
            "Write a story.",
        ),
        (" Remember to only use simple words!", ""),
        ("\n\nPossible story:", ""),
        ("try to at some point use", "try to use"),
    )
    shortened = replace_words(prompt, substitutions)
    return shortened + suffix

# tokenize and save

In [7]:
def save(out_folder, filename, data_to_save):
    """Write an array's raw bytes to ``out_folder/filename`` and log it.

    Args:
        out_folder: Directory the file is written into.
        filename: Name of the file inside ``out_folder``.
        data_to_save: Array-like object exposing ``.shape`` and
            ``.tobytes()``; its bytes are written without any header.

    Prints the folder, file name and array shape before writing, and the
    final path once the file has been written.
    """
    destination = os.path.join(out_folder, filename)
    # One pre-formatted string per log line (same text as before).
    print(f"{out_folder!s} {filename!s} {data_to_save.shape!s}")
    with open(destination, "wb") as sink:
        sink.write(data_to_save.tobytes())
    print(f"Saved to  {destination!s}")

In [8]:
def tokenize_chunk(chunk_path, out_folder, tokenizer, max_seq_len, pad_token):
    """Tokenize one JSON shard into fixed-length token/label arrays and save them.

    The shard is a JSON list of samples; each sample provides the story text
    under ``'story'`` and the raw instruction under
    ``['instruction']['prompt:']``. For every sample the sequence
    ``prompt tokens + BOS + story tokens + EOS`` is built. The matching label
    sequence is identical except that the prompt positions hold
    ``pad_token``. Sequences longer than ``max_seq_len`` are skipped; shorter
    ones are right-padded with ``pad_token`` to exactly ``max_seq_len``.

    All kept sequences are concatenated into one flat ``int16`` array per
    kind and written to ``out_folder``: tokens as ``<shard name>.bin`` and
    labels under the same name with ``data`` replaced by ``labels``.

    Args:
        chunk_path: Path of the ``.json`` shard to read.
        out_folder: Directory the two ``.bin`` files are written into.
        tokenizer: Object exposing ``encode(text)``, ``bos_id()`` and
            ``eos_id()``.
        max_seq_len: Fixed length of every stored sequence.
        pad_token: Value used for padding and for masking the prompt in the
            labels.

    NOTE(review): the token stream itself is padded with ``pad_token`` too;
    with a negative value the consumer presumably has to mask or replace it
    before any embedding lookup — confirm against the training code.
    NOTE(review): values are stored as int16, which assumes every token id
    fits in that range — confirm against the tokenizer's vocabulary size.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk = json.load(f)
        print('Tokenize ', chunk_path)

    bos_id = tokenizer.bos_id()
    eos_id = tokenizer.eos_id()

    all_tokens = []
    all_labels = []
    for sample in tqdm(chunk):
        story = sample['story'].strip()
        prompt = preprocess_prompt(sample['instruction']['prompt:'].strip())
        tokenized_prompt = tokenizer.encode(prompt)

        # Encode the story once and share it between inputs and labels.
        completion = [bos_id] + tokenizer.encode(story) + [eos_id]

        prompt_and_story = tokenized_prompt + completion
        label = [pad_token] * len(tokenized_prompt) + completion

        # Over-long samples are dropped rather than truncated.
        if len(prompt_and_story) <= max_seq_len:
            padding = [pad_token] * (max_seq_len - len(prompt_and_story))
            all_tokens.extend(prompt_and_story + padding)
            all_labels.extend(label + padding)

    all_tokens = np.array(all_tokens, dtype=np.int16)
    all_labels = np.array(all_labels, dtype=np.int16)

    # dataNN.json -> dataNN.bin, independent of the path separator in use.
    shard_stem = os.path.splitext(os.path.basename(chunk_path))[0]
    all_tokens_filename = shard_stem + '.bin'
    save(out_folder=out_folder, filename=all_tokens_filename, data_to_save=all_tokens)

    all_labels_filename = all_tokens_filename.replace('data', 'labels')
    if all_labels_filename == all_tokens_filename:
        # The shard name contains no 'data'; without this guard the labels
        # would overwrite the token file that was just written.
        all_labels_filename = 'labels_' + all_tokens_filename
    save(out_folder=out_folder, filename=all_labels_filename, data_to_save=all_labels)

        
    
def tokenize_all_chunks(data, out_folder, tokenizer, max_seq_len, pad_token, max_workers=5):
    """Tokenize every ``.json`` shard found in ``data`` using worker processes.

    Args:
        data: Directory whose ``*.json`` files are processed.
        out_folder: Directory that receives the generated ``.bin`` files.
        tokenizer: Tokenizer forwarded to ``tokenize_chunk``; it is sent to
            the worker processes, so it must be picklable.
        max_seq_len: Fixed sequence length forwarded to ``tokenize_chunk``.
        pad_token: Padding/masking value forwarded to ``tokenize_chunk``.
        max_workers: Number of worker processes.

    Raises:
        Exception: Whatever a worker raised while processing a shard is
            re-raised here instead of being silently discarded.
    """
    tokenize_chunk_fn = partial(
        tokenize_chunk,
        out_folder=out_folder,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        pad_token=pad_token
    )

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    tokenize_chunk_paths = sorted(
        os.path.join(data, fn) for fn in os.listdir(data) if fn.endswith('.json')
    )
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map only re-raises a worker's exception when the
        # corresponding result is retrieved, so the iterator must be drained
        # or a failed shard would go unnoticed.
        for _ in executor.map(tokenize_chunk_fn, tokenize_chunk_paths):
            pass
        
        

In [9]:
# Load the SentencePiece model once; this instance is handed to
# `tokenize_all_chunks` below.
tokenizer = SentencePieceProcessor(model_file=tokenizer_path)

In [10]:
# Tokenize every shard under `data` in parallel worker processes and write
# the resulting `.bin` files to `out_data`.
# - max_seq_len: samples whose prompt+story exceed this many tokens are
#   skipped; shorter ones are padded up to it.
# - pad_token: used for that padding and to mask the prompt positions in the
#   label arrays.
# NOTE(review): -100 presumably matches the ignore index of the loss used in
# training — confirm against the training code.
tokenize_all_chunks(
    data=data, 
    out_folder=out_data, 
    tokenizer=tokenizer, 
    max_seq_len=350,
    pad_token=-100
)

Tokenize  ../llama.c-/data/TinyStories_all_data/data34.json


  1%|          | 1164/100000 [00:01<01:33, 1060.84it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data07.json


  1%|▏         | 1277/100000 [00:01<01:31, 1079.22it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data04.json


  0%|          | 97/100000 [00:00<01:43, 968.18it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data22.json


  0%|          | 306/100000 [00:00<01:36, 1028.70it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data18.json


 88%|████████▊ | 87749/100000 [01:20<00:18, 671.23it/s]s]
 92%|█████████▏| 92155/100000 [01:23<00:08, 944.51it/s]]

../instruct_dataset/ data34.bin (32911200,)


 92%|█████████▏| 91766/100000 [01:24<00:08, 1019.47it/s]

Saved to  ../instruct_dataset/data34.bin
../instruct_dataset/ 

 96%|█████████▋| 96275/100000 [01:24<00:03, 1097.79it/s]

labels34.bin (32911200,)


 92%|█████████▏| 91884/100000 [01:24<00:07, 1065.64it/s]

Saved to  ../instruct_dataset/labels34.bin


 94%|█████████▎| 93600/100000 [01:24<00:05, 1270.71it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data30.json


100%|██████████| 100000/100000 [01:27<00:00, 1143.74it/s]
100%|██████████| 100000/100000 [01:27<00:00, 1142.57it/s]
 99%|█████████▉| 99003/100000 [01:30<00:01, 915.04it/s] 

../instruct_dataset/ data07.bin (32866050,)

  6%|▌         | 5742/100000 [00:05<01:25, 1096.63it/s]




 99%|█████████▉| 99097/100000 [01:30<00:00, 921.41it/s]

Saved to  ../instruct_dataset/data07.bin
../instruct_dataset/ labels07.bin (32866050,)


 99%|█████████▉| 99197/100000 [01:30<00:00, 943.54it/s]

Saved to  ../instruct_dataset/labels07.bin


 99%|█████████▉| 99292/100000 [01:30<00:00, 927.36it/s]

../instruct_dataset/ data22.bin (32851700,)


 99%|█████████▉| 99389/100000 [01:30<00:00, 939.86it/s]

Saved to 

 99%|█████████▉| 99233/100000 [01:30<00:00, 827.41it/s]

 ../instruct_dataset/data22.bin
../instruct_dataset/ labels22.bin (32851700,)


  6%|▌         | 6213/100000 [00:05<01:20, 1159.32it/s]

Saved to  ../instruct_dataset/labels22.bin


100%|██████████| 100000/100000 [01:31<00:00, 1097.38it/s]
100%|██████████| 100000/100000 [01:31<00:00, 1092.57it/s]
  7%|▋         | 7201/100000 [00:06<01:19, 1171.20it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data47.json


  0%|          | 231/100000 [00:00<01:25, 1166.65it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data15.json


  2%|▏         | 2241/100000 [00:01<01:29, 1096.62it/s]

../instruct_dataset/ data18.bin (32886000,)

 10%|▉         | 9776/100000 [00:08<01:12, 1243.80it/s]


Saved to  ../instruct_dataset/data18.bin
../instruct_dataset/ labels18.bin (32886000,)

  2%|▏         | 2352/100000 [00:02<01:29, 1090.35it/s]




 10%|▉         | 9901/100000 [00:08<01:14, 1216.84it/s]

../instruct_dataset/ data04.bin (32841550,)
Saved to  ../instruct_dataset/labels18.bin
Saved to  ../instruct_dataset/data04.bin
../instruct_dataset/ labels04.bin (32841550,)


 10%|█         | 10024/100000 [00:08<01:16, 1179.93it/s]

Saved to  ../instruct_dataset/labels04.bin


  3%|▎         | 3404/100000 [00:03<01:43, 929.37it/s]]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data39.json


  4%|▎         | 3518/100000 [00:03<01:37, 986.75it/s]]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data21.json


100%|██████████| 100000/100000 [01:34<00:00, 1053.40it/s]
100%|██████████| 100000/100000 [01:25<00:00, 1166.14it/s]
100%|██████████| 100000/100000 [01:29<00:00, 1119.65it/s]
 99%|█████████▊| 98624/100000 [01:31<00:01, 884.04it/s] 

../instruct_dataset/ data30.bin (32901750,)


 99%|█████████▊| 98682/100000 [01:28<00:01, 888.59it/s]

Saved to  

 99%|█████████▊| 98713/100000 [01:31<00:01, 856.82it/s]

../instruct_dataset/data30.bin
../instruct_dataset/ labels30.bin (32901750,)
Saved to 

 99%|█████████▉| 98799/100000 [01:32<00:01, 848.39it/s]

 

 99%|█████████▉| 98772/100000 [01:28<00:01, 840.13it/s]

../instruct_dataset/labels30.bin


 99%|█████████▉| 99060/100000 [01:29<00:01, 864.87it/s]

../instruct_dataset/ data39.bin (32886350,)


 99%|█████████▉| 99148/100000 [01:29<00:00, 854.29it/s]

Saved to  ../instruct_dataset/data39.bin
../instruct_dataset/ labels39.bin (32886350,)


 99%|█████████▉| 99214/100000 [01:32<00:00, 818.96it/s]

Saved to  ../instruct_dataset/labels39.bin


100%|█████████▉| 99511/100000 [01:29<00:00, 893.74it/s]

../instruct_dataset/ data15.bin (32904550,)


100%|█████████▉| 99570/100000 [01:32<00:00, 861.47it/s]

Saved to 

100%|█████████▉| 99606/100000 [01:29<00:00, 907.84it/s]

 ../instruct_dataset/data15.bin
../instruct_dataset/ labels15.bin (32904550,)
Saved to  ../instruct_dataset/labels15.bin


100%|██████████| 100000/100000 [01:30<00:00, 1110.42it/s]
100%|█████████▉| 99984/100000 [01:33<00:00, 999.94it/s]

Tokenize 

100%|██████████| 100000/100000 [01:33<00:00, 1070.90it/s]


 ../llama.c-/data/TinyStories_all_data/data17.json


  0%|          | 326/100000 [00:00<01:31, 1085.09it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data03.json


  0%|          | 373/100000 [00:00<01:23, 1197.39it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data01.json


  2%|▏         | 2417/100000 [00:02<01:34, 1037.05it/s]

../instruct_dataset/ data21.bin (32876200,)


  2%|▏         | 1993/100000 [00:01<01:22, 1184.76it/s]

Saved to  ../instruct_dataset/data21.bin../instruct_dataset/
../instruct_dataset/  data47.binlabels21.bin  (32881450,)(32876200,)



  2%|▎         | 2500/100000 [00:02<01:20, 1209.70it/s]

Saved to Saved to   ../instruct_dataset/labels21.bin../instruct_dataset/data47.bin

../instruct_dataset/ labels47.bin (32881450,)


  3%|▎         | 2624/100000 [00:02<01:37, 1002.62it/s]

Saved to  ../instruct_dataset/labels47.bin


  4%|▎         | 3714/100000 [00:03<01:19, 1213.68it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data36.json


  3%|▎         | 3295/100000 [00:02<01:21, 1184.00it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data14.json


100%|██████████| 100000/100000 [01:26<00:00, 1151.80it/s]
100%|██████████| 100000/100000 [01:27<00:00, 1144.27it/s]
 99%|█████████▉| 99435/100000 [01:25<00:00, 928.75it/s]s]
100%|██████████| 100000/100000 [01:26<00:00, 1158.16it/s]
 97%|█████████▋| 97089/100000 [01:26<00:03, 911.50it/s]

../instruct_dataset/ data17.bin (32890900,)


 97%|█████████▋| 97181/100000 [01:26<00:03, 884.82it/s]

Saved to  ../instruct_dataset/data17.bin
../instruct_dataset/ labels17.bin (32890900,)


 97%|█████████▋| 97270/100000 [01:26<00:03, 868.21it/s]

Saved to  ../instruct_dataset/labels17.bin


 98%|█████████▊| 98081/100000 [01:27<00:02, 902.31it/s]

../instruct_dataset/ data03.bin (32862900,)


 98%|█████████▊| 98172/100000 [01:27<00:02, 888.59it/s]

Saved to  ../instruct_dataset/data03.bin
../instruct_dataset/ labels03.bin (32862900,)


 98%|█████████▊| 98274/100000 [01:27<00:01, 923.76it/s]

Saved to  ../instruct_dataset/labels03.bin


 98%|█████████▊| 98382/100000 [01:28<00:01, 968.33it/s]

Tokenize  

 98%|█████████▊| 98492/100000 [01:28<00:01, 1006.05it/s]

../llama.c-/data/TinyStories_all_data/data06.json


 99%|█████████▉| 99179/100000 [01:28<00:00, 911.78it/s] 

../instruct_dataset/ data01.bin (32918550,)


 99%|█████████▉| 99277/100000 [01:28<00:00, 929.55it/s]

Saved to  ../instruct_dataset/data01.bin
../instruct_dataset/ labels01.bin Tokenize  (32918550,)../llama.c-/data/TinyStories_all_data/data28.json



  1%|          | 1023/100000 [00:00<01:32, 1075.51it/s]

Saved to 

 99%|█████████▉| 99375/100000 [01:29<00:00, 942.30it/s]

 ../instruct_dataset/labels01.bin


100%|█████████▉| 99583/100000 [01:29<00:00, 990.36it/s]

../instruct_dataset/ data14.bin (32941650,)


  1%|▏         | 1355/100000 [00:01<01:31, 1079.27it/s]

Saved to 

100%|█████████▉| 99683/100000 [01:29<00:00, 985.11it/s]

 ../instruct_dataset/data14.bin
../instruct_dataset/ labels14.bin (32941650,)


  0%|          | 430/100000 [00:00<01:38, 1007.37it/s]

Saved to  ../instruct_dataset/labels14.bin


100%|██████████| 100000/100000 [01:29<00:00, 1114.84it/s]
  2%|▏         | 2232/100000 [00:02<01:30, 1084.04it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data43.json


  3%|▎         | 2556/100000 [00:02<01:34, 1035.85it/s]

Tokenize 

  0%|          | 319/100000 [00:00<01:37, 1027.35it/s]

 ../llama.c-/data/TinyStories_all_data/data23.json


  2%|▏         | 1865/100000 [00:01<01:32, 1066.06it/s]

../instruct_dataset/ 

  2%|▏         | 1578/100000 [00:01<01:31, 1077.59it/s]

data36.bin (32906300,)


  4%|▍         | 4230/100000 [00:03<01:29, 1066.42it/s]

Saved to  ../instruct_dataset/data36.bin
../instruct_dataset/

  2%|▏         | 1972/100000 [00:01<01:33, 1052.26it/s]

 labels36.bin (32906300,)


  2%|▏         | 1697/100000 [00:01<01:28, 1108.81it/s]

Saved to  ../instruct_dataset/labels36.bin


  3%|▎         | 2905/100000 [00:02<01:17, 1244.97it/s]

Tokenize 

  5%|▌         | 5495/100000 [00:05<01:20, 1174.87it/s]

 ../llama.c-/data/TinyStories_all_data/data38.json


100%|██████████| 100000/100000 [01:27<00:00, 1136.60it/s]
100%|██████████| 100000/100000 [01:27<00:00, 1142.25it/s]
100%|██████████| 100000/100000 [01:26<00:00, 1161.38it/s]
100%|██████████| 100000/100000 [01:28<00:00, 1135.18it/s]
 97%|█████████▋| 96758/100000 [01:26<00:03, 845.18it/s]

../instruct_dataset/ data06.bin (32894750,)
Saved to  

 97%|█████████▋| 96843/100000 [01:26<00:03, 811.86it/s]

../instruct_dataset/data06.bin
../instruct_dataset/ labels06.bin (32894750,)
Saved to  

 97%|█████████▋| 96925/100000 [01:26<00:03, 808.43it/s]

../instruct_dataset/labels06.bin


 97%|█████████▋| 97007/100000 [01:26<00:03, 796.98it/s]

../instruct_dataset/ data23.bin 

 97%|█████████▋| 97087/100000 [01:26<00:03, 786.67it/s]

(32876200,)
Saved to  ../instruct_dataset/data23.bin
../instruct_dataset/

 97%|█████████▋| 97166/100000 [01:26<00:03, 786.36it/s]

 labels23.bin (32876200,)
../instruct_dataset/ data28.bin (32866750,)
Saved to  ../instruct_dataset/labels23.bin


 97%|█████████▋| 97247/100000 [01:26<00:03, 791.74it/s]

Saved to  ../instruct_dataset/data28.bin
../instruct_dataset/ labels28.bin (32866750,)


 97%|█████████▋| 97327/100000 [01:26<00:03, 780.43it/s]

Saved to  ../instruct_dataset/labels28.bin


 98%|█████████▊| 97893/100000 [01:27<00:02, 989.56it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data40.json


  0%|          | 222/100000 [00:00<01:30, 1098.51it/s]s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data10.json


  0%|          | 115/100000 [00:00<01:27, 1145.06it/s]s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data19.json


  0%|          | 230/100000 [00:00<01:29, 1120.29it/s]s]

../instruct_dataset/ 

  0%|          | 111/100000 [00:00<01:30, 1104.19it/s]

data43.bin (32898250,)
Saved to 

  1%|          | 546/100000 [00:00<01:35, 1039.21it/s]s]

 ../instruct_dataset/data43.bin
../instruct_dataset/ labels43.bin (32898250,)


  0%|          | 222/100000 [00:00<01:32, 1072.96it/s]

Saved to  ../instruct_dataset/labels43.bin


  1%|▏         | 1306/100000 [00:01<01:29, 1104.36it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data20.json


100%|██████████| 100000/100000 [01:29<00:00, 1118.27it/s]
  4%|▍         | 4395/100000 [00:03<01:21, 1178.88it/s]

../instruct_dataset/ data38.bin (32908400,)


  3%|▎         | 2907/100000 [00:02<01:30, 1067.46it/s]

Saved to  ../instruct_dataset/data38.bin
../instruct_dataset/ labels38.bin

  4%|▍         | 4477/100000 [00:04<01:26, 1109.21it/s]

 (32908400,)


  3%|▎         | 3018/100000 [00:02<01:30, 1077.14it/s]

Saved to  ../instruct_dataset/labels38.bin


  4%|▍         | 4032/100000 [00:03<01:26, 1105.69it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data29.json

  6%|▌         | 5578/100000 [00:05<01:27, 1084.27it/s]




100%|██████████| 100000/100000 [01:18<00:00, 1272.52it/s]
100%|██████████| 100000/100000 [01:21<00:00, 1228.33it/s]
 93%|█████████▎| 92799/100000 [01:16<00:06, 1076.30it/s]

../instruct_dataset/ data40.bin (32855200,)


 93%|█████████▎| 92907/100000 [01:16<00:06, 1062.51it/s]

Saved to  ../instruct_dataset/data40.bin
../instruct_dataset/ labels40.bin (32855200,)


 98%|█████████▊| 97704/100000 [01:20<00:02, 1038.42it/s]

Saved to 

 98%|█████████▊| 97958/100000 [01:21<00:01, 1095.45it/s]

 ../instruct_dataset/labels40.bin


 99%|█████████▉| 98834/100000 [01:21<00:00, 1243.55it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data11.json


100%|██████████| 100000/100000 [01:23<00:00, 1196.88it/s]
100%|██████████| 100000/100000 [01:22<00:00, 1213.70it/s]
  2%|▏         | 2251/100000 [00:01<01:13, 1330.82it/s]]

../instruct_dataset/

 96%|█████████▌| 95981/100000 [01:19<00:04, 990.98it/s] 

 data19.bin (32873750,)
Saved to  ../instruct_dataset/data19.bin
../instruct_dataset/ labels19.bin (32873750,)


 96%|█████████▌| 96120/100000 [01:19<00:03, 1103.02it/s]

Saved to  ../instruct_dataset/labels19.bin


 97%|█████████▋| 97326/100000 [01:20<00:02, 1208.86it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data46.json


  1%|          | 625/100000 [00:00<01:22, 1199.61it/s]s]

../instruct_dataset/ 

  5%|▍         | 4705/100000 [00:03<01:06, 1428.17it/s]

data10.bin (32901750,)
Saved to  ../instruct_dataset/data10.bin
../instruct_dataset/ 

 98%|█████████▊| 98066/100000 [01:21<00:01, 1144.25it/s]

labels10.bin 

  1%|          | 751/100000 [00:00<01:21, 1216.96it/s]

(32901750,)
../instruct_dataset/ data20.bin (32887400,)


  5%|▍         | 4848/100000 [00:03<01:07, 1418.21it/s]

Saved to  ../instruct_dataset/labels10.bin
Saved to  ../instruct_dataset/data20.bin
../instruct_dataset/ labels20.bin (32887400,)


  5%|▍         | 4991/100000 [00:03<01:06, 1420.48it/s]]

Saved to  ../instruct_dataset/labels20.bin


  6%|▋         | 6293/100000 [00:04<01:05, 1432.67it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data33.json


  6%|▋         | 6437/100000 [00:04<01:06, 1417.52it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data25.json


  7%|▋         | 7021/100000 [00:04<01:04, 1437.73it/s]s]
  5%|▌         | 5398/100000 [00:04<01:14, 1264.18it/s]]

../instruct_dataset/ data29.bin (32850650,)


 10%|█         | 10200/100000 [00:07<01:02, 1433.89it/s]

Saved to  ../instruct_dataset/data29.bin
../instruct_dataset/ labels29.bin (32850650,)


  3%|▎         | 3374/100000 [00:02<01:19, 1217.09it/s]

Saved to  ../instruct_dataset/labels29.bin


  7%|▋         | 6839/100000 [00:05<01:11, 1299.64it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data42.json


100%|██████████| 100000/100000 [01:13<00:00, 1352.93it/s]
 92%|█████████▏| 92023/100000 [01:14<00:07, 1050.11it/s]

../instruct_dataset/ data11.bin (32891950,)


 92%|█████████▏| 92129/100000 [01:14<00:07, 1049.84it/s]

Saved to  ../instruct_dataset/data11.bin
../instruct_dataset/ labels11.bin (32891950,)


 91%|█████████▏| 91375/100000 [01:13<00:07, 1142.20it/s]

Saved to  ../instruct_dataset/labels11.bin


 93%|█████████▎| 92574/100000 [01:14<00:06, 1203.47it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data13.json


100%|██████████| 100000/100000 [01:21<00:00, 1230.37it/s]
100%|██████████| 100000/100000 [01:19<00:00, 1251.04it/s]
 10%|█         | 10221/100000 [00:08<01:11, 1252.18it/s]

../instruct_dataset/ data46.bin (32927300,)


 10%|█         | 10355/100000 [00:08<01:10, 1276.98it/s]

Saved to  ../instruct_dataset/data46.bin
../instruct_dataset/ labels46.bin (32927300,)


 95%|█████████▌| 95346/100000 [01:18<00:05, 869.90it/s]

Saved to  ../instruct_dataset/labels46.bin


100%|██████████| 100000/100000 [01:22<00:00, 1210.60it/s]


../instruct_dataset/ data33.bin (32938500,)
Saved to  ../instruct_dataset/data33.bin

 96%|█████████▌| 95536/100000 [01:18<00:04, 900.65it/s]


../instruct_dataset/ labels33.bin (32938500,)


 11%|█         | 10741/100000 [00:08<01:10, 1260.57it/s]

Saved to  ../instruct_dataset/labels33.bin


 96%|█████████▌| 95980/100000 [01:19<00:03, 1083.56it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data49.json


 12%|█▏        | 12039/100000 [00:09<01:15, 1161.96it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data27.json


 13%|█▎        | 13331/100000 [00:10<01:13, 1181.33it/s]

../instruct_dataset/ data25.bin (32893700,)


  3%|▎         | 2271/67871 [00:01<00:56, 1152.76it/s]s]

Saved to 

  1%|▏         | 1427/100000 [00:01<01:22, 1192.60it/s]]

 ../instruct_dataset/data25.bin
../instruct_dataset/ labels25.bin (32893700,)


 98%|█████████▊| 98185/100000 [01:21<00:01, 1069.15it/s]

Saved to  ../instruct_dataset/labels25.bin


 99%|█████████▉| 99334/100000 [01:22<00:00, 1147.69it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data24.json


100%|██████████| 100000/100000 [01:22<00:00, 1206.03it/s]
 10%|█         | 7097/67871 [00:05<00:49, 1223.06it/s]]]

../instruct_dataset/ data42.bin (32911550,)


  6%|▋         | 6311/100000 [00:05<01:18, 1196.68it/s]

Saved to  ../instruct_dataset/data42.bin

 18%|█▊        | 18281/100000 [00:14<01:09, 1175.42it/s]


../instruct_dataset/ labels42.bin (32911550,)


 11%|█         | 7220/67871 [00:05<00:50, 1189.38it/s]]

Saved to  ../instruct_dataset/labels42.bin


  8%|▊         | 7657/100000 [00:06<01:16, 1200.07it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data32.json


100%|██████████| 67871/67871 [00:54<00:00, 1248.54it/s]]
 78%|███████▊  | 77646/100000 [01:05<00:23, 945.96it/s]]

../instruct_dataset/ data49.bin (22325100,)


 64%|██████▍   | 63798/100000 [00:53<00:36, 1003.90it/s]

Saved to 

 58%|█████▊    | 57660/100000 [00:49<00:45, 934.49it/s]

 ../instruct_dataset/data49.bin
../instruct_dataset/ labels49.bin (22325100,)


 65%|██████▌   | 65019/100000 [00:56<00:36, 971.58it/s]

Saved to  ../instruct_dataset/labels49.bin


 66%|██████▌   | 65930/100000 [00:57<00:30, 1124.60it/s]

Tokenize 

 65%|██████▍   | 64882/100000 [00:54<00:28, 1222.79it/s]

 ../llama.c-/data/TinyStories_all_data/data41.json

 59%|█████▊    | 58682/100000 [00:50<00:35, 1169.44it/s]




100%|██████████| 100000/100000 [01:25<00:00, 1168.57it/s]
 90%|█████████ | 90035/100000 [01:16<00:12, 823.64it/s] 

../instruct_dataset/ data13.bin (32839800,)


 89%|████████▉ | 89395/100000 [01:18<00:13, 794.16it/s]

Saved to 

 24%|██▍       | 24156/100000 [00:21<01:26, 879.92it/s]

 ../instruct_dataset/data13.bin
../instruct_dataset/

 90%|█████████ | 90119/100000 [01:16<00:11, 827.81it/s]

 labels13.bin (32839800,)


 89%|████████▉ | 89490/100000 [01:18<00:12, 837.76it/s]

Saved to  ../instruct_dataset/labels13.bin

 24%|██▍       | 24245/100000 [00:21<01:26, 878.57it/s]




 84%|████████▍ | 84390/100000 [01:13<00:15, 1015.48it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data26.json


100%|██████████| 100000/100000 [01:26<00:00, 1159.14it/s]
100%|██████████| 100000/100000 [01:29<00:00, 1121.40it/s]
 96%|█████████▌| 96014/100000 [01:25<00:04, 917.71it/s] 

../instruct_dataset/ data24.bin (32926600,)


 37%|███▋      | 36741/100000 [00:34<01:08, 918.51it/s]

Saved to  ../instruct_dataset/data24.bin
../instruct_dataset/

 96%|█████████▌| 96107/100000 [01:25<00:04, 904.81it/s]

 labels24.bin (32926600,)
Saved to 

 12%|█▏        | 11665/100000 [00:11<01:36, 918.47it/s]

 ../instruct_dataset/labels24.bin


 96%|█████████▋| 96310/100000 [01:25<00:03, 953.71it/s]

../instruct_dataset/ data27.bin (32921700,)


 96%|█████████▋| 96411/100000 [01:25<00:03, 968.48it/s]

Saved to  ../instruct_dataset/data27.bin
../instruct_dataset/ labels27.bin (32921700,)


 97%|█████████▋| 96509/100000 [01:25<00:03, 942.69it/s]

Saved to  ../instruct_dataset/labels27.bin


 38%|███▊      | 37735/100000 [00:35<01:02, 996.29it/s]]

Tokenize  

 97%|█████████▋| 97159/100000 [01:26<00:02, 1076.78it/s]

../llama.c-/data/TinyStories_all_data/data37.json


 38%|███▊      | 38164/100000 [00:35<00:58, 1048.11it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data09.json


100%|██████████| 100000/100000 [01:28<00:00, 1125.59it/s]
  5%|▌         | 5098/100000 [00:04<01:18, 1202.99it/s]]

../instruct_dataset/ data32.bin (32917150,)


 43%|████▎     | 43328/100000 [00:40<00:51, 1107.12it/s]

Saved to  ../instruct_dataset/data32.bin
../instruct_dataset/ labels32.bin (32917150,)


  5%|▌         | 5220/100000 [00:04<01:18, 1205.82it/s]

Saved to 

 18%|█▊        | 18286/100000 [00:17<01:08, 1186.19it/s]

 ../instruct_dataset/labels32.bin


  6%|▋         | 6283/100000 [00:05<01:23, 1125.41it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data05.json


100%|██████████| 100000/100000 [01:25<00:00, 1163.12it/s]
 81%|████████  | 80842/100000 [01:05<00:14, 1283.73it/s]

../instruct_dataset/ data41.bin (32917500,)


 70%|██████▉   | 69638/100000 [00:53<00:23, 1290.09it/s]

Saved to  ../instruct_dataset/data41.bin
../instruct_dataset/

 60%|█████▉    | 59809/100000 [00:47<00:32, 1252.98it/s]

 labels41.bin (32917500,)

 81%|████████  | 80990/100000 [01:05<00:14, 1339.43it/s]




 70%|██████▉   | 69782/100000 [00:53<00:22, 1332.82it/s]

Saved to  ../instruct_dataset/labels41.bin


 82%|████████▏ | 82425/100000 [01:07<00:12, 1409.16it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data45.json


100%|██████████| 100000/100000 [01:19<00:00, 1261.10it/s]
 19%|█▉        | 19133/100000 [00:14<01:02, 1294.39it/s]

../instruct_dataset/ data26.bin 

 87%|████████▋ | 86891/100000 [01:09<00:10, 1200.51it/s]

(32851700,)


 81%|████████  | 80979/100000 [01:03<00:16, 1183.85it/s]

Saved to  ../instruct_dataset/data26.bin
../instruct_dataset/ labels26.bin (32851700,)


 92%|█████████▏| 92202/100000 [01:08<00:05, 1305.64it/s]

Saved to 

 81%|████████  | 81107/100000 [01:03<00:15, 1208.68it/s]

 ../instruct_dataset/labels26.bin


 20%|██        | 20438/100000 [00:15<01:03, 1255.09it/s]

Tokenize  

 88%|████████▊ | 88183/100000 [01:10<00:09, 1285.23it/s]

../llama.c-/data/TinyStories_all_data/data16.json


100%|██████████| 100000/100000 [01:15<00:00, 1333.21it/s]
 98%|█████████▊| 97507/100000 [01:17<00:02, 1145.36it/s]

../instruct_dataset/ data09.bin (32870950,)


 10%|▉         | 9721/100000 [00:07<01:02, 1444.96it/s]

Saved to  ../instruct_dataset/data09.bin

 30%|██▉       | 29743/100000 [00:23<00:56, 1251.62it/s]


../instruct_dataset/ labels09.bin (32870950,)


 98%|█████████▊| 97632/100000 [01:17<00:02, 1173.03it/s]

Saved to  ../instruct_dataset/labels09.bin


 99%|█████████▉| 98773/100000 [01:18<00:00, 1288.30it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data00.json


100%|██████████| 100000/100000 [01:19<00:00, 1256.01it/s]
 97%|█████████▋| 97003/100000 [01:15<00:02, 1273.48it/s]

../instruct_dataset/ data37.bin 

 15%|█▌        | 15486/100000 [00:11<00:58, 1441.49it/s]

(32890200,)
Saved to  ../instruct_dataset/data37.bin

 35%|███▍      | 34932/100000 [00:27<00:53, 1209.04it/s]


../instruct_dataset/ labels37.bin

  4%|▍         | 4461/100000 [00:03<01:08, 1385.29it/s]

 (32890200,)


 16%|█▌        | 15636/100000 [00:11<00:57, 1458.09it/s]

Saved to  ../instruct_dataset/labels37.bin


 36%|███▌      | 36216/100000 [00:28<00:49, 1293.76it/s]

Tokenize 

  6%|▌         | 5896/100000 [00:04<01:07, 1390.65it/s]

 ../llama.c-/data/TinyStories_all_data/data48.json


 38%|███▊      | 37818/100000 [00:29<00:47, 1296.63it/s]]
 22%|██▏       | 21773/100000 [00:16<00:55, 1410.10it/s]

../instruct_dataset/ data05.bin

  4%|▍         | 4233/100000 [00:03<01:17, 1232.64it/s]

 (32864300,)
Saved to  ../instruct_dataset/data05.bin

 11%|█         | 10680/100000 [00:07<01:01, 1445.93it/s]




 41%|████      | 40598/100000 [00:31<00:47, 1253.53it/s]

../instruct_dataset/ labels05.bin (32864300,)


  4%|▍         | 4357/100000 [00:03<01:17, 1228.43it/s]]

Saved to  ../instruct_dataset/labels05.bin


  6%|▌         | 5513/100000 [00:04<01:14, 1268.23it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data12.json


100%|██████████| 100000/100000 [01:23<00:00, 1202.57it/s]
 67%|██████▋   | 66917/100000 [00:57<00:33, 1000.69it/s]

../instruct_dataset/ data45.bin (32910500,)


 84%|████████▍ | 84071/100000 [01:10<00:14, 1082.30it/s]

Saved to  

 67%|██████▋   | 67032/100000 [00:57<00:31, 1044.13it/s]

../instruct_dataset/data45.bin
../instruct_dataset/ labels45.bin (32910500,)
Saved to  ../instruct_dataset/labels45.bin


 68%|██████▊   | 68030/100000 [00:58<00:29, 1078.93it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data02.json


100%|██████████| 100000/100000 [01:24<00:00, 1178.66it/s]
 84%|████████▎ | 83541/100000 [01:10<00:15, 1062.55it/s]

../instruct_dataset/ data16.bin (32942350,)


 20%|█▉        | 19757/100000 [00:15<01:04, 1237.58it/s]

Saved to  ../instruct_dataset/data16.bin
../instruct_dataset/ labels16.bin (32942350,)


 20%|█▉        | 19882/100000 [00:16<01:04, 1239.45it/s]

Saved to  ../instruct_dataset/labels16.bin


 91%|█████████▏| 91390/100000 [01:20<00:07, 1118.32it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data44.json


100%|██████████| 100000/100000 [01:27<00:00, 1139.33it/s]
 10%|█         | 10372/100000 [00:10<01:25, 1045.94it/s]

../instruct_dataset/ 

100%|█████████▉| 99544/100000 [01:25<00:00, 1097.48it/s]

data00.bin (32850650,)


 34%|███▎      | 33504/100000 [00:27<00:57, 1159.39it/s]

Saved to  ../instruct_dataset/data00.bin
../instruct_dataset/ labels00.bin (32850650,)


 34%|███▎      | 33627/100000 [00:27<00:56, 1178.86it/s]

Saved to  ../instruct_dataset/labels00.bin


100%|██████████| 100000/100000 [01:26<00:00, 1157.07it/s]
 99%|█████████▉| 98802/100000 [01:22<00:00, 1241.04it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data35.json


100%|██████████| 100000/100000 [01:23<00:00, 1193.92it/s]
 13%|█▎        | 13099/100000 [00:12<01:25, 1018.11it/s]

../instruct_dataset/ data48.bin (32895450,)


 37%|███▋      | 36749/100000 [00:29<00:53, 1182.45it/s]

Saved to  ../instruct_dataset/data48.bin
../instruct_dataset/ labels48.bin (32895450,)


 13%|█▎        | 13201/100000 [00:12<01:27, 995.64it/s] 

Saved to  ../instruct_dataset/labels48.bin


  3%|▎         | 2729/100000 [00:02<01:33, 1042.57it/s]]

Tokenize  ../llama.c-/data/TinyStories_all_data/data31.json


 15%|█▍        | 14741/100000 [00:14<01:22, 1037.70it/s]

../instruct_dataset/ data12.bin (32865700,)


  1%|          | 762/100000 [00:00<01:19, 1252.67it/s]]

Saved to  ../instruct_dataset/data12.bin
../instruct_dataset/ labels12.bin (32865700,)


 15%|█▍        | 14845/100000 [00:14<01:23, 1018.25it/s]

Saved to  ../instruct_dataset/labels12.bin


 16%|█▌        | 15831/100000 [00:15<01:23, 1012.74it/s]

Tokenize  ../llama.c-/data/TinyStories_all_data/data08.json


100%|██████████| 100000/100000 [01:20<00:00, 1239.58it/s]
 65%|██████▍   | 64517/100000 [00:54<00:28, 1264.06it/s]

../instruct_dataset/ data02.bin (32832100,)


 75%|███████▌  | 75449/100000 [01:06<00:18, 1325.02it/s]

Saved to  ../instruct_dataset/data02.bin
../instruct_dataset/ labels02.bin (32832100,)


 65%|██████▍   | 64658/100000 [00:55<00:27, 1306.13it/s]

Saved to  ../instruct_dataset/labels02.bin

 66%|██████▋   | 66487/100000 [00:52<00:26, 1262.90it/s]




100%|██████████| 100000/100000 [01:24<00:00, 1187.57it/s]
 93%|█████████▎| 92932/100000 [01:12<00:05, 1267.02it/s]

../instruct_dataset/ data44.bin 

 91%|█████████ | 90862/100000 [01:11<00:07, 1245.25it/s]

(32884600,)
Saved to 

 92%|█████████▏| 92003/100000 [01:15<00:06, 1263.00it/s]

 ../instruct_dataset/data44.bin
../instruct_dataset/ labels44.bin (32884600,)


 91%|█████████ | 90996/100000 [01:11<00:07, 1271.94it/s]

Saved to  ../instruct_dataset/labels44.bin


100%|██████████| 100000/100000 [01:17<00:00, 1284.14it/s]
100%|██████████| 100000/100000 [01:21<00:00, 1233.36it/s]
100%|██████████| 100000/100000 [01:17<00:00, 1282.92it/s]


../instruct_dataset/ data31.bin (32855900,)
Saved to  ../instruct_dataset/data31.bin
../instruct_dataset/ labels31.bin (32855900,)
Saved to  ../instruct_dataset/labels31.bin
../instruct_dataset/ data35.bin (32884250,)
Saved to  ../instruct_dataset/data35.bin
../instruct_dataset/ labels35.bin (32884250,)
Saved to  ../instruct_dataset/labels35.bin
../instruct_dataset/ data08.bin (32878650,)
Saved to  ../instruct_dataset/data08.bin
../instruct_dataset/ labels08.bin (32878650,)
Saved to  ../instruct_dataset/labels08.bin


# check data

In [11]:


# Read the token ids of shard 11 back from disk. The dtype must match the
# int16 used when the file was written. The result is one flat array; every
# stored sample occupies exactly `max_seq_len` consecutive entries.
# os.path.join (as in `save`) keeps this working when `out_data` has no
# trailing separator.
with open(os.path.join(out_data, 'data11.bin'), 'rb') as f:
    x = np.frombuffer(f.read(), dtype=np.int16)



In [12]:
# Read the labels of shard 11 back from disk. The dtype must match the int16
# used when the file was written. The result is one flat array; every stored
# sample occupies exactly `max_seq_len` consecutive entries.
# os.path.join (as in `save`) keeps this working when `out_data` has no
# trailing separator.
with open(os.path.join(out_data, 'labels11.bin'), 'rb') as f:
    y = np.frombuffer(f.read(), dtype=np.int16)