# Push datasets and tokenizers to the Hugging Face Hub

In [27]:
from typing import List
from datasets import Dataset, DatasetDict, DatasetInfo
from transformers import AutoTokenizer

def dataset_push_to_hub(
    base_path: str,
    suffix_list: List[str],
    push_path: str,
    test_split: float = 0,
    seed: "int | None" = None,
) -> dict:
    """Load each saved dataset variant from disk and push it to the Hub as a private repo.

    For every ``suffix`` in ``suffix_list`` the dataset is read from
    ``f"{base_path}_{suffix}"`` and uploaded to ``f"{push_path}_{suffix}"``.

    Args:
        base_path: Local path prefix shared by all variants (without the
            ``_<suffix>`` part).
        suffix_list: Suffixes identifying the variants to push.
        push_path: Hub repo id prefix (without the ``_<suffix>`` part).
        test_split: When greater than 0 and the loaded object is a single
            ``Dataset``, it is split with
            ``train_test_split(test_size=test_split)`` before being pushed.
            Objects that are already a ``DatasetDict`` are pushed unchanged.
        seed: Optional seed forwarded to ``train_test_split`` so the split can
            be reproduced. ``None`` (the default) leaves the split unseeded,
            as before.

    Returns:
        A dict mapping each suffix to the object that was pushed for it.
    """
    # Imported here because the notebook's import cell only brings in the
    # Dataset / DatasetDict classes, not the generic loader.
    from datasets import load_from_disk

    pushed = {}
    for suffix in suffix_list:
        path = f"{base_path}_{suffix}"
        dataset_push_path = f"{push_path}_{suffix}"
        # The generic loader returns a Dataset or a DatasetDict depending on
        # what was saved. Dataset.load_from_disk can only read a single
        # Dataset, which made the DatasetDict guard below unreachable.
        ds = load_from_disk(path)
        if test_split > 0 and not isinstance(ds, DatasetDict):
            # Hold out a `test_split` fraction of the rows as a separate split.
            ds = ds.train_test_split(test_size=test_split, seed=seed)
        ds.push_to_hub(dataset_push_path, private=True)
        pushed[suffix] = ds
    return pushed

def tokenizer_push_to_hub(base_path: str, suffix_list: List[str], push_path: str):
    """Push one tokenizer per suffix to the Hub.

    Each tokenizer is loaded from ``f"{base_path}-{suffix}"`` and uploaded to
    ``f"{push_path}-{suffix}"``. Note the ``-`` separator here, unlike the
    ``_`` separator used for datasets.

    Args:
        base_path: Path prefix of the saved tokenizers (without ``-<suffix>``).
        suffix_list: Suffixes identifying the tokenizer variants to push.
        push_path: Hub repo id prefix (without ``-<suffix>``).
    """
    for suffix in suffix_list:
        # Load the variant and upload it straight away; nothing is kept around.
        AutoTokenizer.from_pretrained(f"{base_path}-{suffix}").push_to_hub(
            f"{push_path}-{suffix}"
        )


In [None]:
# Push the dictionary-tokenized dataset variants to the Hub.
# dataset_push_to_hub appends "_<size>" to both base_path and push_path.
# NOTE(review): base_path is an absolute path on a specific machine; this cell
# will not run elsewhere without editing it.
base_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/dictionary-llama32-500000samples-tokenized_dict"
push_path = "tomg-group-umd/EIM-dataset-Llama32-Dict"
# One saved dataset / Hub repo per entry (used as the path suffix).
token_sizes = ["10", "100", "500", "1000"]
# test_split is left at its default of 0, so no train/test split is made here.
ds_dict = dataset_push_to_hub(base_path, token_sizes, push_path)

In [28]:
# Push the default-tokenized magpie dataset to the Hub.
# dataset_push_to_hub appends "_<size>" to both base_path and push_path.
# NOTE(review): base_path is an absolute path on a specific machine; this cell
# will not run elsewhere without editing it.
base_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/magpie-default-tokenized"
push_path = "tomg-group-umd/EIM-dataset-Llama32-magpie-default"
# Only the "0" variant is pushed in this run.
token_sizes = ["0"]
# Full list of variants, kept for reference:
# token_sizes = ["0", "10", "100", "500", "1000"]
# test_split=.1 asks dataset_push_to_hub to split off 10% of the rows before pushing.
ds_dict = dataset_push_to_hub(base_path, token_sizes, push_path, test_split=.1)

Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.04ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.62ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.63ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.74ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.71ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.54ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.72ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.76ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.69ba/s]
Uploading the dataset shards: 100%|██████████| 9/9 [00:46<00:00,  5.18s/it]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]


In [2]:
from chat_templating import visualize_loss_mask
from datasets import load_dataset
from transformers import AutoTokenizer

# Sanity check: pull the pushed dataset back from the Hub and render the
# tokens of one sample alongside its loss mask.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
ds_path = "tomg-group-umd/EIM-dataset-Llama32-magpie-default_0"
ds_loaded = load_dataset(ds_path)
# Alternative source: a dataset saved on local disk instead of the Hub copy.
# path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/dictionary-llama32-500000samples-tokenized_dict-100"
# ds = Dataset.load_from_disk(path)
# Only the first 100 training rows are needed for a visual check.
ds = ds_loaded["train"].select(range(100))
sample_no = 1
# Fetch the row once and read both fields from it; indexing the column first
# (ds["input_ids"][sample_no]) loads the whole column just to read one entry.
sample = ds[sample_no]
output = visualize_loss_mask(
    input_ids=sample["input_ids"],
    tokenizer=tokenizer,
    loss_mask=sample["loss_mask"],
    # NOTE(review): presumably ids >= this threshold are treated as newly
    # added tokens - confirm against chat_templating.visualize_loss_mask.
    new_token_id_threshold=len(tokenizer),
)
print(output)




<|begin_of_text|> <|start_header_id|> system <|end_header_id|> Cut   ting ĠKnowledge ĠDate :  ĠDecember Ġ   202  3  Today ĠDate :  Ġ   12  ĠMay Ġ   202  5  You  Ġare Ġa  Ġhelpful Ġassistant . 
[1;37m128000[0m            [1;37m128006[0m              [1;37m9125[0m   [1;37m128007[0m            [1;37m38766[0m [1;37m1303[0m [1;37m33025[0m      [1;37m2696[0m  [1;37m25[0m [1;37m6790[0m      [1;37m220[0m [1;37m2366[0m [1;37m18[0m [1;37m15724[0m [1;37m2696[0m  [1;37m25[0m [1;37m220[0m [1;37m717[0m [1;37m3297[0m [1;37m220[0m [1;37m2366[0m [1;37m20[0m [1;37m2675[0m [1;37m527[0m  [1;37m264[0m [1;37m11190[0m    [1;37m18328[0m      [1;37m13[0m

<|eot_id|> <|start_header_id|> user <|end_header_id|> Can  Ġyou Ġsummarize Ġthe Ġmain Ġpoints Ġfrom Ġ"  G  ut  ĠFeel ings :  ĠThe ĠIntelligence Ġof Ġthe ĠUn  conscious " Ġby ĠG  erd   ĠG  iger en 
[1;37m128009[0m     [1;37m128006[0m              [1;37m882[0m  [1;37m128007[0m            [1;37m685