# Push datasets and tokenizers to the Hugging Face Hub

In [27]:
from typing import List
from datasets import Dataset, DatasetDict, DatasetInfo
from transformers import AutoTokenizer

def dataset_push_to_hub(base_path: str, suffix_list: List[str], push_path: str, test_split: float = 0) -> dict:
    """Load one saved dataset per suffix, optionally split it, and push it to the Hub.

    For every ``suffix`` in ``suffix_list`` the dataset is read from
    ``f"{base_path}_{suffix}"`` and pushed as a private repo to
    ``f"{push_path}_{suffix}"``.

    Args:
        base_path: On-disk path prefix; ``_<suffix>`` is appended per dataset.
        suffix_list: Suffixes identifying the dataset variants to process.
        push_path: Hub repo id prefix; ``_<suffix>`` is appended per dataset.
        test_split: When greater than 0, forwarded as ``test_size`` to
            ``train_test_split`` for datasets that are not already a
            ``DatasetDict``. The default of 0 pushes the dataset unsplit.

    Returns:
        A dict mapping each suffix to the object that was pushed (the split
        result when a split was made, otherwise the dataset as loaded), so the
        caller can inspect it afterwards, e.g. ``result["0"]["train"]``.
        An empty ``suffix_list`` yields an empty dict.
    """
    pushed = {}
    for suffix in suffix_list:
        path = f"{base_path}_{suffix}"
        dataset_push_path = f"{push_path}_{suffix}"
        ds = Dataset.load_from_disk(path)
        if test_split > 0 and not isinstance(ds, DatasetDict):
            # Hold out `test_split` of the rows; the rest stays in the train split.
            ds = ds.train_test_split(test_size=test_split)
        ds.push_to_hub(dataset_push_path, private=True)
        # Record only after a successful push so the result reflects what is on the Hub.
        pushed[suffix] = ds
    return pushed

def tokenizer_push_to_hub(base_path: str, suffix_list: List[str], push_path: str):
    """Upload one tokenizer per suffix to the Hub.

    For each entry of ``suffix_list`` the tokenizer is loaded with
    ``AutoTokenizer.from_pretrained`` from ``"<base_path>-<suffix>"`` and then
    pushed to ``"<push_path>-<suffix>"``. Both names are joined with a hyphen.

    Args:
        base_path: Prefix of the location each tokenizer is loaded from.
        suffix_list: Suffixes identifying the tokenizer variants to upload.
        push_path: Prefix of the Hub repo id each tokenizer is pushed to.
    """
    for tag in suffix_list:
        # Build the source and destination names for this variant in one go.
        source, target = (f"{prefix}-{tag}" for prefix in (base_path, push_path))
        AutoTokenizer.from_pretrained(source).push_to_hub(target)


In [None]:
# Dictionary-tokenized Llama-3.2 datasets: one saved dataset per suffix below.
# NOTE(review): base_path is a machine-specific absolute path — adjust before re-running elsewhere.
base_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/dictionary-llama32-500000samples-tokenized_dict"
push_path = "tomg-group-umd/EIM-dataset-Llama32-Dict"
token_sizes = ["10", "100", "500", "1000"]

# No test_split is passed, so the default (0) applies and each dataset is pushed unsplit.
ds_dict = dataset_push_to_hub(
    base_path=base_path,
    suffix_list=token_sizes,
    push_path=push_path,
)

In [28]:
# Magpie default-tokenized dataset.
# NOTE(review): base_path is a machine-specific absolute path — adjust before re-running elsewhere.
base_path = "/cmlscratch/astein0/efficient_tokenization_for_inference/datasets/magpie-default-tokenized"
push_path = "tomg-group-umd/EIM-dataset-Llama32-magpie-default"

# Only the "0" variant is pushed here; the full list would be
# ["0", "10", "100", "500", "1000"].
token_sizes = ["0"]

# test_split=0.1 asks for a 10% held-out split before pushing.
ds_dict = dataset_push_to_hub(
    base_path=base_path,
    suffix_list=token_sizes,
    push_path=push_path,
    test_split=0.1,
)

Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.04ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.62ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.63ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.74ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.71ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.54ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.72ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.76ba/s]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.69ba/s]
Uploading the dataset shards: 100%|██████████| 9/9 [00:46<00:00,  5.18s/it]
Creating parquet from Arrow format: 100%|██████████| 30/30 [00:03<00:00,  9.73ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.35s/it]


In [18]:
# Display the "train" entry of the "0" variant.
# NOTE(review): this requires ds_dict to be a mapping keyed by suffix, i.e.
# dataset_push_to_hub must return one — confirm on a fresh kernel run.
ds_dict["0"]["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'loss_mask', 'labels', 'task_type', 'num_tokens'],
    num_rows: 270000
})