In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from datasets import load_dataset

# Load the first 1% of the English Wikipedia snapshot from the Hugging Face Hub.
# The legacy `ignore_verifications` and `use_auth_token` arguments are intentionally
# not passed: they were deprecated in favor of `verification_mode` and `token`,
# passing `ignore_verifications` alongside `verification_mode` makes the two
# settings compete, and newer `datasets` releases no longer accept the old names.
wikids = load_dataset(
    path="wikimedia/wikipedia",     # dataset namespace and name
    name="20231101.en",             # snapshot date + language code
    split="train[0:1%]",            # subset of the split
    cache_dir=None,                 # or set to a folder path if you want custom cache
    keep_in_memory=False,           # False: do not copy the dataset into RAM
    download_mode=None,             # set to "reuse_dataset_if_exists" or "force_redownload"
    verification_mode=None,         # can be "no_checks", "basic_checks", or "all_checks"
)



Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [12]:
# Display the dataset summary (column names and number of rows).
wikids

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 64078
})

In [13]:
# Integer indexing returns a single row as a dict keyed by column name.
wikids[0]

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'text': 'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).\n\nHumans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. Dur

In [14]:
# Column-first access: select the 'text' column, then its first entry.
# NOTE(review): depending on the installed `datasets` version, selecting the
# column may materialize every document before indexing — confirm; the
# row-first form `wikids[0]['text']` reads only one row.
wikids['text'][0][:500]  # first 500 characters of the first document

'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as'

In [15]:
# Row-first access: fetch row 0 as a dict, then take the first 500 characters
# of its 'text' field.
wikids[0]['text'][:500]

'Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as'

In [16]:
# Hold out 10% of the rows as a test split. The fixed seed makes the shuffle,
# and therefore the membership of each split, reproducible across kernel restarts.
# NOTE: this rebinds `wikids` from a Dataset to a DatasetDict, so run the cell
# once per kernel session; a second run would invoke train_test_split on the
# DatasetDict rather than on the original Dataset.
wikids = wikids.train_test_split(test_size=0.1, seed=42)
wikids

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 57670
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6408
    })
})

In [18]:
from transformers import AutoTokenizer

# Hugging Face Hub repository id of the checkpoint whose tokenizer is used below.
model_id = "dphn/Dolphin-Mistral-24B-Venice-Edition"    # https://huggingface.co/dphn/Dolphin-Mistral-24B-Venice-Edition

In [20]:
# Fetch the tokenizer published with the `model_id` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [27]:
# Inspect the tokenizer's special tokens; the output shows that the pad token
# and the EOS token are the same string ('</s>').
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '</s>',
 'additional_special_tokens': ['<unk>',
  '<s>',
  '</s>',
  '[INST]',
  '[/INST]',
  '[AVAILABLE_TOOLS]',
  '[/AVAILABLE_TOOLS]',
  '[TOOL_RESULTS]',
  '[/TOOL_RESULTS]',
  '[TOOL_CALLS]',
  '[IMG]',
  '<pad>',
  '[IMG_BREAK]',
  '[IMG_END]',
  '[PREFIX]',
  '[MIDDLE]',
  '[SUFFIX]',
  '[SYSTEM_PROMPT]',
  '[/SYSTEM_PROMPT]',
  '[TOOL_CONTENT]',
  '<SPECIAL_20>',
  '<SPECIAL_21>',
  '<SPECIAL_22>',
  '<SPECIAL_23>',
  '<SPECIAL_24>',
  '<SPECIAL_25>',
  '<SPECIAL_26>',
  '<SPECIAL_27>',
  '<SPECIAL_28>',
  '<SPECIAL_29>',
  '<SPECIAL_30>',
  '<SPECIAL_31>',
  '<SPECIAL_32>',
  '<SPECIAL_33>',
  '<SPECIAL_34>',
  '<SPECIAL_35>',
  '<SPECIAL_36>',
  '<SPECIAL_37>',
  '<SPECIAL_38>',
  '<SPECIAL_39>',
  '<SPECIAL_40>',
  '<SPECIAL_41>',
  '<SPECIAL_42>',
  '<SPECIAL_43>',
  '<SPECIAL_44>',
  '<SPECIAL_45>',
  '<SPECIAL_46>',
  '<SPECIAL_47>',
  '<SPECIAL_48>',
  '<SPECIAL_49>',
  '<SPECIAL_50>',
  '<S

In [30]:
# String form of the token used to fill padded positions.
tokenizer.pad_token

'</s>'

In [40]:
# Tokenize one training document, left-padded to 10,000 tokens, to inspect what
# the tokenizer returns. Row-first indexing (`[0]['text']`) reads a single row
# instead of selecting the entire 'text' column just to take its first entry.
out0 = tokenizer(
    wikids['train'][0]['text'],
    max_length=10_000,
    padding='max_length',
    padding_side='left',
)
out0.keys()

dict_keys(['input_ids', 'attention_mask'])

In [47]:
# Last 10 and first 10 token ids: with left padding the real tokens sit at the
# end of the sequence and the repeated pad id fills the beginning.
out0['input_ids'][-10:], out0['input_ids'][:10]

([5150, 98682, 3742, 1010, 1050, 1048, 1411, 28570, 5150, 3306],
 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [45]:
# Matching attention mask slices: 1 over the real tokens at the end,
# 0 over the left-side padding at the start.
out0['attention_mask'][-10:], out0['attention_mask'][:10]

([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [49]:
# Integer id of the pad token, i.e. the value that fills padded positions
# in `input_ids`.
tokenizer.pad_token_id

2

In [53]:
def tokenize_function(sample, max_length=None):
    """Tokenize the ``'text'`` field of a batch for ``Dataset.map(batched=True)``.

    Args:
        sample: Batch mapping whose ``'text'`` entry holds the strings to encode.
        max_length: Length every sequence is padded and truncated to. ``None``
            (the default) leaves the choice to the tokenizer's configured
            maximum length.

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask``) as plain Python lists.
    """
    # `return_tensors` is deliberately not set: `map` writes results to Arrow,
    # so building torch tensors here would only add a conversion round-trip
    # and would fail outright on a batch with ragged lengths.
    return tokenizer(
        sample['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )


# Tokenize every split in batches, dropping the raw columns so only the
# tokenizer outputs remain.
tokenized = wikids.map(
    tokenize_function,
    batched=True,
    remove_columns=wikids['train'].column_names,
)

Map:   0%|          | 0/57670 [00:00<?, ? examples/s]

Map:   0%|          | 0/6408 [00:00<?, ? examples/s]

In [54]:
# Summary of the tokenized splits; only the tokenizer output columns remain.
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 57670
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6408
    })
})