In [None]:
from dotenv import load_dotenv
import os
# Load variables from a .env file (python-dotenv) into the process environment,
# then read the Hugging Face token; the value is None when the variable is unset.
load_dotenv()
HF_TOKEN_read = os.environ.get("HF_TOKEN_read")

In [None]:
from huggingface_hub import login
# Authenticate this session against the Hugging Face Hub with the token read above.
login(token=HF_TOKEN_read)

In [None]:
import datasets

# Read the preprocessed pre-training corpus from a local parquet file.
# split="train" requests a single split instead of a dict of splits.
dataset = datasets.load_dataset(
    path="parquet",
    data_files="./Pre-Training-Dataset/Preprocessed_PT_Dataset.parquet",
    split="train",
)
print(dataset)

In [None]:
from transformers import AutoTokenizer
# Checkpoint whose tokenizer is used to encode the corpus.
model_path_or_name = "meta-llama/Llama-3.1-8B"

# NOTE(review): use_fast=False requests the slow (Python) tokenizer
# implementation — confirm this checkpoint actually provides one.
tokenizer = AutoTokenizer.from_pretrained(model_path_or_name, use_fast=False)

In [3]:
def tokenization(example):
    """Tokenize one record and attach its token ids and token count.

    Relies on the notebook-level ``tokenizer``. The record's text is split
    into tokens, mapped to vocabulary ids, and wrapped with the tokenizer's
    begin-of-sequence and end-of-sequence ids.

    Args:
        example: Mapping with a ``"text"`` entry. It is modified in place.

    Returns:
        The same ``example`` object with two added entries:
        ``"input_ids"`` (BOS id + token ids + EOS id) and
        ``"num_tokens"`` (the length of that id list).
    """
    pieces = tokenizer.tokenize(example["text"])
    ids = tokenizer.convert_tokens_to_ids(pieces)

    # Mark where the sequence begins and ends with the special-token ids.
    wrapped_ids = [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
    example["input_ids"] = wrapped_ids

    # Stored as its own column so the total token count can be summed later.
    example["num_tokens"] = len(wrapped_ids)
    return example

In [None]:
# Run `tokenization` over every record. load_from_cache_file=False recomputes
# the result instead of reusing a previously cached one.
dataset = dataset.map(function=tokenization, load_from_cache_file=False)
print(dataset)

In [None]:
# Inspect one tokenized record to sanity-check the columns added by `tokenization`.
sample = dataset[3]

print("text", sample["text"][:30])  # first 30 characters of the raw text
print("\ninput_ids", sample["input_ids"][:30])  # first 30 token ids (starts with the BOS id)
print("\nnum_tokens", sample["num_tokens"])  # full length of input_ids, BOS/EOS included

In [None]:
import numpy as np
# Total token count of the corpus: sum of the per-record lengths, each of
# which includes the BOS and EOS ids added during tokenization.
np.sum(dataset["num_tokens"])

## 2. Packing the data

In [None]:
# Join every record's token ids end to end into one flat array so the corpus
# can be cut into fixed-length sequences.
input_ids = np.concatenate(dataset["input_ids"])
print(input_ids.shape[0])

In [8]:
# Number of token ids in each packed sequence (the row width used when the
# flat token stream is reshaped).
max_seq_length = 4096

In [None]:
# Largest multiple of max_seq_length that fits in the token stream; whatever
# is left over at the end cannot fill a complete sequence.
total_length = (len(input_ids) // max_seq_length) * max_seq_length
print(total_length)

In [None]:
# Drop the trailing token ids that do not fill a complete sequence.
input_ids = input_ids[:total_length]
print(input_ids.shape)

In [None]:
# Cast the ids to int32, then fold the flat stream into rows of
# max_seq_length ids each (one row per packed sequence).
input_ids_reshaped = input_ids.astype(np.int32).reshape(-1, max_seq_length)
input_ids_reshaped.shape

In [None]:
# Show the container type of the packed data before it is converted to lists.
type(input_ids_reshaped)

In [None]:
# Convert the 2-D array into nested Python lists (one inner list per packed
# sequence) and wrap them in a Dataset with a single "input_ids" column.
input_ids_list = input_ids_reshaped.tolist()
packaged_pretrain_dataset = datasets.Dataset.from_dict(
    {"input_ids": input_ids_list}
)
print(packaged_pretrain_dataset)

In [None]:
# Folder the packed dataset is written to.
directory = "./Pre-Training-Dataset"

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so no separate exists() check (which could race with
# another process creating the folder) is needed.
os.makedirs(directory, exist_ok=True)

# Persist the packed sequences as a parquet file.
file_path = os.path.join(directory, "packaged_pretrain_Dataset.parquet")
packaged_pretrain_dataset.to_parquet(file_path)