In [1]:
%env AWS_PROFILE=dev-admin
%env AWS_REGION=us-east-1
%env HF_HOME=~/.cache/huggingface
%env TOKENIZERS_PARALLELISM=fale

env: AWS_PROFILE=dev-admin
env: AWS_REGION=us-east-1
env: HF_HOME=~/.cache/huggingface
env: TOKENIZERS_PARALLELISM=fale


In [2]:
from scripts.aws_init import init_sagemaker

sagemaker_session_bucket = "sagemaker-ms-thesis-llm"
role = "arn:aws:iam::171706357329:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole"

sess = init_sagemaker(sagemaker_session_bucket)

sagemaker bucket: sagemaker-ms-thesis-llm
sagemaker session region: us-east-1


In [3]:

from datasets import load_dataset
from random import randrange

tokenizer = None

cutoff_len = 512

dataset = load_dataset("tiedong/goat", split="train")
print(f"dataset size: {len(dataset)}")
print(f"Example Datapoint {dataset[randrange(len(dataset))]}")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset json (/Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


dataset size: 1746300
Example Datapoint {'instruction': 'I am looking for the value of 6494x7517. Can you help?', 'output': '6494 * 7517 = 7517 * (6000 + 400 + 90 + 4) = 7517 * 6000 + 7517 * 400 + 7517 * 90 + 7517 * 4 = 45102000 + 3006800 + 676530 + 30068 = 48108800 + 676530 + 30068 = 48785330 + 30068 = 48815398', 'answer': '48815398', 'input': '6494 * 7517'}


In [4]:
import math

n_samples = 100#0000
n_shards = math.ceil(len(dataset)/n_samples)
dataset_sampled = dataset.shard(n_shards,2)
print(f"dataset_sample size: {len(dataset_sampled)}")

dataset_sample size: 100


In [13]:
train_val = dataset_sampled.train_test_split(
    test_size=15, shuffle=True, seed=42
)

Loading cached split indices for dataset at /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-7816250a6f6164d8.arrow and /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-4b71e97e26adc63f.arrow


In [5]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"  # Allow batched inference

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [23]:
def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            # truncation=True,
            # max_length=cutoff_len,
            # return_tensors=None,
        )
        # print(f"Result: {result}")
        # if (
        #     result["input_ids"][-1] != tokenizer.eos_token_id
        #     and len(result["input_ids"]) < cutoff_len
        #     and add_eos_token
        # ):
        #     result["input_ids"].append(tokenizer.eos_token_id)
        #     result["attention_mask"].append(1)

        # result["labels"] = result["input_ids"].copy()

        return result

In [7]:
from prompter import Prompter

prompter = Prompter()

def generate_and_tokenize_prompt(data_point):
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["output"],
        )
        # print(f"Full Prompt: {full_prompt}")
        tokenized_full_prompt = tokenize(full_prompt)
        # print(f"Tokenized Prompt: {tokenized_full_prompt}")
        return tokenized_full_prompt

In [21]:
from itertools import chain
from functools import partial

# empty list to save remainder from batches to use in next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    batch_chunk_length = 0
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
    else:
        batch_chunk_length = batch_total_length
    
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# # tokenize and chunk dataset
# lm_dataset = dataset_sampled.map(
#     lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset_sampled.features)
# ).map(
#     partial(chunk, chunk_length=2048),
#     batched=True,
# )

# # Print total number of samples
# print(f"Total number of samples: {len(lm_dataset)}")

In [24]:
from random import randint

# apply prompt template per sample
train_data = (
    train_val["train"].map(
        generate_and_tokenize_prompt, remove_columns=list(dataset.features)
    ).map(
        partial(chunk, chunk_length=2048), batched=True,
    )
)
val_data = (
    train_val["test"].map(
        generate_and_tokenize_prompt, remove_columns=list(dataset.features)
    ).map(
        partial(chunk, chunk_length=2048), batched=True,
    )
)
# train_data = dataset_sampled.map(generate_and_tokenize_prompt, remove_columns=list(dataset.features))
# print random sample
print(f"Training Data Point: {train_data[randint(0, len(train_data)-1)]}")
print(f"Tuning Set: {val_data[randint(0, len(val_data)-1)]}")

                                                  

Full Prompt: What is the numerical outcome of 8108864 / 58?
Answer: 8108864 - 58 * 100000 = 8108864 - 5800000 = 2308864
2308864 - 58 * 30000 = 2308864 - 1740000 = 568864
568864 - 58 * 9000 = 568864 - 522000 = 46864
46864 - 58 * 800 = 46864 - 46400 = 464
464 - 58 * 8 = 464 - 464 = 0
Therefore, 8108864 / 58 = 139808
Tokenized Prompt: {'input_ids': [1, 1724, 338, 278, 16259, 21957, 310, 29871, 29947, 29896, 29900, 29947, 29947, 29953, 29946, 847, 29871, 29945, 29947, 29973, 13, 22550, 29901, 29871, 29947, 29896, 29900, 29947, 29947, 29953, 29946, 448, 29871, 29945, 29947, 334, 29871, 29896, 29900, 29900, 29900, 29900, 29900, 353, 29871, 29947, 29896, 29900, 29947, 29947, 29953, 29946, 448, 29871, 29945, 29947, 29900, 29900, 29900, 29900, 29900, 353, 29871, 29906, 29941, 29900, 29947, 29947, 29953, 29946, 13, 29906, 29941, 29900, 29947, 29947, 29953, 29946, 448, 29871, 29945, 29947, 334, 29871, 29941, 29900, 29900, 29900, 29900, 353, 29871, 29906, 29941, 29900, 29947, 29947, 29953, 29946, 

                                                  

9765
2048


                                                  

Full Prompt: Would you mind helping me calculate 2944861 * 68? I'm a bit stuck.
Answer: 2944861 * 68 = 2944861 * (60 + 8) = 2944861 * 60 + 2944861 * 8 = 176691660 + 23558888 = 200250548
Tokenized Prompt: {'input_ids': [1, 10878, 366, 3458, 19912, 592, 8147, 29871, 29906, 29929, 29946, 29946, 29947, 29953, 29896, 334, 29871, 29953, 29947, 29973, 306, 29915, 29885, 263, 2586, 10771, 29889, 13, 22550, 29901, 29871, 29906, 29929, 29946, 29946, 29947, 29953, 29896, 334, 29871, 29953, 29947, 353, 29871, 29906, 29929, 29946, 29946, 29947, 29953, 29896, 334, 313, 29953, 29900, 718, 29871, 29947, 29897, 353, 29871, 29906, 29929, 29946, 29946, 29947, 29953, 29896, 334, 29871, 29953, 29900, 718, 29871, 29906, 29929, 29946, 29946, 29947, 29953, 29896, 334, 29871, 29947, 353, 29871, 29896, 29955, 29953, 29953, 29929, 29896, 29953, 29953, 29900, 718, 29871, 29906, 29941, 29945, 29945, 29947, 29947, 29947, 29947, 353, 29871, 29906, 29900, 29900, 29906, 29945, 29900, 29945, 29946, 29947], 'attention_m

                                                  

3554
2048
Training Data Point: {'input_ids': [29929, 29955, 29945, 29947, 29941, 29947, 29947, 448, 29871, 29946, 29896, 29929, 29900, 29953, 29953, 29900, 29945, 29941, 29947, 29906, 29955, 29900, 29947, 29953, 13, 22550, 29901, 29871, 29896, 29929, 29906, 29946, 29953, 29896, 29900, 29906, 29929, 29955, 29945, 29947, 29941, 29947, 29947, 448, 29871, 29946, 29896, 29929, 29900, 29953, 29953, 29900, 29945, 29941, 29947, 29906, 29955, 29900, 29947, 29953, 353, 448, 29906, 29906, 29953, 29953, 29900, 29945, 29900, 29906, 29946, 29900, 29953, 29947, 29953, 29929, 29947, 1, 29871, 29946, 29953, 29900, 29953, 29906, 29947, 29955, 29896, 29896, 29900, 29929, 29914, 29946, 29953, 29955, 29900, 29929, 29953, 338, 13, 22550, 29901, 29871, 29946, 29953, 29900, 29953, 29906, 29947, 29955, 29896, 29896, 29900, 29929, 448, 29871, 29946, 29953, 29955, 29900, 29929, 29953, 334, 29871, 29929, 29900, 29900, 29900, 29900, 353, 29871, 29946, 29953, 29900, 29953, 29906, 29947, 29955, 29896, 29896, 29900, 



In [25]:
# save train_dataset to s3
ver = "v8"
dataset_name = "goat"
training_data_path = f's3://{sagemaker_session_bucket}/datasets/{dataset_name}/{ver}/training'
training_val_path = f's3://{sagemaker_session_bucket}/datasets/{dataset_name}/{ver}/validation'
train_data.save_to_disk(training_data_path)
val_data.save_to_disk(training_val_path)

print(f"Training Dataset Saved to: {training_data_path}")
print(f"Validation Dataset Saved to: {training_val_path}")

                                                                                     

Training Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v8/training
Validation Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v8/validation
