In [1]:
%env AWS_PROFILE=dev-admin
%env AWS_REGION=us-east-1
%env HF_HOME=~/.cache/huggingface
%env TOKENIZERS_PARALLELISM=fale

env: AWS_PROFILE=dev-admin
env: AWS_REGION=us-east-1
env: HF_HOME=~/.cache/huggingface
env: TOKENIZERS_PARALLELISM=fale


In [2]:
from scripts.aws_init import init_sagemaker

# S3 bucket name handed to init_sagemaker and reused later to build dataset paths.
sagemaker_session_bucket = "sagemaker-ms-thesis-llm"
# IAM role ARN. NOTE(review): not referenced by any other visible cell — presumably consumed by a later training step; confirm.
role = "arn:aws:iam::171706357329:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole"

# Project helper from scripts.aws_init; its return value is kept as the session handle.
sess = init_sagemaker(sagemaker_session_bucket)

sagemaker bucket: sagemaker-ms-thesis-llm
sagemaker session region: us-east-1


In [3]:

from datasets import load_dataset
from random import randrange

# Placeholder binding; the actual tokenizer is assigned in a later cell.
tokenizer = None

# NOTE(review): presumably a maximum sequence length; it is not referenced by
# any other visible cell — confirm it is still needed.
cutoff_len = 512

# Load the training split of the GOAT dataset from the Hub (or the local cache).
dataset = load_dataset("tiedong/goat", split="train")
n_rows = len(dataset)
print(f"dataset size: {n_rows}")
# Preview one randomly chosen row; the RNG is unseeded, so the row differs per run.
preview_index = randrange(n_rows)
print(f"Example Datapoint {dataset[preview_index]}")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset json (/Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


dataset size: 1746300
Example Datapoint {'instruction': '97466792792776/8', 'output': '97466792792776 / 8 = 12183349099097', 'answer': '12183349099097', 'input': '97466792792776 / 8'}


In [None]:
import math

# Desired approximate size of the down-sampled subset.
n_samples = 1000
# Number of shards needed so that a single shard holds at most n_samples rows.
n_shards = math.ceil(len(dataset) / n_samples)
# Keep one shard (index 2) as the sample.
# NOTE(review): the later visible cells split `dataset`, not `dataset_sampled`,
# so this subset is currently unused — confirm which one is intended.
dataset_sampled = dataset.shard(num_shards=n_shards, index=2)
print(f"dataset_sample size: {len(dataset_sampled)}")

In [5]:
# Shuffled 85/15 train/test split with a fixed seed so the split is repeatable.
train_val = dataset.train_test_split(test_size=0.15, shuffle=True, seed=42)

In [6]:
from transformers import AutoTokenizer

# Hub identifier of the model whose tokenizer is loaded below.
model_id = "meta-llama/Llama-2-7b-hf"

# NOTE(review): use_auth_token=True presumably relies on a Hugging Face token already
# cached on this machine (the explicit login() call only appears in a later cell) — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [7]:
def format_goat(sample):
    """Render one dataset row as a single instruction-style training string.

    Args:
        sample: Mapping with at least the keys ``"instruction"`` and ``"output"``.

    Returns:
        The text ``<s>[INST] {instruction} [/INST] {output}</s>``.
    """
    template = "<s>[INST] {} [/INST] {}</s>"
    return template.format(sample["instruction"], sample["output"])
    

In [11]:
from prompter import Prompter

# NOTE(review): `prompter` is not referenced by any other visible cell — confirm it is still needed.
prompter = Prompter()

def generate_and_tokenize_prompt(data_point):
    """Format one dataset row with ``format_goat`` and tokenize the result.

    Args:
        data_point: A single dataset row, passed unchanged to ``format_goat``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the formatted text.
    """
    return tokenizer(format_goat(data_point))

In [9]:
from itertools import chain
from functools import partial

# Carry-over buffer: tokens left after the last full chunk of one batch are
# stored here and prepended to the next batch processed by `chunk`.
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}


def chunk(sample, chunk_length=2048):
    """Pack a batch of token lists into chunks of ``chunk_length`` tokens.

    Every column of ``sample`` (a mapping of column name -> list of lists) is
    flattened, prefixed with the leftovers stored in the module-level
    ``remainder`` and cut into consecutive slices of ``chunk_length`` items.
    Tokens that do not fill a whole chunk are written back to ``remainder``
    for the next call. A batch holding fewer than ``chunk_length`` tokens in
    total is emitted as one shorter chunk and leaves ``remainder`` empty.

    Args:
        sample: Batched columns; must contain ``"input_ids"``.
        chunk_length: Number of tokens per emitted chunk.

    Returns:
        Dict with the chunked columns plus ``"labels"``, a shallow copy of the
        chunked ``"input_ids"`` list.
    """
    global remainder

    # Flatten each column's per-row lists into one long list.
    flattened = {}
    for column in sample.keys():
        flattened[column] = list(chain.from_iterable(sample[column]))

    # Prepend whatever the previous batch left over.
    merged = {}
    for column, tokens in flattened.items():
        merged[column] = remainder[column] + tokens

    # All columns are measured by the first column of the batch.
    total = len(merged[list(sample.keys())[0]])

    # Largest multiple of chunk_length that fits; a short batch is kept whole.
    if total < chunk_length:
        usable = total
    else:
        usable = total - total % chunk_length

    starts = range(0, usable, chunk_length)
    result = {}
    for column, tokens in merged.items():
        result[column] = [tokens[start : start + chunk_length] for start in starts]

    # Stash the tail for the next batch.
    remainder = {column: tokens[usable:] for column, tokens in merged.items()}

    # Labels mirror the inputs.
    result["labels"] = list(result["input_ids"])
    return result

In [12]:
from random import randint


def _tokenize_and_chunk(split, chunk_length=2048):
    """Tokenize every row of ``split`` and pack the tokens into fixed-size chunks.

    ``chunk`` carries leftover tokens between batches in the module-level
    ``remainder`` dict. The buffer is reset here so that leftovers from a
    previously processed split (or from an earlier run of this cell) are never
    prepended to this split's first chunk — otherwise training tokens would
    leak into the validation set.

    Args:
        split: Dataset split to process.
        chunk_length: Number of tokens per packed chunk.

    Returns:
        The split with its original columns replaced by chunked token columns.
    """
    global remainder
    remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}
    return split.map(
        generate_and_tokenize_prompt, remove_columns=list(split.features)
    ).map(
        partial(chunk, chunk_length=chunk_length), batched=True,
    )


# apply prompt template per sample, then pack into 2048-token chunks
train_data = _tokenize_and_chunk(train_val["train"])
val_data = _tokenize_and_chunk(train_val["test"])
# print random sample
print(f"Training Data Point: {train_data[randint(0, len(train_data)-1)]}")
print(f"Tuning Set: {val_data[randint(0, len(val_data)-1)]}")

                                                                        

Training Data Point: {'input_ids': [993, 322, 21162, 310, 29871, 29945, 29906, 29947, 29900, 29929, 29929, 29947, 29955, 29947, 29955, 29947, 29896, 322, 29871, 29896, 29945, 29906, 29955, 29906, 29955, 29889, 518, 29914, 25580, 29962, 29871, 29945, 29906, 29947, 29900, 29929, 29929, 29947, 29955, 29947, 29955, 29947, 29896, 448, 29871, 29896, 29945, 29906, 29955, 29906, 29955, 334, 29871, 29941, 29900, 29900, 29900, 29900, 29900, 29900, 353, 29871, 29945, 29906, 29947, 29900, 29929, 29929, 29947, 29955, 29947, 29955, 29947, 29896, 448, 29871, 29946, 29945, 29947, 29896, 29947, 29896, 29900, 29900, 29900, 29900, 29900, 29900, 353, 29871, 29953, 29929, 29929, 29896, 29947, 29947, 29955, 29947, 29955, 29947, 29896, 13, 29953, 29929, 29929, 29896, 29947, 29947, 29955, 29947, 29955, 29947, 29896, 448, 29871, 29896, 29945, 29906, 29955, 29906, 29955, 334, 29871, 29946, 29900, 29900, 29900, 29900, 29900, 353, 29871, 29953, 29929, 29929, 29896, 29947, 29947, 29955, 29947, 29955, 29947, 29896,



In [13]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: a write-scoped access token used to be hard-coded in this cell.
# Treat that token as compromised and revoke it; never commit credentials to
# a notebook. The token is now read from the HF_TOKEN environment variable,
# falling back to an interactive prompt that does not echo the input.
hub_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hub_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/andrewbeiler/.cache/huggingface/token
Login successful


In [14]:
# Upload the chunked training split to a private dataset repo on the Hub.
# NOTE(review): val_data is not pushed in any visible cell — confirm that is intended.
train_data.push_to_hub("abeiler/GOAT-Dataset", private=True)

Creating parquet from Arrow format: 100%|██████████| 19/19 [00:02<00:00,  6.49ba/s]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:03<00:00,  6.30ba/s]s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:02<00:00,  6.44ba/s]s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:03<00:00,  6.32ba/s]s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:02<00:00,  6.37ba/s]s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 5/5 [00:37<00:00,  7.50s/it]


In [None]:
# Persist the training and validation splits to S3 under a versioned prefix.
ver = "v10"
dataset_name = "goat"
dataset_root = f"s3://{sagemaker_session_bucket}/datasets/{dataset_name}/{ver}"
training_data_path = f"{dataset_root}/training"
training_val_path = f"{dataset_root}/validation"

train_data.save_to_disk(training_data_path)
val_data.save_to_disk(training_val_path)

print(f"Training Dataset Saved to: {training_data_path}")
print(f"Validation Dataset Saved to: {training_val_path}")

                                                                                            

Training Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v10/training
Validation Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v10/validation
