In [1]:
%env AWS_PROFILE=dev-admin
%env AWS_REGION=us-east-1
%env HF_HOME=~/.cache/huggingface
%env TOKENIZERS_PARALLELISM=fale

env: AWS_PROFILE=dev-admin
env: AWS_REGION=us-east-1
env: HF_HOME=~/.cache/huggingface
env: TOKENIZERS_PARALLELISM=fale


In [2]:
from scripts.aws_init import init_sagemaker

sagemaker_session_bucket = "sagemaker-ms-thesis-llm"
role = "arn:aws:iam::171706357329:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole"

sess = init_sagemaker(sagemaker_session_bucket)

sagemaker bucket: sagemaker-ms-thesis-llm
sagemaker session region: us-east-1


In [3]:

from datasets import load_dataset
from random import randrange

tokenizer = None

cutoff_len = 512

dataset = load_dataset("tiedong/goat", split="train")
print(f"dataset size: {len(dataset)}")
print(f"Example Datapoint {dataset[randrange(len(dataset))]}")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset json (/Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


dataset size: 1746300
Example Datapoint {'instruction': '1449119*27', 'output': '1449119 * 27 = 1449119 * (20 + 7) = 1449119 * 20 + 1449119 * 7 = 28982380 + 10143833 = 39126213', 'answer': '39126213', 'input': '1449119 * 27'}


In [14]:
import math

n_samples = 10#0000
n_shards = math.ceil(len(dataset)/n_samples)
dataset_sampled = dataset.shard(n_shards,2)
print(f"dataset_sample size: {len(dataset_sampled)}")

dataset_sample size: 10


In [15]:
train_val = dataset_sampled.train_test_split(
    test_size=0.15, shuffle=True, seed=42
)

In [6]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"  # Allow batched inference

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [7]:
def tokenize(prompt, add_eos_token=True):
        result = tokenizer(
            prompt,
            # truncation=True,
            # max_length=cutoff_len,
            # return_tensors=None,
        )
        # print(f"Result: {result}")
        # if (
        #     result["input_ids"][-1] != tokenizer.eos_token_id
        #     and len(result["input_ids"]) < cutoff_len
        #     and add_eos_token
        # ):
        #     result["input_ids"].append(tokenizer.eos_token_id)
        #     result["attention_mask"].append(1)

        # result["labels"] = result["input_ids"].copy()

        return result

In [8]:
def format_goat(sample):
    # instruction = f"### Instruction\n{sample['instruction']}"
    # # context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
    # response = f"### Answer\n{sample['output']}"
    # # join all the parts together
    # # prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
    # prompt = "\n\n".join([i for i in [instruction, response] if i is not None])
    # return prompt
    return f"<s>[INST] {sample['instruction']} [/INST] {sample['output']}</s>"
    

In [12]:
from prompter import Prompter

prompter = Prompter()

def generate_and_tokenize_prompt(data_point):
        # full_prompt = prompter.generate_prompt(
        #     data_point["instruction"],
        #     data_point["output"],
        # )
        full_prompt = format_goat(data_point)
        print(f"Full Prompt: {full_prompt}")
        tokenized_full_prompt = tokenize(full_prompt)
        # print(f"Tokenized Prompt: {tokenized_full_prompt}")
        return tokenized_full_prompt

In [10]:
from itertools import chain
from functools import partial

# empty list to save remainder from batches to use in next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

def chunk(sample, chunk_length=2048):
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    batch_chunk_length = 0
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
    else:
        batch_chunk_length = batch_total_length
    
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# # tokenize and chunk dataset
# lm_dataset = dataset_sampled.map(
#     lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset_sampled.features)
# ).map(
#     partial(chunk, chunk_length=2048),
#     batched=True,
# )

# # Print total number of samples
# print(f"Total number of samples: {len(lm_dataset)}")

In [16]:
from random import randint

# apply prompt template per sample
train_data = (
    train_val["train"].map(
        generate_and_tokenize_prompt, remove_columns=list(dataset.features)
    ).map(
        partial(chunk, chunk_length=2048), batched=True,
    )
)
val_data = (
    train_val["test"].map(
        generate_and_tokenize_prompt, remove_columns=list(dataset.features)
    ).map(
        partial(chunk, chunk_length=2048), batched=True,
    )
)
# train_data = dataset_sampled.map(generate_and_tokenize_prompt, remove_columns=list(dataset.features))
# print random sample
print(f"Training Data Point: {train_data[randint(0, len(train_data)-1)]}")
print(f"Tuning Set: {val_data[randint(0, len(val_data)-1)]}")

Loading cached processed dataset at /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-434482acd5484a9b.arrow
Loading cached processed dataset at /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-f16f2df15bfe1847.arrow
Loading cached processed dataset at /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-ba0473180637368d.arrow
Loading cached processed dataset at /Users/andrewbeiler/.cache/huggingface/datasets/tiedong___json/tiedong--goat-55b7467c033a1462/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-5fcb74cd59e3d8f1.arrow


Training Data Point: {'input_ids': [1, 1, 518, 25580, 29962, 20535, 403, 278, 1234, 304, 29871, 29953, 29953, 29906, 29941, 29896, 29953, 29896, 29941, 29906, 29953, 29955, 29955, 29896, 29955, 29974, 29953, 29900, 29947, 29947, 29906, 29946, 29900, 29929, 29947, 29900, 29929, 29889, 518, 29914, 25580, 29962, 29871, 29953, 29953, 29906, 29941, 29896, 29953, 29896, 29941, 29906, 29953, 29955, 29955, 29896, 29955, 718, 29871, 29953, 29900, 29947, 29947, 29906, 29946, 29900, 29929, 29947, 29900, 29929, 353, 29871, 29953, 29953, 29906, 29929, 29906, 29946, 29929, 29945, 29953, 29955, 29955, 29945, 29906, 29953, 2, 1, 1, 518, 25580, 29962, 29871, 29941, 29953, 29896, 29900, 29941, 29946, 29906, 29929, 29947, 29929, 29945, 29953, 29945, 29900, 29946, 29914, 29946, 518, 29914, 25580, 29962, 29871, 29941, 29953, 29896, 29900, 29941, 29946, 29906, 29929, 29947, 29929, 29945, 29953, 29945, 29900, 29946, 847, 29871, 29946, 353, 29871, 29929, 29900, 29906, 29945, 29947, 29945, 29955, 29946, 29955,

In [17]:
# save train_dataset to s3
ver = "v9"
dataset_name = "goat"
training_data_path = f's3://{sagemaker_session_bucket}/datasets/{dataset_name}/{ver}/training'
training_val_path = f's3://{sagemaker_session_bucket}/datasets/{dataset_name}/{ver}/validation'
train_data.save_to_disk(training_data_path)
val_data.save_to_disk(training_val_path)

print(f"Training Dataset Saved to: {training_data_path}")
print(f"Validation Dataset Saved to: {training_val_path}")

                                                                                     

Training Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v9/training
Validation Dataset Saved to: s3://sagemaker-ms-thesis-llm/datasets/goat/v9/validation


In [37]:
import telegram
import asyncio

api_key = '***REMOVED***'
usr_id = '***REMOVED***'

bot = telegram.Bot(token=api_key)
async with bot:
        await bot.send_message(chat_id=usr_id, text='Hey!')