In [None]:
# python -m vllm.entrypoints.openai.api_server --host 127.0.0.1 --port 8881 --model amew0/Meta-Llama-3-8B-Instruct-v240714045919 --dtype float16 --download-dir /dpc/kunf0097/l3-8b/model & ../ngrok http 8881

In [None]:
qnas = " \n\n### 1\nQ: What is the patient's liver condition based on the provided lab results?\nM: cirrhosis, chronic hepatitis, liver damage, liver failure\nA: chronic hepatitis\n\n### 2\nQ: What is the patient's serum albumin level?\nM: 3.4, 7.54, 11.3, 15.6\nA: 3.4\n\n### 3\nQ: Which of the following is a recommendation for the patient's treatment?\nM: avoid fatty diet, take beta blocker, take more sugar cane juice, consult gastroenterologist\nA: consult gastroenterologist\n\n### 4\nQ: What is the patient's bilirubin level?\nM: 2.38, 4.88, 17.16, 25.8\nA: 17.16\n\n### 5\nQ: What is the patient's creatinine level?\nM: 2.5, 4.88, 7.5, 10.5\nA: 4.88\n"

In [None]:
start_index = qnas.find("### 1")

qna_section = qnas[start_index:].strip()

# Skip the empty string before the first ###
qna_blocks = qna_section.split("### ")[1:]


qnas_formatted = []
for block in qna_blocks:
    lines = block.strip()
    q = lines[lines.find("Q: "):lines.find("A: ")].strip()
    a = lines[lines.find("A: "):].strip()
    
    qnas_formatted.append({"Q": q, "A":a})

In [None]:
qnas_formatted

In [None]:
import torch
torch.cuda.is_available()

In [None]:
import os
import torch
import transformers
import huggingface_hub
import wandb
from scipy.stats import pearsonr
from datetime import datetime
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import gc
import json
import yaml
import argparse
import re
from tqdm import tqdm
import fire
import inspect


logg = lambda x: print(f"------------------------ {x} ---------------------------")


def inspectt(frame):
    logg("")
    args, _, _, values = inspect.getargvalues(frame)
    for arg in args:
        print(f"\t{arg}: {values[arg]}")
    logg("")

def get_prompts_from_template(filepath, name, eval_name):
    default_config = {
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    with open(filepath, "r") as f:
        data = yaml.safe_load(f)

    candidate_prompt = data[name]["candidate_prompt"]
    evaluator_prompt = data[eval_name]["evaluator_prompt"]
    candidate_generation_config = data[name].get("candidate_generation_config", default_config)
    evaluator_generation_config = data[eval_name].get(
        "evaluator_generation_config", default_config
    )

    print("candidate_prompt: ", candidate_prompt)
    print("evaluator_prompt: ", evaluator_prompt)
    print("candidate_generation_config: ", candidate_generation_config)
    print("evaluator_generation_config: ", evaluator_generation_config)

    return (
        candidate_prompt,
        evaluator_prompt,
        candidate_generation_config,
        evaluator_generation_config,
    )


def get_tokenizer_and_model(model_name: str, cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/tokenizer",
        pad_token_id=0,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/model",
        torch_dtype=torch.float16,
        device_map="auto",
        offload_buffers=True,
    )
    return tokenizer, model


def tokenize(prompt, tokenizer):
    tokenized = tokenizer(prompt, return_tensors="pt")
    return tokenized


def generate_and_tokenize_prompt(batch, tokenizer, prompt_template):
    # print(batch)
    prompts = [prompt_template.format(d[0], d[1]) for d in zip(batch["instruction"], batch["input"])]
    print("a", prompts)
    tokenized_prompts = tokenizer(prompts, padding=True, return_tensors="pt")
    print("b", tokenized_prompts)
    return tokenized_prompts


def eval_prompt_tokenizer(generated, output, eval_tokenizer, prompt=None):
    prompt = prompt.format(generated, output)
    tokenized_full_prompt = tokenize(prompt, tokenizer=eval_tokenizer)
    return tokenized_full_prompt


def extract_score(text):
    match = re.search(r"\b\d+\.\d+\b", text)
    return float(match.group(0)) if match else -1.0


def log2json(results, json_result):
    with open(json_result, "w") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

def generate_response(model, tokenizer, input_ids, attention_mask, generation_config):
    # torch.LongTensor(input_ids).to(model.device)
    # torch.LongTensor(attention_mask).to(model.device)
    # try:
        output = model.generate(
            input_ids=torch.stack(input_ids).to(model.device),
            attention_mask=torch.stack(attention_mask).to(model.device),
            eos_token_id=tokenizer.eos_token_id,
            **generation_config,
        )
        print(output[0])
        response_ids = output[0][len(input_ids[0]) :]
        response = tokenizer.decode(response_ids, skip_special_tokens=True)
        return response, output
    # except RuntimeError as e:
    #     if "inf" in str(e) or "nan" in str(e):
    #         print(f"Skipping example due to invalid output: {e}")
    #         return None
    #     else:
    #         raise 



In [None]:
output_dir=f"./out"
cache_dir=f"/dpc/kunf0097/l3-8b"
eval_data_path="./data/1/eval_sample.json"
log_file=None
name="meta-llama/Meta-Llama-3-8B-Instruct"
eval_name="meta-llama/Meta-Llama-3-8B-Instruct"
run_id=datetime.now().strftime("%y%m%d%H%M%S")
log2wandb: bool = True
project="huggingface"
entity="my-ku-org"
evals_per_example=2
batch_size=2

In [None]:
candidate_name="meta-llama/Meta-Llama-3-8B-Instruct"
evaluator_name="meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
(
    candidate_prompt,
    evaluator_prompt,
    candidate_generation_config,
    evaluator_generation_config,
) = get_prompts_from_template("template.yaml", candidate_name, evaluator_name)


if log2wandb and (project is None or entity is None):
    raise ValueError("Both 'project' and 'entity' must be set if 'log2wandb' is True.")

if log_file is None:
    log_file = f"{output_dir}/results_{name.split('/')[1]}_{run_id}.json"

inspectt(inspect.currentframe())

In [None]:
# evaluator_tokenizer, evaluator_model = get_tokenizer_and_model(
#     model_name=eval_name, cache_dir=cache_dir
# )

candidate_tokenizer, candidate_model = get_tokenizer_and_model(
    model_name=name, cache_dir=cache_dir
)

# candidate_tokenizer = AutoTokenizer.from_pretrained(
#         name,
#         cache_dir=f"{cache_dir}/tokenizer",
#         pad_token_id=0,
#     )



In [None]:
candidate_tokenizer.pad_token = candidate_tokenizer.bos_token 
candidate_tokenizer.padding_side = "left"

In [None]:
data = load_dataset("json", data_files=eval_data_path)
eval_dataset = data["train"].map(
    lambda x: generate_and_tokenize_prompt(x, candidate_tokenizer, candidate_prompt),
    batched=True,  # Process in batches
    batch_size=batch_size
)

In [None]:
batched_eval_dataset  =torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)

In [None]:
for batch in batched_eval_dataset :
    print(batch)
    break

In [None]:
torch.stack(batch["input_ids"])[:,0]

In [None]:
torch.stack(batch["attention_mask"])[:,1]

In [None]:
print(candidate_tokenizer.decode(torch.stack(batch["input_ids"])[:,1]))

In [None]:
response, output = generate_response(
    candidate_model,
    candidate_tokenizer,
    batch["input_ids"],
    batch["attention_mask"],
    candidate_generation_config,
)

In [None]:
output[0].shape

In [None]:
print(candidate_tokenizer.decode(output[0]))

### enshiallah

In [None]:
import json
with open("out/results_240623023136_240628153415.json", "r") as f:
    results = json.load(f)

In [None]:
import wandb
table = wandb.Table(columns=list(results[0].keys()))
for r in results:
    table.add_data(*r.values())
run =wandb.init(project="huggingface", entity="my-ku-org", name="laaj-llama-3-8b-medical-v240623023136")
wandb.log({"Evaluation Results": table})
wandb.finish()

### Evals **MMLU**

In [None]:
from datasets import load_dataset

In [None]:
data =load_dataset("cais/mmlu", "clinical_knowledge")

In [None]:
eval_dataset = data["dev"]

In [None]:
eval_dataset[0]

### FT

In [1]:
# %%
import socket
print(socket.gethostname())

# %%
import gc
import inspect
import os
import logging
from datetime import datetime
from time import time

import fire
import huggingface_hub
import torch
import transformers
import wandb
from datasets import load_dataset
from dotenv import load_dotenv
import yaml
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer, SFTConfig

from peft import LoraConfig, PeftModel
from utils.eval_helper import inspectt, logg
from utils.ft_helper import (
    generate_and_tokenize_prompt,
    get_start_index,
    reorder_dataset,
)
from torch.utils.data import SequentialSampler

from datasets.utils.logging import disable_progress_bar
disable_progress_bar()
wandb.require("core")

gpu-11-2


In [2]:
cache_dir: str = f"/dpc/kunf0097/l3-8b"
train_data_path: str = "./data/medical-36-row.json"
model_name: str = "EleutherAI/pythia-70m-deduped"
model_save_path: str = None
run_id: str = datetime.now().strftime("%y%m%d%H%M%S")
chpt_dir: str = None
last_checkpoint: str = None
start_index: int = 0
cutoff_len: int = 298
per_device_train_batch_size: int = 1
gradient_accumulation_steps: int = 1
world_size: int = None
local_rank: int = None

load_dotenv()
logger = logging.getLogger(__name__)

HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")
if HF_TOKEN_WRITE is not None:
    huggingface_hub.login(token=HF_TOKEN_WRITE)

if model_save_path is None:
    model_save_path = f"{cache_dir}/model/{model_name}-v{run_id}"

if chpt_dir is None:
    chpt_dir = f"{cache_dir}/chpt/{run_id}"

if os.path.isdir(chpt_dir):
    checkpoints = [d for d in os.listdir(chpt_dir) if d.startswith("checkpoint-")]
    if checkpoints:
        last_checkpoint = os.path.join(
            chpt_dir, max(checkpoints, key=lambda cp: int(cp.split("-")[-1]))
        )

# if train_data_path locally exists use it
if os.path.exists(train_data_path):
    data = load_dataset("json", data_files=train_data_path, split="train")
else:
    data = load_dataset(train_data_path, split="train")

if last_checkpoint is not None:
    start_index = get_start_index(last_checkpoint, len(data))

device_map = {"": 0}
world_size = int(os.environ.get("WORLD_SIZE", 1))
local_rank = int(os.environ.get("LOCAL_RANK", 0))

ddp = world_size != 1
if ddp:
    device_map = {"": local_rank}
    gradient_accumulation_steps = gradient_accumulation_steps // world_size

inspectt(inspect.currentframe())

with open(f"tuning.yaml", "r") as f:
    ft_config = yaml.safe_load(f)[model_name]
    assert "training_args" in ft_config, "Training arguments are not defined in tuning.yaml"
    assert "peft_args" in ft_config, "Peft arguments are not defined in tuning.yaml"
    print(ft_config)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/kunet.ae/ku5001069/.cache/huggingface/token
Login successful
------------------------  ---------------------------
------------------------  ---------------------------
{'prompt': '### System:\n{}<|endoftext|>\n### User:\n{}<|endoftext|>\n### Assistant:\n', 'response': '{}<|endoftext|>\n', 'peft_args': {'r': 4, 'lora_alpha': 16, 'target_modules': ['dense', 'dense_4h_to_h', 'dense_h_to_4h', 'query_key_value'], 'lora_dropout': 0.05, 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'training_args': {'warmup_ratio': 0.1, 'num_train_epochs': 3, 'learning_rate': 0.0003, 'fp16': False, 'logging_steps': 1, 'optim': 'adamw_torch', 'group_by_length': False, 'dataloader_drop_last': False, 'save_steps': 2, 'sav

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=f"{cache_dir}/tokenizer")

# Initialize model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,


    # 8bit

)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=f"{cache_dir}/model",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map=device_map,
    low_cpu_mem_usage=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
if tokenizer.pad_token is None:
    print("Tokenizer has no pad token. Adding it.")
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

Tokenizer has no pad token. Adding it.


In [5]:
if start_index != 0:
    data = reorder_dataset(data, start_index)
train_dataset = data.map(lambda x: generate_and_tokenize_prompt(x, tokenizer, ft_config, cutoff_len))

In [None]:
import torch
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []
    
    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing 
            print(name)

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])
    
    return layer_names

list(set(get_specific_layer_names(model)))

In [6]:
# Prepare LoRA
peft_args = ft_config["peft_args"]
peft_config = LoraConfig(**peft_args)


training_args = ft_config["training_args"]
train_config = SFTConfig(
    run_name=f"ft-{model_name.split('/')[1]}-{run_id}-v{start_index}",
    resume_from_checkpoint=last_checkpoint,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    output_dir=f"{chpt_dir}",
    max_seq_length=cutoff_len, # not sure its purpose since its setup on the tokenizer
    eval_accumulation_steps=1,  # !very important to send data to cpu
    **training_args
)

class SFTTrainerNoShuffle(SFTTrainer):
    def training_step(self, model, inputs):
        if (self.state.global_step % self.args.save_steps) == 0:
            inputs_decoded = tokenizer.decode(inputs["input_ids"][0])
            logger.critical(f"{self.state.global_step}: {inputs_decoded}")
        return super().training_step(model, inputs)

    def _get_train_sampler(self):
        return SequentialSampler(self.train_dataset)  # to prevent shuffling

    # [Not saving to trainer_state]
    def save_state(self):
        self.state.gradient_accumulation_steps = self.args.gradient_accumulation_steps
        super().save_state()

In [7]:
trainer = SFTTrainerNoShuffle(
    model=model,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, return_tensors="pt", padding=True
    ),
    peft_config=peft_config,
    train_dataset=train_dataset,
    args=train_config,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Train model
gc.collect()
gc.collect()

start = time()
if last_checkpoint is not None:
    logg("Resuming from checkpoint")
    trainer.train(last_checkpoint)
else:
    trainer.train()
end = time()
logg(f"Elapsed time: {end - start}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mamew0[0m ([33mmy-ku-org[0m). Use [1m`wandb login --relogin`[0m to force relogin


0: ### System:
If you are a doctor, please answer the medical questions based on the patient's description.<|endoftext|>
### User:
I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!<|endoftext|>
### Assistant:
Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV), a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with moveme

Step,Training Loss
1,15.1889
2,15.6644
3,24.4607
4,12.9248
5,9.8646
6,25.2304
7,14.4983
8,14.4873
9,9.7497
10,14.8846


2: ### System:
If you are a doctor, please answer the medical questions based on the patient's description.<|endoftext|>
### User:
hi my nine year old son had a cough and flu symptons three months ago and the chesty sounding cough and green phlegm still remains. it did seem to get better but never totally went and has now picked up again...he has only ever had antibiotics once in his live which suggests he his generally fitand well and active...never short of breath or weezy...so why would this be....<|endoftext|>
### Assistant:
Hi, If the symptoms persist that long this suggests the presence of an allergic element. To treat that kind of allergy you can give him an over the counter antihistamines once daily before going to bed. A cough suppressant as dextromethorphan and an expectorant as Murine will help reduce the cough. Make sure he Chat Doctor.  If the green phlegm persists he might require an antibiotic. Hope I have answered your query. Let me know if I can assist you further.<|en

------------------------ Elapsed time: 45.768951177597046 ---------------------------


### Eval / Chpt

In [None]:
import gc
import inspect
import os
import logging
from datetime import datetime
from time import time

import fire
import huggingface_hub
import torch
import transformers
import wandb
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer, SFTConfig

from peft import LoraConfig, PeftModel
from utils.eval_helper import inspectt, logg
from utils.ft_helper import (
    generate_and_tokenize_prompt,
    get_start_index,
    reorder_dataset,
)
from torch.utils.data import SequentialSampler

In [None]:
cache_dir: str = f"/dpc/kunf0097/l3-8b"
train_data_path: str = "meher146/medical_llama3_instruct_dataset"
model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct"
model_save_path: str = None
run_id: str = "240724111548"
chpt_dir: str = None
last_checkpoint: str = None
start_index: int = 0
per_device_train_batch_size: int = 4
gradient_accumulation_steps: int = 4
world_size: int = None
local_rank: int = None

if model_save_path is None:
    model_save_path = f"{cache_dir}/model/{model_name}-v{run_id}"

if chpt_dir is None:
    chpt_dir = f"{cache_dir}/chpt/{run_id}"

if os.path.isdir(chpt_dir):
    checkpoints = [d for d in os.listdir(chpt_dir) if d.startswith("checkpoint-")]
    if checkpoints:
        last_checkpoint = os.path.join(
            chpt_dir, max(checkpoints, key=lambda cp: int(cp.split("-")[-1]))
        )

# if train_data_path locally exists use it
if os.path.exists(train_data_path):
    data = load_dataset("json", data_files=train_data_path, split="train")
else:
    data = load_dataset(train_data_path, split="train")

if last_checkpoint is not None:
    start_index = get_start_index(last_checkpoint, len(data))

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=f"{cache_dir}/model",
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(last_checkpoint, cache_dir=f"{cache_dir}/tokenizer")
model.resize_token_embeddings(len(tokenizer))

In [None]:
ftmodel = PeftModel.from_pretrained(model, last_checkpoint)
# ftmodel = ftmodel.merge_and_unload()

In [None]:
load_dotenv()
HF_TOKEN_WRITE = os.environ["HF_TOKEN_WRITE"]

In [None]:
ftmodel.push_to_hub(f"{model_name.split('/')[1]}-v{run_id}_si{start_index}-ada", token=HF_TOKEN_WRITE)

In [None]:

tokenizer.push_to_hub(f"{model_name.split('/')[1]}-v{run_id}_si{start_index}", token=HF_TOKEN_WRITE)
ftmodel.push_to_hub(f"{model_name.split('/')[1]}-v{run_id}_si{start_index}", token=HF_TOKEN_WRITE)

### Infer

In [None]:
model_name = "amew0/Meta-Llama-3-8B-Instruct-v240724111548_si110870"

In [None]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     cache_dir=f"{cache_dir}/model",
#     torch_dtype=torch.float16,
#     device_map="auto",
# )
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", cache_dir=f"{cache_dir}/tokenizer"
)

In [None]:
train_data_path = "./data/medical-1-row.json"
data = load_dataset("json", data_files=train_data_path, split="train")

In [None]:
def eval_tokenize(prompt, tokenizer):
    tokenized = tokenizer(prompt, return_tensors='pt')
    # tokenized["input_ids"] = tokenized["input_ids"].flatten()
    # tokenized["attention_mask"] = tokenized["attention_mask"].flatten()
    return tokenized


def eval_generate_and_tokenize_prompt(data_point, tokenizer):
    # assistant_template = "<|start_header_id|>assistant<|end_header_id|>"
    # i = data_point["prompt"].find(assistant_template)
    # user_prompt = data_point["prompt"][:i+len(assistant_template)]
    # print(data_point["prompt"])
    tokenized_full_prompt = eval_tokenize(data_point["prompt"], tokenizer=tokenizer)
    return tokenized_full_prompt

eval_dataset = data.map(
    lambda x: eval_generate_and_tokenize_prompt(x, tokenizer)
)

In [None]:
eval_dataset = eval_dataset.with_format("torch")

In [None]:
eval_dataset["input_ids"]

In [None]:
example = data[0]
example = eval_generate_and_tokenize_prompt(example, tokenizer, prompt=None)

In [None]:
example.keys()

In [None]:
example = next(iter(eval_dataset))

In [None]:
tokenizer.decode(example["input_ids"][0])

In [None]:
o = model.generate(
    input_ids=example["input_ids"].to(model.device),
    attention_mask=example["attention_mask"].to(model.device),
    eos_token_id=tokenizer.eos_token_id,
    max_new_tokens=10
)

In [None]:
from utils.eval_helper import generate_response

response = generate_response(
    model,
    tokenizer,
    example["input_ids"],
    example["attention_mask"],
    {},
)

In [None]:
# base
response

In [None]:
# tuned
response

In [None]:
load_dotenv()
HF_TOKEN_WRITE = os.environ["HF_TOKEN_WRITE"]
tokenizer.push_to_hub(f"{model_name.split('/')[1]}-v{run_id}_si{start_index}", token=HF_TOKEN_WRITE)
ftmodel.push_to_hub(f"{model_name.split('/')[1]}-v{run_id}_si{start_index}", token=HF_TOKEN_WRITE)

In [4]:
from transformers import AutoTokenizer
cache_dir: str = f"/dpc/kunf0097/l3-8b"
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped", cache_dir=f"{cache_dir}/tokenizer")
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

1

### dig

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import torch.nn as nn
torch.manual_seed(26)

<torch._C.Generator at 0x14e579306570>

In [2]:
# cache_dir = "/dpc/kunf0097/l3-8b"
# tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=f"{cache_dir}/tokenizer")
# model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=f"{cache_dir}/model")
config = AutoConfig.from_pretrained("gpt2")

In [25]:
# 1. Input Embeddings
# import transformers
torch.manual_seed(26)
input_text = "Hello, how are you?"
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"] # shape [1,6]
embeddings = model.transformer.wte(input_ids) # shape [1,6,768] # wte = word token embeddings

In [9]:
a=torch.tensor([[1,2,3],[4,5,6]]).split(2,dim=1)
a[0].shape

torch.Size([2, 2])

In [10]:
a

(tensor([[1, 2],
         [4, 5]]),
 tensor([[3],
         [6]]))

In [94]:
a,b = hidden_states.split(384, dim=2)
# a.shape
a.shape

torch.Size([1, 6, 384])

In [64]:
# wz torch
wte = nn.Embedding(config.vocab_size, config.n_embd)
input_embeds = wte(input_ids)

In [46]:
# 2. Positional Encoding
# Transformers are unaware of the order of tokens inherently, so they add positional encodings to embeddings.

position_ids = torch.arange(input_ids.size(-1), dtype=torch.long).unsqueeze(0) # [0,1,2,3,4,5] # typical range i.e position
positional_encodings = model.transformer.wpe(position_ids) # word positional encodings # shape [1,6,768] # postion_ids allowed are [0,1023]
# encoded_input = embeddings + positional_encodings # encoded_input has info about a tokens representation and position

In [66]:
# with torch
torch.manual_seed(26)
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long).unsqueeze(0) # [0,1,2,3,4,5] # typical range i.e position
wpe = nn.Embedding(config.max_position_embeddings, config.n_embd)
position_embeds = wpe(position_ids)

hidden_states = input_embeds + position_embeds
hidden_states.shape

torch.Size([1, 6, 768])

In [79]:
nn.Linear(768,768*3)(hidden_states).shape

torch.Size([1, 6, 2304])

In [80]:
# GPT2Block
# config.num_hidden_layers # 12
# nn.LayerNorm
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.43.3",
  "use_cache": true,
  "vocab_size": 50257
}

In [99]:
import matplotlib.pyplot as plt

plt.plot(model.transformer.wpe(torch.tensor(-1)).detach().numpy())
plt.show()

IndexError: index out of range in self

In [97]:
model.transformer.wpe(torch.tensor(0))

tensor([-1.8821e-02, -1.9742e-01,  4.0267e-03,  1.1347e-02,  6.3824e-02,
        -1.0501e-01,  3.6937e-02, -1.6803e-01, -4.9111e-02, -5.6461e-02,
        -2.4560e-03,  1.3503e-02, -4.1711e-03,  1.5115e-02,  1.6595e-02,
        -1.3808e-01, -6.3314e-03, -4.6150e-02,  2.6675e-02, -2.0417e-01,
         1.3454e-02, -3.6267e-02,  1.9301e-02, -2.5931e-02,  8.0243e-03,
         8.4712e-03, -1.9906e-02,  6.6802e-02,  7.1151e-03, -2.6618e-02,
         2.0829e-02, -3.3732e-02, -8.2898e-03,  9.8622e-03, -2.7369e-02,
        -9.9118e-02, -7.5254e-01,  2.3550e-02, -3.0513e-02,  7.7456e-02,
         3.4301e-03,  7.1132e-03,  2.6479e-02, -1.2113e-03,  1.1219e-01,
        -2.0606e-03, -2.2458e-02, -2.2287e-02,  2.3570e-02,  3.9777e-01,
         1.8856e-02,  2.0280e-02,  6.3043e-01,  2.3146e-02, -4.6894e-02,
         4.0653e+00, -1.7403e-02, -5.1683e-02,  7.2271e-02, -7.9312e-02,
         4.0248e-02,  1.9908e-02, -4.6380e-02, -2.8380e-02,  7.2535e-03,
         2.6772e-02,  1.4972e-03, -2.9892e-01, -1.1

In [36]:
positional_encodings.shape

torch.Size([1, 6, 768])