In [None]:
qnas = " \n\n### 1\nQ: What is the patient's liver condition based on the provided lab results?\nM: cirrhosis, chronic hepatitis, liver damage, liver failure\nA: chronic hepatitis\n\n### 2\nQ: What is the patient's serum albumin level?\nM: 3.4, 7.54, 11.3, 15.6\nA: 3.4\n\n### 3\nQ: Which of the following is a recommendation for the patient's treatment?\nM: avoid fatty diet, take beta blocker, take more sugar cane juice, consult gastroenterologist\nA: consult gastroenterologist\n\n### 4\nQ: What is the patient's bilirubin level?\nM: 2.38, 4.88, 17.16, 25.8\nA: 17.16\n\n### 5\nQ: What is the patient's creatinine level?\nM: 2.5, 4.88, 7.5, 10.5\nA: 4.88\n"

In [None]:
# Locate the first numbered block and drop whatever precedes it.
start_index = qnas.find("### 1")
qna_section = qnas[start_index:].strip()

# Splitting on the header marker leaves an empty string in front of the
# first block; discard it.
qna_blocks = qna_section.split("### ")[1:]

qnas_formatted = []
for raw_block in qna_blocks:
    text = raw_block.strip()
    q_start = text.find("Q: ")
    a_start = text.find("A: ")
    qnas_formatted.append(
        {
            # Everything from "Q: " up to the answer line (this span also
            # contains the "M: ..." choices line).
            "Q": text[q_start:a_start].strip(),
            # From "A: " to the end of the block; the "A: " prefix is kept.
            "A": text[a_start:].strip(),
        }
    )

In [None]:
qnas_formatted

In [None]:
# Display the parsed question/answer pairs. The list built by the parsing
# cell is named `qnas_formatted`; `qna_formatted` was never defined.
qnas_formatted

In [None]:
import torch
torch.cuda.is_available()

In [None]:
import os
import torch
import transformers
import huggingface_hub
import wandb
from scipy.stats import pearsonr
from datetime import datetime
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import gc
import json
import yaml
import argparse
import re
from tqdm import tqdm
import fire
import inspect


def logg(x):
    """Print ``x`` between two runs of dashes as a visual section marker."""
    print(f"------------------------ {x} ---------------------------")


def inspectt(frame):
    """Print the named arguments of ``frame`` and their current values.

    The listing is wrapped in two empty ``logg`` banners. Argument names and
    values come from ``inspect.getargvalues(frame)``.

    Args:
        frame: A frame object, e.g. the result of ``inspect.currentframe()``.
    """
    logg("")
    arg_info = inspect.getargvalues(frame)
    for arg_name in arg_info.args:
        print(f"\t{arg_name}: {arg_info.locals[arg_name]}")
    logg("")

def get_prompts_from_template(filepath, name, eval_name):
    """Load prompt templates and generation settings from a YAML file.

    Args:
        filepath: Path of the YAML template file.
        name: Top-level key whose entry provides ``candidate_prompt`` and,
            optionally, ``candidate_generation_config``.
        eval_name: Top-level key whose entry provides ``evaluator_prompt``
            and, optionally, ``evaluator_generation_config``.

    Returns:
        The 4-tuple ``(candidate_prompt, evaluator_prompt,
        candidate_generation_config, evaluator_generation_config)``. When a
        generation config is absent from the file, a fresh copy of the
        built-in defaults is returned for it.

    Raises:
        KeyError: If ``name``, ``eval_name`` or one of the prompt keys is
            missing from the file.
    """
    default_config = {
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    with open(filepath, "r") as f:
        data = yaml.safe_load(f)

    candidate_prompt = data[name]["candidate_prompt"]
    evaluator_prompt = data[eval_name]["evaluator_prompt"]
    # Hand out independent copies of the defaults: returning the same dict
    # object twice would let a tweak to one config silently alter the other.
    candidate_generation_config = data[name].get(
        "candidate_generation_config", dict(default_config)
    )
    evaluator_generation_config = data[eval_name].get(
        "evaluator_generation_config", dict(default_config)
    )

    print("candidate_prompt: ", candidate_prompt)
    print("evaluator_prompt: ", evaluator_prompt)
    print("candidate_generation_config: ", candidate_generation_config)
    print("evaluator_generation_config: ", evaluator_generation_config)

    return (
        candidate_prompt,
        evaluator_prompt,
        candidate_generation_config,
        evaluator_generation_config,
    )


def get_tokenizer_and_model(model_name: str, cache_dir: str):
    """Load the tokenizer and causal-LM weights for ``model_name``.

    The tokenizer is cached under ``{cache_dir}/tokenizer`` and receives
    ``pad_token_id=0``; the model is cached under ``{cache_dir}/model`` and is
    requested in ``torch.float16`` with ``device_map="auto"`` and
    ``offload_buffers=True``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        cache_dir: Root directory for the two cache sub-directories.

    Returns:
        ``(tokenizer, model)``.
    """
    tokenizer_options = {
        "cache_dir": f"{cache_dir}/tokenizer",
        "pad_token_id": 0,
    }
    model_options = {
        "cache_dir": f"{cache_dir}/model",
        "torch_dtype": torch.float16,
        "device_map": "auto",
        "offload_buffers": True,
    }
    tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_options)
    return tokenizer, model


def tokenize(prompt, tokenizer):
    """Call ``tokenizer`` on ``prompt`` with ``return_tensors="pt"`` and return the result."""
    return tokenizer(prompt, return_tensors="pt")


def generate_and_tokenize_prompt(batch, tokenizer, prompt_template):
    """Format and tokenize one batch of examples.

    Args:
        batch: Mapping with parallel ``"instruction"`` and ``"input"``
            sequences (one entry per example).
        tokenizer: Callable invoked as
            ``tokenizer(prompts, padding=True, return_tensors="pt")``.
        prompt_template: ``str.format`` template with two positional fields,
            filled with the instruction first and the input second.

    Returns:
        Whatever ``tokenizer`` returns for the list of formatted prompts.
    """
    prompts = [
        prompt_template.format(instruction, model_input)
        for instruction, model_input in zip(batch["instruction"], batch["input"])
    ]
    # padding=True so prompts of different lengths can share one tensor.
    # (The former debug prints of every prompt/tensor were removed: they
    # flooded the cell output on each mapped batch.)
    return tokenizer(prompts, padding=True, return_tensors="pt")


def eval_prompt_tokenizer(generated, output, eval_tokenizer, prompt=None):
    """Fill the evaluator prompt template and tokenize the result.

    Args:
        generated: Value substituted into the template's first field.
        output: Value substituted into the template's second field.
        eval_tokenizer: Tokenizer forwarded to ``tokenize``.
        prompt: ``str.format`` template with two positional fields. Despite
            the ``None`` default it must be supplied, because ``.format`` is
            called on it unconditionally.

    Returns:
        The value returned by ``tokenize`` for the filled-in prompt.
    """
    filled_prompt = prompt.format(generated, output)
    return tokenize(filled_prompt, tokenizer=eval_tokenizer)


def extract_score(text):
    """Return the first decimal number (digits, a dot, digits) found in ``text``.

    Only numbers written with a decimal point are recognised; bare integers
    are ignored.

    Returns:
        The number as a float, or ``-1.0`` when ``text`` contains none.
    """
    found = re.search(r"\b\d+\.\d+\b", text)
    if found is None:
        return -1.0
    return float(found.group(0))


def log2json(results, json_result):
    """Write ``results`` to the file ``json_result`` as indented JSON.

    Args:
        results: Any JSON-serialisable object.
        json_result: Destination path; an existing file is overwritten.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly instead of relying on the locale default,
    # which may be unable to encode them.
    with open(json_result, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

def _to_batch_first(columns):
    """Turn collated per-position columns into one ``(batch, seq_len)`` tensor."""
    if isinstance(columns, torch.Tensor):
        # Assumed to be (batch, seq_len) already; use as-is.
        return columns
    # Each element holds one token position for every example, i.e. has shape
    # (batch,). Stacking along dim=1 yields (batch, seq_len); stacking along
    # dim=0 would give the transposed (seq_len, batch) layout.
    return torch.stack(list(columns), dim=1)


def generate_response(model, tokenizer, input_ids, attention_mask, generation_config):
    """Generate continuations for a collated batch and decode the first one.

    Args:
        model: Object exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Provides ``eos_token_id`` and ``decode``.
        input_ids: Either a sequence of 1-D tensors, one per token position,
            each of shape ``(batch,)`` (the layout ``DataLoader``'s default
            collation produces for list-valued columns), or a single
            ``(batch, seq_len)`` tensor.
        attention_mask: Same layout as ``input_ids``.
        generation_config: Extra keyword arguments forwarded to
            ``model.generate``.

    Returns:
        ``(response, output)`` where ``response`` is the decoded text of the
        tokens generated after the prompt for the FIRST example in the batch,
        and ``output`` is the raw return value of ``model.generate``.
    """
    batch_input_ids = _to_batch_first(input_ids).to(model.device)
    batch_attention_mask = _to_batch_first(attention_mask).to(model.device)

    output = model.generate(
        input_ids=batch_input_ids,
        attention_mask=batch_attention_mask,
        eos_token_id=tokenizer.eos_token_id,
        **generation_config,
    )

    # The prompt occupies the first seq_len positions of every output row;
    # everything after that is newly generated.
    prompt_length = batch_input_ids.shape[1]
    response_ids = output[0][prompt_length:]
    response = tokenizer.decode(response_ids, skip_special_tokens=True)
    return response, output



In [None]:
output_dir=f"./out"
cache_dir=f"/dpc/kunf0097/l3-8b"
eval_data_path="./data/1/eval_sample.json"
log_file=None
name="meta-llama/Meta-Llama-3-8B-Instruct"
eval_name="meta-llama/Meta-Llama-3-8B-Instruct"
run_id=datetime.now().strftime("%y%m%d%H%M%S")
log2wandb: bool = True
project="huggingface"
entity="my-ku-org"
evals_per_example=2
batch_size=2

In [None]:
candidate_name="meta-llama/Meta-Llama-3-8B-Instruct"
evaluator_name="meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
(
    candidate_prompt,
    evaluator_prompt,
    candidate_generation_config,
    evaluator_generation_config,
) = get_prompts_from_template("template.yaml", candidate_name, evaluator_name)


if log2wandb and (project is None or entity is None):
    raise ValueError("Both 'project' and 'entity' must be set if 'log2wandb' is True.")

if log_file is None:
    log_file = f"{output_dir}/results_{name.split('/')[1]}_{run_id}.json"

inspectt(inspect.currentframe())

In [None]:
# evaluator_tokenizer, evaluator_model = get_tokenizer_and_model(
#     model_name=eval_name, cache_dir=cache_dir
# )

candidate_tokenizer, candidate_model = get_tokenizer_and_model(
    model_name=name, cache_dir=cache_dir
)

# candidate_tokenizer = AutoTokenizer.from_pretrained(
#         name,
#         cache_dir=f"{cache_dir}/tokenizer",
#         pad_token_id=0,
#     )



In [None]:
candidate_tokenizer.pad_token = candidate_tokenizer.bos_token 
candidate_tokenizer.padding_side = "left"

In [None]:
data = load_dataset("json", data_files=eval_data_path)
eval_dataset = data["train"].map(
    lambda x: generate_and_tokenize_prompt(x, candidate_tokenizer, candidate_prompt),
    batched=True,  # Process in batches
    batch_size=batch_size
)

In [None]:
batched_eval_dataset  =torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)

In [None]:
for batch in batched_eval_dataset :
    print(batch)
    break

In [None]:
torch.stack(batch["input_ids"])[:,0]

In [None]:
torch.stack(batch["attention_mask"])[:,1]

In [None]:
print(candidate_tokenizer.decode(torch.stack(batch["input_ids"])[:,1]))

In [None]:
response, output = generate_response(
    candidate_model,
    candidate_tokenizer,
    batch["input_ids"],
    batch["attention_mask"],
    candidate_generation_config,
)

In [None]:
output[0].shape

In [None]:
print(candidate_tokenizer.decode(output[0]))

### enshiallah

In [None]:
import json
with open("out/results_240623023136_240628153415.json", "r") as f:
    results = json.load(f)

In [None]:
import wandb
table = wandb.Table(columns=list(results[0].keys()))
for r in results:
    table.add_data(*r.values())
run =wandb.init(project="huggingface", entity="my-ku-org", name="laaj-llama-3-8b-medical-v240623023136")
wandb.log({"Evaluation Results": table})
wandb.finish()

### Evals **MMLU**

In [None]:
from datasets import load_dataset

In [None]:
data =load_dataset("cais/mmlu", "clinical_knowledge")

In [None]:
eval_dataset = data["dev"]

In [None]:
eval_dataset[0]

### FT

In [1]:
import gc
import inspect
import os
from datetime import datetime
from time import time

import fire
import huggingface_hub
import torch
import transformers
import wandb
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, SFTConfig

from peft import LoraConfig, PeftModel
from utils.eval_helper import inspectt, logg
from transformers import TrainerCallback
# from utils.ft_helper import generate_and_tokenize_prompt, dummy

In [12]:
output_dir = f"./out"
cache_dir = f"/dpc/kunf0097/l3-8b"
train_data_path = "./data/medical-36-row.json"
model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct"
model_save_path: str = None
chpt_dir: str = None
# run_id = datetime.now().strftime("%y%m%d%H%M%S")
run_id = '240714024529'

if model_save_path is None:
    model_save_path = f"{cache_dir}/model/{model_name}-v{run_id}"
if chpt_dir is None:
    chpt_dir = f"{cache_dir}/chpt/{run_id}"

In [13]:
def _checkpoint_step(dirname):
    """Return the integer after the last '-' in a checkpoint directory name."""
    return int(dirname.split('-')[-1])


# Path of the checkpoint sub-directory with the highest step number, or None
# when chpt_dir does not exist or contains no "checkpoint-*" entries.
last_checkpoint = None
if os.path.isdir(chpt_dir):
    checkpoints = [
        entry for entry in os.listdir(chpt_dir) if entry.startswith("checkpoint-")
    ]
    if checkpoints:
        newest = max(checkpoints, key=_checkpoint_step)
        last_checkpoint = os.path.join(chpt_dir, newest)

In [14]:
def tokenize(prompt, tokenizer, cutoff_len: int = None):
    """Tokenize one prompt into fixed-length training features.

    Args:
        prompt: Text to tokenize.
        tokenizer: Callable invoked with ``truncation=True``,
            ``max_length=cutoff_len``, ``padding="max_length"`` and
            ``return_tensors="pt"``.
        cutoff_len: Length passed as ``max_length``; ``None`` is forwarded
            unchanged to the tokenizer.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        flattened to 1-D and an added ``labels`` tensor: a copy of
        ``input_ids`` in which every position with attention mask 0 is set
        to ``-100``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding="max_length",
        return_tensors="pt",
    )
    result["input_ids"] = result["input_ids"].flatten()
    result["attention_mask"] = result["attention_mask"].flatten()

    # Labels start as a copy of the inputs. Padded positions (attention mask
    # 0) are set to -100, the index the cross-entropy loss ignores, so the
    # model is not trained to predict padding.
    labels = result["input_ids"].clone()
    labels[result["attention_mask"] == 0] = -100
    result["labels"] = labels
    return result

def generate_and_tokenize_prompt(data_point, tokenizer, cutoff_len: int = None):
    """Tokenize the ``"prompt"`` field of one example via ``tokenize``.

    Args:
        data_point: Mapping that contains a ``"prompt"`` entry.
        tokenizer: Tokenizer forwarded to ``tokenize``.
        cutoff_len: Forwarded to ``tokenize`` only when it is not ``None``;
            otherwise ``tokenize`` is called without it.

    Returns:
        The value returned by ``tokenize``.
    """
    optional_kwargs = {} if cutoff_len is None else {"cutoff_len": cutoff_len}
    return tokenize(data_point["prompt"], tokenizer=tokenizer, **optional_kwargs)


In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=f"{cache_dir}/tokenizer")

# Initialize model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=f"{cache_dir}/model",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    # return_dict=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Embedding(128257, 4096)

In [17]:
# Prepare model for LoRA training
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [18]:
data = load_dataset("json", data_files=train_data_path, split="train") 
# not shuffling for now


In [19]:
from datasets import concatenate_datasets

def reorder_dataset(dataset, start_index):
    """Rotate ``dataset`` so that the row at ``start_index`` comes first.

    Rows ``start_index .. end`` are followed by rows ``0 .. start_index - 1``.
    ``start_index`` is taken modulo ``len(dataset)``, so an offset that has
    run past the end of the data (e.g. a step count from a later epoch) wraps
    around instead of selecting out-of-range rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        start_index: Row offset that should become the first row.

    Returns:
        The rotated dataset; an empty ``dataset`` is returned unchanged.
    """
    total_rows = len(dataset)
    if total_rows == 0:
        # Nothing to rotate, and the modulo below would divide by zero.
        return dataset

    offset = start_index % total_rows
    tail_part = dataset.select(range(offset, total_rows))
    head_part = dataset.select(range(offset))
    return concatenate_datasets([tail_part, head_part])


In [20]:
per_device_train_batch_size = 2
gradient_accumulation_steps = 1

# Row offset to resume from: the step number parsed from the checkpoint path,
# multiplied by the per-device batch size and the accumulation steps.
if last_checkpoint is None:
    start_index = 0
else:
    completed_steps = int(last_checkpoint.split("-")[-1])
    start_index = (
        completed_steps * per_device_train_batch_size * gradient_accumulation_steps
    )

# Reload the raw rows and rotate them so that training starts at start_index.
data = reorder_dataset(
    load_dataset("json", data_files=train_data_path, split="train"),
    start_index,
)

# Truncation/padding length; per the original note, about 75% of the examples
# fit within it untouched.
cutoff_len = 296
train_dataset = data.map(
    lambda x: generate_and_tokenize_prompt(x, tokenizer, cutoff_len)
)

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [21]:
start_index

16

In [23]:
# TODO: load these settings from a .yaml file
# A fresh run has no checkpoint (last_checkpoint is None); use "0" as the
# run-name suffix in that case instead of calling .split() on None.
resume_step = last_checkpoint.split("-")[-1] if last_checkpoint else "0"

train_args = SFTConfig(
    run_name=f"ft-{model_name.split('/')[1]}-{run_id}-v{resume_step}",
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,  # only 1 is allowed on the no shuffler [needs revision]
    eval_accumulation_steps=1,  # !very important to send data to cpu
    warmup_steps=1,
    num_train_epochs=3,
    learning_rate=3e-4,
    fp16=False,
    logging_steps=1,
    optim="adamw_torch",
    output_dir=f"{chpt_dir}",
    group_by_length=False,
    dataloader_drop_last=False,
    save_steps=2,
    save_total_limit=3,
    max_seq_length=cutoff_len,
    resume_from_checkpoint=last_checkpoint,
)

In [31]:
a = iter(train_dataset)

In [34]:
b = next(a)
b['input']

'Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year. I am having some major bilateral temple pain along with numbness that comes and goes in my left arm/hand/fingers. I have had headaches since the aneurysm, but this is different. Also, my moods have been horrible for the past few weeks.'

In [72]:
c = next(a)
c['input']

StopIteration: 

In [24]:
class PrintExampleCallback(TrainerCallback):
    """Callback that prints one decoded training example on every save.

    It keeps its own iterator over the ``train_dataloader`` received in the
    callback kwargs and advances it by one batch per ``on_save`` call. The
    text is decoded with the module-level ``tokenizer``.
    """

    # Unique marker returned by next() once the iterator has run dry.
    _EXHAUSTED = object()

    def __init__(self):
        # Created in on_train_begin, once the dataloader is available.
        self.example_iter = None

    def on_train_begin(self, args, state, control, **kwargs):
        """Start a fresh pass over ``kwargs['train_dataloader']``."""
        self.example_iter = iter(kwargs['train_dataloader'])

    def on_save(self, args, state, control, **kwargs):
        """Print the first sequence of the next batch, prefixed by the step."""
        batch = next(self.example_iter, self._EXHAUSTED)
        if batch is self._EXHAUSTED:
            # Start over from the beginning of the dataloader.
            self.example_iter = iter(kwargs['train_dataloader'])
            batch = next(self.example_iter)

        first_sequence = batch['input_ids'][0]
        decoded = tokenizer.decode(first_sequence, skip_special_tokens=True)
        print(f"Step {state.global_step}: {decoded}")


In [25]:
from transformers import Trainer
from torch.utils.data import DataLoader, SequentialSampler
import datasets
from transformers.trainer_utils import seed_worker

class SFTTrainerNoShuffle(SFTTrainer):
    """SFTTrainer variant whose training DataLoader reads examples in index order.

    `_get_train_sampler` always returns a `SequentialSampler`, and
    `get_train_dataloader` builds the DataLoader with that sampler.
    """

    def _get_train_sampler(self):
        """Return a `SequentialSampler` over `self.train_dataset`."""

        # Always use SequentialSampler to prevent shuffling
        return SequentialSampler(self.train_dataset)
        

    def get_train_dataloader(self) -> DataLoader:
        """
        Build and return the training DataLoader.

        Datasets that are not `torch.utils.data.IterableDataset` instances get the sampler
        from `_get_train_sampler` (sequential); iterable datasets get no sampler.

        Raises:
            ValueError: if `self.train_dataset` is None.
        """
        # NOTE(review): this body looks like a copy of the stock
        # `Trainer.get_train_dataloader` -- confirm against the installed
        # transformers version before changing it.
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator
        # For `datasets.Dataset` the unused columns are removed from the
        # dataset itself; for any other dataset type the collator is wrapped
        # so that it removes them instead.
        if isinstance(train_dataset, datasets.Dataset):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")
        else:
            data_collator = self._get_collator_with_removed_columns(
                data_collator, description="training"
            )

        # DataLoader options that apply to every dataset type.
        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        # Sampler-related options are only added for non-iterable datasets.
        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        # The DataLoader is passed through `self.accelerator.prepare` before
        # being returned.
        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))


In [26]:
trainer = SFTTrainerNoShuffle(
# trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    train_dataset=train_dataset,
    args=train_args,
    callbacks=[PrintExampleCallback()]
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
run_id

'240714024529'

In [28]:
trainer.train(last_checkpoint)

[34m[1mwandb[0m: Currently logged in as: [33mamew0[0m ([33mmy-ku-org[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step 8: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the question: I had a alt reading about 2+ months ago of 43 then retested today and got a 60 alt reading my other readings came back normal should I be concerned?Im 40 years 5ft 3 and weigh 192albs have a history of chronic vertigo and insomnia, and chronic sinus problems. I take valium 5 mg 4 times a day, fluticansone 2 puffs each nose once a day, qvair two puffs twice a day to prevent weasing, claritin for allergies, and I do sinus rinses as needed. I am female.My doctor says none of the medicine has liver side effects, and I dont drink alcohol but a sip on the rare occasion. He thinks it could be do my weight and Im working on this with gradual exercise when Im able too. Should I be worried with this test result? I had one other result that was high my triglcyerides read 198? WWW.WWWW.WWassistant Hello and thank you for asking Chat Doctor. I have read your report, 

Step,Training Loss
9,2.4464
10,2.9583
11,2.2871
12,2.0716
13,2.3377
14,2.194
15,2.2658


Step 9: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the question: Hi, I had a subarachnoid bleed and coiling of brain aneurysm last year. I am having some major bilateral temple pain along with numbness that comes and goes in my left arm/hand/fingers. I have had headaches since the aneurysm, but this is different. Also, my moods have been horrible for the past few weeks.assistant Aneurysm in brain causes headache due to compression of pain sensitive structure in brain. But pain is usually unilateral not bilateral. This pain will not radiate in both upper limb. Bilateral upper limb pain may be due to cervical spondylosis or other causes. You have anxiety of aneurysm, so due to anxiety your mood is horrible because aneurysm does not cause horrible mood change. So your headache may be chronic tension type headache. You may relieve by antianxiety Chat Doctor.




Step 10: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the question: for the pst  6 days  i have been haing  upper  abdome  distress...  i have had gall bladder  surgery  2 yrs  ago..in the past  2  yrs  when i get these bouts .. i go to  emerg  they run cardiograms  xrays,blood work,all comes back normal ...im stressing out  over this  as im  scared every time it happens  im having a heart  attack  wht is it ?assistant Hi. Thanks for your query, read and understood your problems. You are getting bouts of pain in the upper abdomen within the last 2 years particularly after the gall bladder surgery. This is great news that cardiac problems are ruled out by the tests every time in the emergency Room. This means that we have to find a local cause in the abdomen itself. I would suggest the following
Step 11: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the qu



Step 12: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the question: Hi hope you can helpive been getting chest pains, not severe but a constant pain, can happen when im just resting but has also happened when im exercising, it continues through to my back as well. I sometimes wake up with chest pains too. i have a strong history of heart disease. grandfather died at 26 from a heat attack and mother had one at 45, also grandmother on the other side had a heart attack. Could you please explain what is happening? Thanksassistant HelloThanks for posting at Chat Doctor. You have not provided your age on the details. I would have been able to help u better had you mentioned your age. You have complaints of chest pain radiating to back with strong family history. So in your case it is better that we evaluate you for presence of heart disease. I recommend an ECG and 2d echo initially. If both these tests are normal you can proc



Step 14: system If you are a doctor, please answer the medical questions based on the patient's description.user This is the question: my mother is 80 yrs. detected with vulva cancer. vulva is swollen red,around discolourisation, white growth now extending to rectum. biopsy shows keratinizing squamous carcinoma vulva(welldifferentiated,invasive).she is little asthmatic, has cervical spondeolytis.assistant Hi, dairy have gone through your question. I can understand your concern. She has well differentiated keratinizing squamous cell carcinoma.  If her general health is good then treatment of choice is wide excision of carcinoma followed by chemotherapy or radiotherapy if needed. Consult your doctor and take treatment accordingly. Hope I have answered your question, if you have doubt then I will be happy to answer. Thanks for using Chat Doctor. Wish you a very good health.
Step 15: system If you are a doctor, please answer the medical questions based on the patient's description.user Thi



KeyboardInterrupt: 