In [15]:
qnas = " \n\n### 1\nQ: What is the patient's liver condition based on the provided lab results?\nM: cirrhosis, chronic hepatitis, liver damage, liver failure\nA: chronic hepatitis\n\n### 2\nQ: What is the patient's serum albumin level?\nM: 3.4, 7.54, 11.3, 15.6\nA: 3.4\n\n### 3\nQ: Which of the following is a recommendation for the patient's treatment?\nM: avoid fatty diet, take beta blocker, take more sugar cane juice, consult gastroenterologist\nA: consult gastroenterologist\n\n### 4\nQ: What is the patient's bilirubin level?\nM: 2.38, 4.88, 17.16, 25.8\nA: 17.16\n\n### 5\nQ: What is the patient's creatinine level?\nM: 2.5, 4.88, 7.5, 10.5\nA: 4.88\n"

In [21]:
start_index = qnas.find("### 1")

qna_section = qnas[start_index:].strip()

# Skip the empty string before the first ###
qna_blocks = qna_section.split("### ")[1:]


qnas_formatted = []
for block in qna_blocks:
    lines = block.strip()
    q = lines[lines.find("Q: "):lines.find("A: ")].strip()
    a = lines[lines.find("A: "):].strip()
    
    qnas_formatted.append({"Q": q, "A":a})

In [22]:
qnas_formatted

[{'Q': "Q: What is the patient's liver condition based on the provided lab results?\nM: cirrhosis, chronic hepatitis, liver damage, liver failure",
  'A': 'A: chronic hepatitis'},
 {'Q': "Q: What is the patient's serum albumin level?\nM: 3.4, 7.54, 11.3, 15.6",
  'A': 'A: 3.4'},
 {'Q': "Q: Which of the following is a recommendation for the patient's treatment?\nM: avoid fatty diet, take beta blocker, take more sugar cane juice, consult gastroenterologist",
  'A': 'A: consult gastroenterologist'},
 {'Q': "Q: What is the patient's bilirubin level?\nM: 2.38, 4.88, 17.16, 25.8",
  'A': 'A: 17.16'},
 {'Q': "Q: What is the patient's creatinine level?\nM: 2.5, 4.88, 7.5, 10.5",
  'A': 'A: 4.88'}]

In [4]:
qna_formatted

[{'Q': "What is the patient's liver condition based on the provided lab results?",
  'A': 'chronic hepatitis'},
 {'Q': "What is the patient's serum albumin level?", 'A': '3.4'},
 {'Q': "Which of the following is a recommendation for the patient's treatment?",
  'A': 'consult gastroenterologist'},
 {'Q': "What is the patient's bilirubin level?", 'A': '17.16'},
 {'Q': "What is the patient's creatinine level?", 'A': '4.88'}]

In [1]:
import torch
torch.cuda.is_available()

True

In [14]:
import os
import torch
import transformers
import huggingface_hub
import wandb
from scipy.stats import pearsonr
from datetime import datetime
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import gc
import json
import yaml
import argparse
import re
from tqdm import tqdm
import fire
import inspect


logg = lambda x: print(f"------------------------ {x} ---------------------------")


def inspectt(frame):
    logg("")
    args, _, _, values = inspect.getargvalues(frame)
    for arg in args:
        print(f"\t{arg}: {values[arg]}")
    logg("")

def get_prompts_from_template(filepath, name, eval_name):
    default_config = {
        "max_new_tokens": 256,
        "do_sample": True,
        "temperature": 0.6,
        "top_p": 0.9,
    }
    with open(filepath, "r") as f:
        data = yaml.safe_load(f)

    candidate_prompt = data[name]["candidate_prompt"]
    evaluator_prompt = data[eval_name]["evaluator_prompt"]
    candidate_generation_config = data[name].get("candidate_generation_config", default_config)
    evaluator_generation_config = data[eval_name].get(
        "evaluator_generation_config", default_config
    )

    print("candidate_prompt: ", candidate_prompt)
    print("evaluator_prompt: ", evaluator_prompt)
    print("candidate_generation_config: ", candidate_generation_config)
    print("evaluator_generation_config: ", evaluator_generation_config)

    return (
        candidate_prompt,
        evaluator_prompt,
        candidate_generation_config,
        evaluator_generation_config,
    )


def get_tokenizer_and_model(model_name: str, cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/tokenizer",
        pad_token_id=0,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/model",
        torch_dtype=torch.float16,
        device_map="auto",
        offload_buffers=True,
    )
    return tokenizer, model


def tokenize(prompt, tokenizer):
    tokenized = tokenizer(prompt, return_tensors="pt")
    return tokenized


def generate_and_tokenize_prompt(batch, tokenizer, prompt_template):
    # print(batch)
    prompts = [prompt_template.format(d[0], d[1]) for d in zip(batch["instruction"], batch["input"])]
    print("a", prompts)
    tokenized_prompts = tokenizer(prompts, padding=True, return_tensors="pt")
    print("b", tokenized_prompts)
    return tokenized_prompts


def eval_prompt_tokenizer(generated, output, eval_tokenizer, prompt=None):
    prompt = prompt.format(generated, output)
    tokenized_full_prompt = tokenize(prompt, tokenizer=eval_tokenizer)
    return tokenized_full_prompt


def extract_score(text):
    match = re.search(r"\b\d+\.\d+\b", text)
    return float(match.group(0)) if match else -1.0


def log2json(results, json_result):
    with open(json_result, "w") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

def generate_response(model, tokenizer, input_ids, attention_mask, generation_config):
    # torch.LongTensor(input_ids).to(model.device)
    # torch.LongTensor(attention_mask).to(model.device)
    # try:
        output = model.generate(
            input_ids=torch.stack(input_ids).to(model.device),
            attention_mask=torch.stack(attention_mask).to(model.device),
            eos_token_id=tokenizer.eos_token_id,
            **generation_config,
        )
        print(output[0])
        response_ids = output[0][len(input_ids[0]) :]
        response = tokenizer.decode(response_ids, skip_special_tokens=True)
        return response, output
    # except RuntimeError as e:
    #     if "inf" in str(e) or "nan" in str(e):
    #         print(f"Skipping example due to invalid output: {e}")
    #         return None
    #     else:
    #         raise 



In [3]:
output_dir=f"./out"
cache_dir=f"/dpc/kunf0097/l3-8b"
eval_data_path="./data/1/eval_sample.json"
log_file=None
name="meta-llama/Meta-Llama-3-8B-Instruct"
eval_name="meta-llama/Meta-Llama-3-8B-Instruct"
run_id=datetime.now().strftime("%y%m%d%H%M%S")
log2wandb: bool = True
project="huggingface"
entity="my-ku-org"
evals_per_example=2
batch_size=2

In [4]:
candidate_name="meta-llama/Meta-Llama-3-8B-Instruct"
evaluator_name="meta-llama/Meta-Llama-3-8B-Instruct"

In [5]:
(
    candidate_prompt,
    evaluator_prompt,
    candidate_generation_config,
    evaluator_generation_config,
) = get_prompts_from_template("template.yaml", candidate_name, evaluator_name)


if log2wandb and (project is None or entity is None):
    raise ValueError("Both 'project' and 'entity' must be set if 'log2wandb' is True.")

if log_file is None:
    log_file = f"{output_dir}/results_{name.split('/')[1]}_{run_id}.json"

inspectt(inspect.currentframe())

candidate_prompt:  <|start_header_id|>system<|end_header_id|> {}<|eot_id|><|start_header_id|>user<|end_header_id|> {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
evaluator_prompt:  <|start_header_id|>system<|end_header_id|>
You are going to act as an LLM evaluator to rate the answer of the LLM generated medical chatbot on factualness (i.e how contextually the generated output followed the expected reply). Penalize it appropriately for any hallucination/loss of context, refraining to answer (saying 'As an AI Language Model', 'I am not a doctor, but' ...), or trailing repetition. YOUR RESPONSE IS NOTHING ELSE BUT A FLOAT FROM 0.0 - 5.0 (with format x.x). Where 0.0 indicates the context of the generated response is very far from the expected one. And 5.0 represents otherwise. AGAIN IF YOUR GENERATED ANYTHING ELSE BUT A FLOAT YOU'RE GOING TO CRUSH MY SYSTEM!!<|eot_id|>
<|start_header_id|>user<|end_header_id|>
### Expected: {}
### Generated: {}<|eot_id|>
<|start_header_id|>assist

In [6]:
# evaluator_tokenizer, evaluator_model = get_tokenizer_and_model(
#     model_name=eval_name, cache_dir=cache_dir
# )

candidate_tokenizer, candidate_model = get_tokenizer_and_model(
    model_name=name, cache_dir=cache_dir
)

# candidate_tokenizer = AutoTokenizer.from_pretrained(
#         name,
#         cache_dir=f"{cache_dir}/tokenizer",
#         pad_token_id=0,
#     )



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
candidate_tokenizer.pad_token = candidate_tokenizer.bos_token 
candidate_tokenizer.padding_side = "left"

In [8]:
data = load_dataset("json", data_files=eval_data_path)
eval_dataset = data["train"].map(
    lambda x: generate_and_tokenize_prompt(x, candidate_tokenizer, candidate_prompt),
    batched=True,  # Process in batches
    batch_size=batch_size
)

In [9]:
batched_eval_dataset  =torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size)

In [10]:
for batch in batched_eval_dataset :
    print(batch)
    break

{'instruction': ["If you are a doctor, please answer the medical questions based on the patient's description.", "If you are a doctor, please answer the medical questions based on the patient's description."], 'input': ['my mothers age 58, jaundice problemcreatinine 4.88, sodium-125, potasium-4.6; chloride-85;bilirubin total-17.16;bilirubin direct-2.38;sgot-155;sgpt-160;alkaline phos-260;protein-7.54;albumin-3.4;urea-138 and BP 30nowplease help. what is status and in which stage she is?', 'Whether heart disease can be cured by Homeopathic treatment. Heart if functioning upto 25 % and when medicine for heart beat improvement is taken then it affects Blood pressures and it gets low and when medicine for improving Blood pressure is teken then heart beat gets low. Twice patient was admitted to Kokilaben Ambani Hospital and he is under treatment of Dr. Jamshed Dalal in last one year. Please suggest whether this heart problem can be cured by taking Homeopathic treqtment. Also please suggest 

In [36]:
torch.stack(batch["input_ids"])[:,0]

tensor([128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000, 128000,
        128000, 128000, 128000, 128000, 128000, 128000, 128000, 128006,   9125,
        128007,   1442,    499,    527,    264,  10896,     11,   4587,   4320,
           279,   6593,   4860,   3196,    389,    279,   8893,    596,   4096,
            13, 128009, 128006,    882, 128007,    856,  27698,   4325,    220,
          2970,     11,  12203,   1263,    560,   3575,    846,  15111,    483,
           220,     19,     13,   2421,     11,  39695,     12,   6549,     11,
          3419,    300,   2411,     12,     19,     13,     21,     26,  82882,
            12,   5313,     26, 112474,    392,    258,   2860,     12,   1114,
            13,    845,     26, 112474,    392,    258,   2167,     12,     17,
            13,   1987,     26,   2034,    354,     12,   9992,     26,   2034,
           418,     12,   6330,     26,   1727,    278,    483,   1343,    437,
            12,  11387,     26,  79565, 

In [49]:
torch.stack(batch["attention_mask"])[:,1]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1])

In [42]:
print(candidate_tokenizer.decode(torch.stack(batch["input_ids"])[:,1]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|> If you are a doctor, please answer the medical questions based on the patient's description.<|eot_id|><|start_header_id|>user<|end_header_id|> Whether heart disease can be cured by Homeopathic treatment. Heart if functioning upto 25 % and when medicine for heart beat improvement is taken then it affects Blood pressures and it gets low and when medicine for improving Blood pressure is teken then heart beat gets low. Twice patient was admitted to Kokilaben Ambani Hospital and he is under treatment of Dr. Jamshed Dalal in last one year. Please suggest whether this heart problem can be cured by taking Homeopathic treqtment. Also please suggest name address of any Doctor at Mumbai who can provide Homeopathic treatment for curing the above disease.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


In [15]:
response, output = generate_response(
    candidate_model,
    candidate_tokenizer,
    batch["input_ids"],
    batch["attention_mask"],
    candidate_generation_config,
)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([128000, 128000,    791,   4409,    315,   9853,  12167,   6011,    315,
         21995,   8471,    374,   8647,    369,   8405,  10065,   2585,   3600,
           311,    279,   4409,    315,   9853,  12167,     13,    578,   9476,
           374,  11411,    311,  22923,    279,  23460,    315,  10099,    323,
           279,   4029,    555,   8405,    264,   8205,    315,   3600,     11,
          2737,  10065,   2585,     11,  25375,     11,    323,   6873,    627,
           791,   9476,  27149,   1403,  10065,  52888,     11,    279,   4410,
          9853,  12167,  21995,  75921,    323,    279,   6460,  13345,  21995,
         75921,     11,    902,    527,   1825,    311,    279,    586,    369,
         10065,   1008,   2945,     11,   5675,    323,   1766,   3600,     11,
           323,   1023,  10065,  14228,   7640,     13,    578,   9476,   1101,
           706,    264,   2128,    315,  10065,   2585,   9808,    889,   6013,
           311,   6880,    369,   2532, 

In [54]:
output[0].shape

torch.Size([258])

In [63]:
print(candidate_tokenizer.decode(output[0]))

<|begin_of_text|><|begin_of_text|>The City of Los Angeles Department of Animal Services is responsible for providing animal control services to the City of Los Angeles. The department is committed to promoting the welfare of animals and the community by providing a variety of services, including animal control, adoption, and education.
The department operates two animal shelters, the West Los Angeles Animal Shelter and the East Valley Animal Shelter, which are open to the public for animal adoptions, lost and found services, and other animal-related activities. The department also has a team of animal control officers who respond to calls for service and provide assistance with animal-related issues.
In addition to its shelter and animal control services, the department offers a variety of programs and services, including:
Animal adoption: The department has a wide range of animals available for adoption, including dogs, cats, rabbits, and other small mammals.
Lost and found: The depar

### enshiallah

In [None]:
import json
with open("out/results_240623023136_240628153415.json", "r") as f:
    results = json.load(f)

In [None]:
import wandb
table = wandb.Table(columns=list(results[0].keys()))
for r in results:
    table.add_data(*r.values())
run =wandb.init(project="huggingface", entity="my-ku-org", name="laaj-llama-3-8b-medical-v240623023136")
wandb.log({"Evaluation Results": table})
wandb.finish()

### Evals **MMLU**

In [1]:
from datasets import load_dataset

In [3]:
data =load_dataset("cais/mmlu", "clinical_knowledge")

Downloading data:   0%|          | 0.00/40.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/265 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [6]:
eval_dataset = data["dev"]

In [15]:
eval_dataset[0]

{'question': 'The energy for all forms of muscle contraction is provided by:',
 'subject': 'clinical_knowledge',
 'choices': ['ATP.', 'ADP.', 'phosphocreatine.', 'oxidative phosphorylation.'],
 'answer': 0}

### FT

In [24]:
logg = lambda x: print(f"------------------------ {x} ---------------------------")

import os
import gc
import torch
import transformers
import huggingface_hub
import wandb

wandb.require("core")
from datetime import datetime
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
from datasets import load_dataset
from time import time

In [26]:
start = time()

# Load environment variables
load_dotenv()

# Hugging Face token
HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")

# Login to Hugging Face Hub
huggingface_hub.login(token=HF_TOKEN_WRITE)


torch.cuda.empty_cache()

# Set model and tokenizer paths
ME = "/dpc/kunf0097/l3-8b"
RUN_ID = datetime.now().strftime("%y%m%d%H%M%S")
logg(RUN_ID)
logg("ft-medical.py")
name = "unsloth/llama-3-8b-Instruct-bnb-4bit"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/kunet.ae/ku5001069/.cache/huggingface/token
Login successful
------------------------ 240711173326 ---------------------------
------------------------ ft-medical.py ---------------------------


In [27]:
model = AutoModelForCausalLM.from_pretrained(
    name,
    cache_dir=f"{ME}/l3-8b/model",
    # torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 31.75 GiB total capacity; 2.06 GiB already allocated; 19.69 MiB free; 2.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF