In [1]:
import torch
torch.cuda.is_available()

True

In [28]:
import os
import torch
import transformers
import huggingface_hub
import wandb
from scipy.stats import pearsonr
from datetime import datetime
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import gc
import json
import argparse
import re
from tqdm import tqdm
import fire
import inspect


logg = lambda x: print(f"------------------------ {x} ---------------------------")

ME = "/dpc/kunf0097/l3-8b"


def inspectt(frame):
    args, _, _, values = inspect.getargvalues(frame)
    for arg in args:
        print(f"\t{arg}: {values[arg]}")
    logg("")


def get_tokenizer_and_model(model_name: str, cache_dir: str):
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/tokenizer",
        pad_token_id=0,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=f"{cache_dir}/model",
        torch_dtype=torch.float16,
        device_map="auto",
        offload_buffers=True,
    )
    return tokenizer, model


def tokenize(prompt, tokenizer):
    tokenized = tokenizer(prompt, return_tensors="pt")
    return tokenized


def generate_and_tokenize_prompt(data_point, tokenizer, prompt=None):
    if prompt is None:
        prompt = """<|start_header_id|>system<|end_header_id|> {}<|eot_id|><|start_header_id|>user<|end_header_id|> {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    prompt = prompt.format(data_point["instruction"], data_point["input"])
    tokenized_full_prompt = tokenize(prompt, tokenizer=tokenizer)
    return tokenized_full_prompt


def eval_prompt_tokenizer(generated, output, eval_tokenizer, prompt=None):
    prompt = prompt.format(generated, output)
    tokenized_full_prompt = tokenize(prompt, tokenizer=eval_tokenizer)
    return tokenized_full_prompt


def extract_score(text):
    match = re.search(r"\b\d+\.\d+\b", text)
    return float(match.group(0)) if match else -1.0


def log2json(results, json_result):
    # Write back the updated results
    with open(json_result, "w") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

In [8]:
output_dir = f"./out"
cache_dir = f"{ME}"
eval_data_path = "./data/1/eval_sample.json"
log_file = None
name = "meta-llama/Meta-Llama-3-8B-Instruct"
eval_name = "meta-llama/Meta-Llama-3-8B-Instruct"
run_id = datetime.now().strftime("%y%m%d%H%M%S")
log2wandb: bool = True
project = "huggingface"
entity = "my-ku-org"
eval_prompt_path = None
evals_per_example = 2

In [9]:
if log2wandb and (project is None or entity is None):
    raise ValueError("Both 'project' and 'entity' must be set if 'log2wandb' is True.")

if log_file is None:
    log_file = f"{output_dir}/results_{name.split('/')[1]}_{run_id}.json"

if eval_prompt_path is None:
    evaluator_prompt = """<|start_header_id|>system<|end_header_id|>
You are going to act as an LLM evaluator to rate the answer of the medical chatbot on factualness (i.e how contextually the generated output followed the expected reply). Penalize it appropriately for any hallucination, lost of context, or trailing repetition. YOUR RESPONSE IS NOTHING ELSE BUT A FLOAT FROM 0.0 - 5.0 (with format x.x). Where 0.0 indicates the context of the generated response is very far from the expected one. And 5.0 represents otherwise. AGAIN IF YOUR GENERATED ANYTHING ELSE BUT A FLOAT YOU'RE GOING TO CRUSH MY SYSTEM!!<|eot_id|><|start_header_id|>user<|end_header_id|> 
### Expected: {}
### Generated: {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""
else:
    with open(eval_prompt_path, "r") as f:
        evaluator_prompt = f.read()

In [4]:
start = time()
load_dotenv()
HF_TOKEN_WRITE = os.getenv("HF_TOKEN_WRITE")
huggingface_hub.login(token=HF_TOKEN_WRITE)

torch.cuda.empty_cache()

logg(run_id)

evaluator_tokenizer, evaluator_model = get_tokenizer_and_model(
    model_name=eval_name, cache_dir=cache_dir
)

candidate_tokenizer, candidate_model = get_tokenizer_and_model(
    model_name=name, cache_dir=cache_dir
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/kunet.ae/ku5001069/.cache/huggingface/token
Login successful
------------------------ 240702151059 ---------------------------


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [34]:

data = load_dataset("json", data_files=eval_data_path)
eval_dataset = data["train"].map(
    lambda x: generate_and_tokenize_prompt(
        x, candidate_tokenizer, phi_prompt
    )
)  # not shuffled

In [35]:
eval_dataset[0]

{'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.",
 'input': 'my mothers age 58, jaundice problemcreatinine 4.88, sodium-125, potasium-4.6; chloride-85;bilirubin total-17.16;bilirubin direct-2.38;sgot-155;sgpt-160;alkaline phos-260;protein-7.54;albumin-3.4;urea-138 and BP 30nowplease help. what is status and in which stage she is?',
 'output': 'Hi thanks for contacting Chat Doctor.... Your liver enzymes are high....with increased bilirubin. Your albumin level low ....so you are having chronic liver problem.... You might have cirrhosis or chronic hepatitis.... For grading liver biopsy needed... Your physical examination must be done for splenomegaly and ascites..... If portal hypertension present beta blocker needed... For jaundice fruits taken more.... Excess fatty diet avoided.... Sugar cane juice, apple juice taken more.... Avoid strenuous work.... Consult gastroenterologist for detail examination and further opinion... Tak

In [38]:
example=eval_dataset[0]
example

{'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.",
 'input': 'my mothers age 58, jaundice problemcreatinine 4.88, sodium-125, potasium-4.6; chloride-85;bilirubin total-17.16;bilirubin direct-2.38;sgot-155;sgpt-160;alkaline phos-260;protein-7.54;albumin-3.4;urea-138 and BP 30nowplease help. what is status and in which stage she is?',
 'output': 'Hi thanks for contacting Chat Doctor.... Your liver enzymes are high....with increased bilirubin. Your albumin level low ....so you are having chronic liver problem.... You might have cirrhosis or chronic hepatitis.... For grading liver biopsy needed... Your physical examination must be done for splenomegaly and ascites..... If portal hypertension present beta blocker needed... For jaundice fruits taken more.... Excess fatty diet avoided.... Sugar cane juice, apple juice taken more.... Avoid strenuous work.... Consult gastroenterologist for detail examination and further opinion... Tak

In [40]:
outputs = candidate_model.generate(
    input_ids=torch.LongTensor(example["input_ids"]).to(candidate_model.device),
    attention_mask=torch.LongTensor(example["attention_mask"]).to(candidate_model.device),
    **generation_config,
)

You are not running the flash-attention implementation, expect numerical differences.


In [41]:
response_ids = outputs[0][len(example["input_ids"][0]) :]
response = candidate_tokenizer.decode(response_ids, skip_special_tokens=True)

In [6]:
if log2wandb:
    wandb.init(project=project, entity=entity)

# generation config
generation_config = {
    "max_new_tokens": 256,
    "eos_token_id": [
        candidate_tokenizer.eos_token_id,
        candidate_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ],
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.9,
}

results = []

In [7]:
for i, example in tqdm(enumerate(eval_dataset)):
    res = None
    outputs = candidate_model.generate(
        input_ids=torch.LongTensor(example["input_ids"]).to(candidate_model.device),
        attention_mask=torch.LongTensor(example["attention_mask"]).to(
            candidate_model.device
        ),
        **generation_config,
    )

    response_ids = outputs[0][len(example["input_ids"][0]) :]
    response = candidate_tokenizer.decode(response_ids, skip_special_tokens=True)
    gt_response = example["output"]  # groundtruth

    eval_prompt_tokenized = eval_prompt_tokenizer(
        response, gt_response, evaluator_tokenizer, prompt=evaluator_prompt
    )

    llm_scores = []
    for i in range(evals_per_example):
        eval_output = evaluator_model.generate(
            input_ids=torch.LongTensor(eval_prompt_tokenized["input_ids"]).to(
                candidate_model.device
            ),
            attention_mask=torch.LongTensor(eval_prompt_tokenized["attention_mask"]).to(
                candidate_model.device
            ),
            **generation_config,
        )

        eval_response = eval_output[0][len(eval_prompt_tokenized["input_ids"][0]) :]
        llm_score = evaluator_tokenizer.decode(eval_response, skip_special_tokens=True)
        llm_scores.append(extract_score(llm_score))

    res = {
        "expected": gt_response,
        "generated": response,
        "llm_scores": llm_scores,
        "avg_llm_score": sum(llm_scores) / len(llm_scores),
    }
    results.append(res)
    log2json(results, log_file)

    if log2wandb:
        wandb_log = {"index": i, "avg_llm_score": res["avg_llm_score"]}
        for j, score in enumerate(llm_scores):
            wandb_log[f"llm_score_{j}"] = score
        wandb.log(wandb_log)

    del example
    gc.collect()
    gc.collect()

0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
1it [00:29, 29.00s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
2it [01:17, 38.81s/it]


In [11]:
results

[{'expected': 'Hi thanks for contacting Chat Doctor.... Your liver enzymes are high....with increased bilirubin. Your albumin level low ....so you are having chronic liver problem.... You might have cirrhosis or chronic hepatitis.... For grading liver biopsy needed... Your physical examination must be done for splenomegaly and ascites..... If portal hypertension present beta blocker needed... For jaundice fruits taken more.... Excess fatty diet avoided.... Sugar cane juice, apple juice taken more.... Avoid strenuous work.... Consult gastroenterologist for detail examination and further opinion... Take care.... Chat Doctor.',
  'generated': "\n\nBased on the laboratory results, I'll provide an assessment of your mother's status and stage of liver disease.\n\n**Liver Function Tests (LFTs):**\n\n* Bilirubin: Total bilirubin is elevated (17.16 mg/dL), indicating liver dysfunction or obstruction. Direct bilirubin (2.38 mg/dL) is also elevated, suggesting that there is some degree of liver c

In [12]:
if log2wandb:
    results_t = list(
        zip(*[d["llm_scores"] for d in results])
    )  # Transpose to do PCC easier
    avg_llm_scores = [d["avg_llm_score"] for d in results]

    pcc_results = {
        f"pcc_{i}_{j}": pearsonr(results_t[i], results_t[j])[0]
        for i in range(len(results_t))
        for j in range(i + 1, len(results_t))
    }  # Calculate PCC for each pair of LLM scores

    avg_scores = {
        f"avg_llm_score_{i}": sum(scores) / len(scores)
        for i, scores in enumerate(results_t)
    }  # Calculate average scores for each set of LLM scores

    wandb.log(
        {
            **avg_scores,
            **pcc_results,
            "run_score": sum(avg_llm_scores) / len(avg_llm_scores),
        }
    )  # Log the calculated data to wandb

    wandb.finish()

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_llm_score_0,▁
avg_llm_score_1,▁
pcc_0_1,▁
run_score,▁

0,1
avg_llm_score_0,2.35
avg_llm_score_1,1.9
pcc_0_1,1.0
run_score,2.125


In [21]:
cache_dir

'/dpc/kunf0097/l3-8b'

In [22]:
phi_prompt = """<|system|>\n{}<|end|>\n<|user|>\n{}<|end|>\n<|assistant|>"""
candidate_tokenizer, candidate_model = get_tokenizer_and_model("microsoft/Phi-3-mini-4k-instruct", cache_dir)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [26]:
print(phi_prompt)

<|system|>
{}<|end|>
<|user|>
{}<|end|>
<|assistant|>


In [24]:
eval_dataset

Dataset({
    features: ['instruction', 'input', 'output', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 2
})