In [1]:
import torch
# Report whether PyTorch can see a CUDA device (displayed as the cell output).
torch.cuda.is_available()

True

In [None]:
import os
import torch
import transformers
import huggingface_hub
import wandb
from scipy.stats import pearsonr
from datetime import datetime
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
import gc
import json
import yaml
import argparse
import re
from tqdm import tqdm
import fire
import inspect


def logg(x):
    """Print ``x`` as a single banner line framed by dashes.

    Args:
        x: Any object; it is interpolated into the banner via f-string formatting.

    Returns:
        None. The banner is written to standard output.
    """
    print(f"------------------------ {x} ---------------------------")


def inspectt(frame):
    """Print the argument names and values of ``frame`` between two banner lines.

    Args:
        frame: A frame object, e.g. the result of ``inspect.currentframe()``.
            Only the names reported by ``inspect.getargvalues`` as arguments
            of that frame are printed.
    """
    logg("")
    arg_info = inspect.getargvalues(frame)
    for arg_name in arg_info.args:
        # Look each argument up in the frame's locals to get its current value.
        print(f"\t{arg_name}: {arg_info.locals[arg_name]}")
    logg("")


def get_prompts_from_template(filepath, name, eval_name):
    """Load the chat and evaluator prompt templates from a YAML file.

    Args:
        filepath: Path of the YAML template file.
        name: Top-level key whose ``chat_prompt`` entry is returned.
        eval_name: Top-level key whose ``evaluator_prompt`` entry is returned.

    Returns:
        A ``(chat_prompt, evaluator_prompt)`` tuple.

    Raises:
        KeyError: If ``name``/``eval_name`` or the prompt entries are missing.
    """
    # Read as UTF-8 explicitly so that non-ASCII prompt text does not depend on
    # the platform's locale encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return data[name]["chat_prompt"], data[eval_name]["evaluator_prompt"]


def get_tokenizer_and_model(model_name: str, cache_dir: str):
    """Load the tokenizer and the causal-LM weights for ``model_name``.

    Args:
        model_name: Model identifier passed to both ``from_pretrained`` calls.
        cache_dir: Base cache directory; the tokenizer is cached under
            ``<cache_dir>/tokenizer`` and the model under ``<cache_dir>/model``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer_cache = f"{cache_dir}/tokenizer"
    model_cache = f"{cache_dir}/model"

    # NOTE(review): ``pad_token_id=0`` is forwarded to the tokenizer as-is;
    # confirm that the tokenizer actually honours this keyword.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, cache_dir=tokenizer_cache, pad_token_id=0
    )

    # Half-precision weights, with device placement delegated to ``device_map="auto"``.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=model_cache,
        torch_dtype=torch.float16,
        device_map="auto",
        offload_buffers=True,
    )
    return tokenizer, model


def tokenize(prompt, tokenizer):
    """Call ``tokenizer`` on ``prompt`` with ``return_tensors="pt"`` and return its result."""
    return tokenizer(prompt, return_tensors="pt")


def generate_and_tokenize_prompt(data_point, tokenizer, prompt=None):
    """Fill the prompt template from one example and tokenize the result.

    Args:
        data_point: Mapping with ``"instruction"`` and ``"input"`` entries; they
            are passed positionally, in that order, to ``prompt.format``.
        tokenizer: Tokenizer forwarded to ``tokenize``.
        prompt: Template string to format. Required despite the ``None`` default.

    Returns:
        Whatever ``tokenize`` returns for the formatted prompt.

    Raises:
        ValueError: If ``prompt`` is None.
    """
    if prompt is None:
        # Fail with a clear message instead of an AttributeError on ``None.format``.
        raise ValueError("'prompt' must be a template string, got None.")
    full_prompt = prompt.format(data_point["instruction"], data_point["input"])
    return tokenize(full_prompt, tokenizer=tokenizer)


def eval_prompt_tokenizer(generated, output, eval_tokenizer, prompt=None):
    """Fill the evaluator prompt template and tokenize the result.

    Args:
        generated: First positional value passed to ``prompt.format``.
        output: Second positional value passed to ``prompt.format``.
        eval_tokenizer: Tokenizer forwarded to ``tokenize``.
        prompt: Template string to format. Required despite the ``None`` default.

    Returns:
        Whatever ``tokenize`` returns for the formatted prompt.

    Raises:
        ValueError: If ``prompt`` is None.
    """
    if prompt is None:
        # Fail with a clear message instead of an AttributeError on ``None.format``.
        raise ValueError("'prompt' must be a template string, got None.")
    full_prompt = prompt.format(generated, output)
    return tokenize(full_prompt, tokenizer=eval_tokenizer)


def extract_score(text):
    """Return the first decimal number found in ``text`` as a float.

    Only numbers written with a decimal point (digits, a dot, digits) are
    recognised; plain integers are not matched.

    Returns:
        The matched number as a float, or ``-1.0`` when nothing matches.
    """
    found = re.search(r"\b\d+\.\d+\b", text)
    if found is None:
        return -1.0
    return float(found.group(0))


def log2json(results, json_result):
    """Write ``results`` to ``json_result`` as indented, UTF-8 encoded JSON.

    Args:
        results: JSON-serialisable object to dump.
        json_result: Destination file path. Its parent directory is created
            if it does not exist yet; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(json_result)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # ``ensure_ascii=False`` emits raw non-ASCII characters, so the file must be
    # opened with an explicit encoding rather than the locale default.
    with open(json_result, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


In [None]:
# --- Paths ---
output_dir = "./out"
cache_dir = "/dpc/kunf0097/l3-8b"
eval_data_path = "./data/1/eval_medical_2k.json"
log_file = None  # when left as None, a default path is derived later

# --- Models: candidate (``name``) and evaluator (``eval_name``) ---
name = "meta-llama/Meta-Llama-3-8B-Instruct"
eval_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# --- Run identity and Weights & Biases settings ---
run_id = datetime.now().strftime("%y%m%d%H%M%S")
log2wandb: bool = True
project = "huggingface"
entity = "my-ku-org"

# --- Evaluation settings ---
evals_per_example = 2
batch_size = 4

In [None]:
# Load the prompt templates for the candidate and evaluator models.
chat_prompt, evaluator_prompt = get_prompts_from_template("template.yaml", name, eval_name)
print("chat_prompt: ", chat_prompt)
print("evaluator_prompt: ", evaluator_prompt)

if log2wandb and (project is None or entity is None):
    raise ValueError("Both 'project' and 'entity' must be set if 'log2wandb' is True.")

if log_file is None:
    # Use the last path component so model names without an "org/" prefix work too.
    log_file = f"{output_dir}/results_{name.split('/')[-1]}_{run_id}.json"

# A notebook cell's frame has no function arguments, so inspect.getargvalues()
# reports nothing for it; dump the run configuration explicitly instead.
run_config = {
    "output_dir": output_dir,
    "cache_dir": cache_dir,
    "eval_data_path": eval_data_path,
    "log_file": log_file,
    "name": name,
    "eval_name": eval_name,
    "run_id": run_id,
    "log2wandb": log2wandb,
    "project": project,
    "entity": entity,
    "evals_per_example": evals_per_example,
    "batch_size": batch_size,
}
logg("")
print("\n".join(f"\t{key}: {value}" for key, value in run_config.items()))
logg("")

In [None]:
# Load the evaluator (judge) model and its tokenizer.
evaluator_tokenizer, evaluator_model = get_tokenizer_and_model(
    model_name=eval_name, cache_dir=cache_dir
)

if name == eval_name:
    # Candidate and evaluator are the same checkpoint: reuse the loaded objects
    # instead of holding a second copy of the weights in memory.
    candidate_tokenizer, candidate_model = evaluator_tokenizer, evaluator_model
else:
    candidate_tokenizer, candidate_model = get_tokenizer_and_model(
        model_name=name, cache_dir=cache_dir
    )

In [None]:
data = load_dataset("json", data_files=eval_data_path)

# generate_and_tokenize_prompt formats a single example, so map row by row.
# With batched=True each column arrives as a list of values, which would be
# formatted into the prompt as a Python list rather than as one example's text.
eval_dataset = data["train"].map(
    lambda x: generate_and_tokenize_prompt(x, candidate_tokenizer, chat_prompt),
)

In [None]:
# Peek at the first example. Iterating the dataset yields one dict per row
# (keyed by column name), so rows must be indexed on the dataset itself.
eval_dataset[0]

In [None]:
import wandb

In [None]:
import json
# Reload the results of an earlier run. The file is read as UTF-8 explicitly
# so decoding does not depend on the platform's locale encoding.
with open("out/results_240623023136_240628153415.json", "r", encoding="utf-8") as f:
    results = json.load(f)

In [None]:
# Display the first loaded result record to check its structure.
results[0]

In [None]:
# Weights & Biases destination for the replayed metrics.
project = "huggingface"
entity = "my-ku-org"

# Start the run that the following cells log into.
wandb.init(
    project=project,
    entity=entity,
    name="llaaj-l3-8b-instruct",
)

In [None]:
from scipy.stats import pearsonr

# Replay the saved results one example at a time, logging the per-example
# scores together with running statistics over everything seen so far.
for i, d in enumerate(results):

    rresults = results[: i + 1]  # all examples up to and including this one

    # Transpose so that each tuple holds one evaluator's scores across examples.
    results_t = list(zip(*[row["llm_scores"] for row in rresults]))
    avg_llm_scores = [row["avg_llm_score"] for row in rresults]

    # Pearson correlation for each pair of evaluator columns. With a single
    # example there is nothing to correlate, so 0 is logged instead.
    pcc_results = {
        f"pcc_{a}_{b}": (
            pearsonr(results_t[a], results_t[b])[0] if len(results_t[a]) > 1 else 0
        )
        for a in range(len(results_t))
        for b in range(a + 1, len(results_t))
    }

    # Running mean of each evaluator column.
    avg_scores = {
        f"avg_llm_score_{k}": sum(scores) / len(scores) for k, scores in enumerate(results_t)
    }

    # One key per evaluator score of the current example, however many there are.
    llm_scores = {f"llm_score_{k}": score for k, score in enumerate(d["llm_scores"])}

    wandb.log(
        {
            "index": i,
            "avg_llm_score": d["avg_llm_score"],
            **llm_scores,
            **avg_scores,
            **pcc_results,
            "run_score": sum(avg_llm_scores) / len(avg_llm_scores),
        }
    )

In [None]:
# Log the final running statistics once more. ``avg_scores``, ``pcc_results``
# and ``avg_llm_scores`` must already be defined in the kernel.
final_summary = {**avg_scores, **pcc_results}
final_summary["run_score"] = sum(avg_llm_scores) / len(avg_llm_scores)
wandb.log(final_summary)

In [None]:
# Create an empty W&B table whose columns are the keys of a result record.
table = wandb.Table(columns=[*results[0]])

In [None]:
# Display the keys of the first result record.
results[0].keys()

In [None]:
# Display the values of one result record. Index ``results`` directly so the
# cell does not depend on a loop variable left over from another cell.
list(results[0].values())

In [None]:
# Reset the table by replacing it with a fresh, empty one. Deleting the name
# outright would make any later cell that uses ``table`` fail with a NameError.
table = wandb.Table(columns=list(results[0].keys()))

In [None]:
# Append every result record to the table as one row. The values are passed
# positionally, in the record's own key order.
# NOTE(review): assumes ``table`` exists and that every record has the same keys
# in the same order as the table's columns - confirm.
for r in results:
    table.add_data(*r.values())

In [None]:
# Log the table to W&B under the key "Evaluation Results".
wandb.log({"Evaluation Results": table})

In [None]:
# Finish the W&B run.
wandb.finish()

### enshiallah

In [None]:
import json
# Reload the saved results for the second logging attempt. The file is read as
# UTF-8 explicitly so decoding does not depend on the platform's locale encoding.
with open("out/results_240623023136_240628153415.json", "r", encoding="utf-8") as f:
    results = json.load(f)

In [None]:
# Display the first reloaded result record.
results[0]

{'expected': 'Hi thanks for contacting Chat Doctor.... Your liver enzymes are high....with increased bilirubin. Your albumin level low ....so you are having chronic liver problem.... You might have cirrhosis or chronic hepatitis.... For grading liver biopsy needed... Your physical examination must be done for splenomegaly and ascites..... If portal hypertension present beta blocker needed... For jaundice fruits taken more.... Excess fatty diet avoided.... Sugar cane juice, apple juice taken more.... Avoid strenuous work.... Consult gastroenterologist for detail examination and further opinion... Take care.... Chat Doctor.',
 'generated': ' Hello, Your mother is in stage of liver failure. She is having high bilirubin level, high transaminases and high alkaline phosphatase. Her electrolyte levels are also abnormal. She is having high urea levels and low sodium, potassium and chloride levels. She needs immediate attention. She should be taken to hospital and blood tests should be repeated

In [106]:
import wandb
# Create an empty W&B table whose columns are the keys of a result record.
table = wandb.Table(columns=[*results[0]])

In [99]:
# Add every result record to the table and also log it as its own W&B step.
# NOTE(review): wandb.log() needs an active run. The output recorded for this
# cell is "You must call wandb.init() before wandb.log()", so wandb.init() has
# to be executed before this cell.
for r in results:
    table.add_data(*r.values())
    # wandb.log({"Evaluation Results": table})
    wandb.log(r)

Error: You must call wandb.init() before wandb.log()

In [107]:
# Start the W&B run and keep a handle to it in ``run``.
run = wandb.init(
    project="huggingface",
    entity="my-ku-org",
    name="laaj-llama-3-8b-medical-v240623023136",
)

BrokenPipeError: [Errno 32] Broken pipe

In [62]:
# Log the table to W&B under the key "Evaluation Results".
wandb.log({"Evaluation Results": table})

In [95]:
# Finish the W&B run.
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


wandb: ERROR Error while calling W&B API: failed to find run huggingface/4kqf62b4 (<Response [404]>)
wandb: ERROR Committing artifact failed. Artifact QXJ0aWZhY3Q6OTM5NDY5MTY1 won't be finalized.
wandb: ERROR It appears that you do not have permission to access the requested resource. Please reach out to the project owner to grant you access. If you have the correct permissions, verify that there are no issues with your networking setup.(Error 404: Not Found)
wandb: ERROR Error while calling W&B API: run huggingface/4kqf62b4 not found during createRunFiles (<Response [404]>)
wandb: ERROR Error while calling W&B API: failed to find run huggingface/4kqf62b4 (<Response [404]>)
wandb: ERROR Error while calling W&B API: run huggingface/4kqf62b4 not found during createRunFiles (<Response [404]>)
wandb: ERROR Error while calling W&B API: failed to find run huggingface/4kqf62b4 (<Response [404]>)
wandb: ERROR Error while calling W&B API: run huggingface/4kqf62b4 not found during createRunFiles

In [None]:
from scipy.stats import pearsonr

# Build one summary record per example, each carrying the running statistics
# computed over all examples up to and including that one.
formatted = []
for i, r in enumerate(results):

    rresults = results[: i + 1]  # prefix of examples seen so far

    # One tuple per evaluator, holding that evaluator's scores over the prefix.
    results_t = list(zip(*(row["llm_scores"] for row in rresults)))
    avg_llm_scores = [row["avg_llm_score"] for row in rresults]

    # Pairwise Pearson correlation between evaluator columns; 0 while the
    # prefix holds only a single example.
    pcc_results = {
        f"pcc_{a}_{b}": (
            pearsonr(results_t[a], results_t[b])[0] if len(rresults) > 1 else 0
        )
        for a in range(len(results_t))
        for b in range(a + 1, len(results_t))
    }

    # Running mean of each evaluator column.
    column_avg = {
        f"avg_llm_score_{k}": sum(column) / len(column)
        for k, column in enumerate(results_t)
    }

    # Running mean of the per-example average scores.
    run_score = sum(avg_llm_scores) / len(avg_llm_scores)

    d = {
        "expected": r["expected"],
        "generated": r["generated"],
        "scores": r["llm_scores"],
        "row_avg": r["avg_llm_score"],
        "no_scores": [],
        "running/pcc": pcc_results,
        "running/column_avg": column_avg,
        "running/run_score": run_score,
    }

    formatted.append(d)