## Import

In [None]:
import os
# Restrict this process to the listed GPU id(s); for several GPUs use a
# comma-separated list such as "0,1,2,3".
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# The device count is one more than the number of commas in the list.
num_gpus = os.environ["CUDA_VISIBLE_DEVICES"].count(",") + 1
print("num_gpus:", num_gpus)
print("CUDA_VISIBLE_DEVICES:", os.environ["CUDA_VISIBLE_DEVICES"])

In [None]:
import re
import sys
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModel
from vllm import LLM, SamplingParams

In [None]:
from dataset.process import format_chat

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and CUDA) and
    ``transformers``, exports ``PYTHONHASHSEED``, sets the cuDNN
    ``deterministic`` flag, and prints the seed that was applied.

    Args:
        seed: Value convertible to ``int``; used as the seed everywhere.
    """
    seed = int(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        transformers.set_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    print(f"seed everything: {seed}")

# Fix the global seed once; the same value is passed to vLLM model loading and
# to the sampling parameters further down.
seed = 42
seed_everything(seed=seed)

## Task

## Tokenizer

In [None]:
# Read the JSON mapping that is indexed by model name to get a model path.
with open("dict_model_path.json") as f:
    dict_model_path = json.loads(f.read())

In [None]:
# Select the model by its key in dict_model_path and resolve its path.
model_name = 'Llama-3.1-8B-Instruct'
path_dir_model = dict_model_path[model_name]
path_dir_model  # display the resolved path

In [None]:
# Load the tokenizer for the selected model, padding on the left side.
tokenizer = AutoTokenizer.from_pretrained(path_dir_model, padding_side='left')

## Task class

In [None]:
class EmptyArgs:
    """Minimal stand-in for the argument object the project code receives.

    Attributes are filled from notebook state: a fixed batch size, the list of
    visible GPU ids, and the selected model's path and name.
    """

    def __init__(self):
        visible_gpus = os.environ["CUDA_VISIBLE_DEVICES"]
        self.batch_size = 32
        self.gpus = visible_gpus.split(",")
        self.model_path, self.model_name = path_dir_model, model_name

# Shared argument object handed to the config loader and the task class below.
args = EmptyArgs()

In [None]:
from model.init import load_config

# Run the project config loader on args (its return value is not used).
# NOTE(review): presumably this adds max_token_all / max_token_input /
# max_token_output, which later cells read from args — confirm in model.init.
load_config(args)
args.__dict__  # display the resulting attributes

In [None]:
# Task identifier; also used to build the raw dataset file name.
task_name = "BrainMRI-AIS"

In [None]:
# Load the raw SFT file for the task and keep only the "test" split records.
path_file_data = f"dataset_raw/{task_name}.SFT.json"
print(f"Loading {path_file_data} ...")
with open(path_file_data) as file:
    list_dict_data = [
        record for record in json.load(file) if record["split"] == "test"
    ]
print(f"The number of data: {len(list_dict_data)}")

In [None]:
list_dict_data[0]  # peek at the first test record

In [None]:
from dataset.classification import Task_clf_Brain_MRI_AIS
# Instantiate the project's task class for this dataset.
task = Task_clf_Brain_MRI_AIS(args=args, task=task_name)

In [None]:
# Prepare the task for the "direct" prompt mode. The mode string must match the
# one passed to task.get_pred / task.get_label later in this notebook.
task.setup(tokenizer=tokenizer, prompt_mode='direct')

## Model

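Choose one way to load the model and run only that subsection (the Huggingface and vLLM subsections both assign `model`; the inference cells below use the vLLM API):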
- Huggingface model loading
- Huggingface pipeline
- vLLM

### Huggingface

In [None]:
# Load the checkpoint together with its language-modeling head. The bare
# AutoModel class returns only the transformer backbone, which cannot generate
# text and has no generation_config for the following cell to configure.
model = transformers.AutoModelForCausalLM.from_pretrained(
    path_dir_model, torch_dtype=torch.bfloat16, device_map='auto'
)

In [None]:
# set the model to the evaluation mode
model.eval()
# greedy decoding: turn sampling off and clear the sampling-only parameters
model.generation_config.do_sample = False
model.generation_config.temperature = None
model.generation_config.top_k = None
model.generation_config.top_p = None

In [None]:
# Make sure both the tokenizer and the model's generation config have a pad
# token id. EOS is used only when the tokenizer defines no pad token itself.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Tokenizer: Now pad_token_id is:", tokenizer.pad_token_id)
else:
    print("Tokenizer: pad_token_id is already set:", tokenizer.pad_token_id)
if model.generation_config.pad_token_id is None:
    # Mirror the tokenizer's pad id (equal to EOS if it was just defaulted
    # above) so the model and the tokenizer agree on the padding token.
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    print("Model: Now pad_token_id is:", model.generation_config.pad_token_id)
else:
    print("Model: pad_token_id is already set:", model.generation_config.pad_token_id)

### HF Pipeline

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the same model path on the CUDA device.
pipe = pipeline("text-generation", model=path_dir_model, device="cuda")

In [None]:
# Quick smoke test: ask the pipeline for one continuation of a short prompt.
str_input = "The medical condition is characterized by"
generated_text = pipe(str_input, num_return_sequences=1)

### vLLM 

In [None]:
# Model names containing "mistral" but not "biomistral" are loaded with vLLM's
# mistral tokenizer/load/config formats; every other model uses the default
# loader with eager execution.
if "mistral" not in model_name.lower() or "biomistral" in model_name.lower():
    print(f"Loading {model_name} ...")
    model = LLM(
        model=path_dir_model,
        tensor_parallel_size=num_gpus,
        dtype="bfloat16",
        seed=seed,
        max_model_len=args.max_token_all,
        gpu_memory_utilization=0.9,
        enforce_eager=True,
    )
else:
    print(f"Loading {model_name} with mistral mode...")
    model = LLM(
        model=path_dir_model,
        tensor_parallel_size=num_gpus,
        dtype="bfloat16",
        seed=seed,
        max_model_len=args.max_token_all,
        tokenizer_mode="mistral",
        load_format="mistral",
        config_format="mistral",
    )

In [None]:
# Greedy decoding (temperature 0) with the output length capped at
# args.max_token_output tokens.
sampling_params = SamplingParams(seed=seed, temperature=0, max_tokens=args.max_token_output)

## Inference - test

In [None]:
# Pick a single test record for a quick end-to-end check.
idx = 0
data = list_dict_data[idx]
data  # display the selected record

In [None]:
# Turn the selected record into the prompt string for this model.
# NOTE(review): presumably format_chat applies the chat template, adds
# task.examples and enforces max_token_input — confirm in dataset.process.
formatted_input= format_chat(
    model_name=model_name,
    tokenizer=tokenizer,
    data=data,
    max_token_input=args.max_token_input,
    examples=task.examples,
)

In [None]:
print(formatted_input)  # inspect the prompt that will be sent to the model

In [None]:
# Run the single prompt through the vLLM model and print it next to the first
# completion returned for it.
output = model.generate(formatted_input, sampling_params=sampling_params, use_tqdm=False)
for output_one in output:
    generated_text = output_one.outputs[0].text
    print(
        f"Prompt:\n\t{formatted_input}",
        "-"*50,
        f"Generated text:\n\t{generated_text}",
        sep="\n",
    )

## Inference - task

In [None]:
# Upper bound on how many test records are run through the model below.
num_sample = 100

In [None]:
# Build the prompt for each of the first num_sample records and record its
# token count, reporting every prompt that is longer than max_token_input.
list_input, list_num_token = [], []
for idx_data, dict_data in enumerate(list_dict_data[:num_sample]):
    input_llm = format_chat(
        model_name=model_name,
        tokenizer=tokenizer,
        data=dict_data,
        max_token_input=args.max_token_input,
        examples=task.examples,
    )
    len_token_input = len(tokenizer.tokenize(input_llm))
    if len_token_input > args.max_token_input:
        print(f"Input exceeds max token limit: id-{idx_data} - {len_token_input} > {args.max_token_input}")
    list_input.append(input_llm)
    list_num_token.append(len_token_input)
print(f"Data size: {len(list_input)}")

In [None]:
print(list_input[1])  # inspect the second formatted prompt

In [None]:
# Summary statistics (pandas describe) of the prompt token counts.
dict_stat_num_token = pd.Series(list_num_token).describe().to_dict()
print(dict_stat_num_token)

In [None]:
# Count the prompts whose token count exceeds args.max_token_input.
num_truncate = sum(num_token > args.max_token_input for num_token in list_num_token)
# Report 0% for an empty sample instead of raising ZeroDivisionError.
proportion_truncate = num_truncate / len(list_num_token) if list_num_token else 0.0
print(f"The number of data will be truncated: {num_truncate}")
print(f"The proportion of data will be truncated: {proportion_truncate:.2%}")

In [None]:
# Histogram of prompt token counts across the sampled records.
plt.hist(list_num_token, bins=30, alpha=0.7)
plt.title('Token Count Distribution')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Generate for all prompts in one call and keep the text of the first
# completion of each returned item.
output = model.generate(list_input, sampling_params=sampling_params, use_tqdm=True)
list_pred = [request_output.outputs[0].text for request_output in output]

## Result

In [None]:
# Print every prompt together with the text generated for it.
for input_text, pred_text in zip(list_input, list_pred):
    print(
        f"Input text:\n\t{input_text}",
        "-"*50,
        f"Generated text:\n\t{pred_text}",
        "="*100,
        sep="\n",
    )

In [None]:
# Store each generated text on its source record under the "pred" key.
# list_pred is indexed by position, so it must cover every record in the slice.
for idx_data, dict_data in enumerate(list_dict_data[:num_sample]):
    dict_data["pred"] = list_pred[idx_data]

In [None]:
# Ask the task object for predictions and labels of the sampled records.
# NOTE(review): presumably get_pred parses the "pred" field written by the
# previous cell — confirm in dataset.classification.
list_pred_extracted = task.get_pred(list_dict_data[:num_sample], prompt_mode="direct")
list_label_extracted = task.get_label(list_dict_data[:num_sample], prompt_mode="direct")

### list_pred

In [None]:
# Number of examples actually evaluated: the test split may hold fewer than
# num_sample records, so num_sample itself is not a safe denominator.
num_evaluated = len(list_label_extracted[:num_sample])
# NOTE(review): get_pred_none presumably handles predictions that could not be
# extracted and returns how many there were — confirm in the task class.
list_pred_extracted, num_failed = task.get_pred_none(list_pred=list_pred_extracted[:num_sample], list_label=list_label_extracted[:num_sample])
# Report 0% when nothing was evaluated instead of raising ZeroDivisionError.
rate_failed = num_failed / num_evaluated if num_evaluated else 0.0
print(f"The number of failed data: {num_failed} ({rate_failed:.2%})")

In [None]:
# Compute the task metrics from the predictions and labels.
dict_performance = task.get_performance(list_pred_extracted, list_label_extracted)

In [None]:
dict_performance  # display the metrics

### list_list_pred

In [None]:
# Number of examples actually evaluated: the test split may hold fewer than
# num_sample records, so num_sample itself is not a safe denominator.
num_evaluated = len(list_label_extracted[:num_sample])
# NOTE(review): get_pred_none presumably handles predictions that could not be
# extracted and returns how many there were — confirm in the task class.
list_pred_extracted, num_failed = task.get_pred_none(list_pred=list_pred_extracted[:num_sample], list_label=list_label_extracted[:num_sample])
# Report 0% when nothing was evaluated instead of raising ZeroDivisionError.
rate_failed = num_failed / num_evaluated if num_evaluated else 0.0
print(f"The number of failed data: {num_failed} ({rate_failed:.2%})")

In [None]:
# Compute the metrics; here the result is unpacked into two values.
# NOTE(review): the earlier cell binds get_performance's result to a single
# name — presumably the return shape depends on whether predictions are a flat
# list or a list of lists; confirm before running both variants.
dict_performance, dict_performance_sample = task.get_performance(list_pred_extracted, list_label_extracted)

In [None]:
dict_performance  # display the metrics

## End

In [None]:
# Marker that the notebook ran to the end.
print('Done!')