## Sandbox for LLM Metrics Scripts

In [1]:
### testing report generation with compute time tracking

import logging
import math
import os
import time
from typing import Dict, Any, List, Callable
import warnings

import torch
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetName
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import logging as transformers_logging

from utils import (
    get_device,
    get_hardware_info,
    evaluate_latency_throughput,
    evaluate_power_efficiency,
    compare_precision_accuracy,
    memory_by_sequence_length,
)
from metrics import (
    calculate_perplexity,
    calculate_f1_score,
    calculate_precision_recall,
    calculate_mean_reciprocal_rank,
    calculate_mean_average_precision,
    calculate_brevity_score,
    #calculate_gpt_score,
    #calculate_ragas_score,
    #calculate_helm_score,
    #calculate_forgetting_rate,
)

## Suppress annoying jupyter notebook specific warnings
warnings.filterwarnings("ignore", message="Setting `pad_token_id` to `eos_token_id`:None for open-end generation.")
warnings.filterwarnings(
    "ignore", message="Asking to truncate to max_length but no maximum length is provided"
)
warnings.filterwarnings("ignore", message="Default to no truncation.")
# Suppress transformers library messages
transformers_logging.set_verbosity_error()


## Use one of the below methods to login to huggingface
## Option 1:
#os.environ["HF_TOKEN"] = "your_access_token" 
## Option 2:
from huggingface_hub import notebook_login
notebook_login()


# Timing decorator
def timing_decorator(func: Callable) -> Callable:
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        wrapper.last_run_time = elapsed_time
        print(f"{func.__name__} took {elapsed_time:.2f} seconds")
        return result

    wrapper.last_run_time = 0
    return wrapper

# Evaluation functions with timing
@timing_decorator
def evaluate_latency_step(model, tokenizer, prompt, max_tokens, batch_size):
    return evaluate_latency_throughput(model, tokenizer, prompt, max_tokens, batch_size)


@timing_decorator
def evaluate_power_step(model, tokenizer, prompt, max_tokens, batch_size):
    return evaluate_power_efficiency(model, tokenizer, prompt, max_tokens, batch_size)


@timing_decorator
def evaluate_precision_step(model_name, prompt, max_tokens):
    return compare_precision_accuracy(model_name, prompt, max_tokens)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Trying to get this stupid memory evaluation step working

In [2]:
@timing_decorator
def evaluate_memory(model, tokenizer, prompt, sequence_lengths, max_new_tokens=50):
    """
    Evaluate memory usage for varying sequence lengths, with device-agnostic behavior and proper memory measurement.

    Args:
        model: The model to evaluate.
        tokenizer: The tokenizer to use.
        prompt (str): The input prompt for the model.
        sequence_lengths (List[int]): List of sequence lengths to evaluate.
        max_new_tokens (int): The maximum number of tokens to generate.

    Returns:
        dict: Memory usage for each sequence length.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nOptimized Memory Usage Evaluation (Device: {device.type.upper()}):")

    memory_results = {}

    for length in sequence_lengths:
        print(f"  - Evaluating sequence length: {length}")

        # Truncate or pad the prompt to match the sequence length
        input_text = prompt[:length]
        encoded_input = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=length,
            truncation=True,
            padding="max_length"
        ).to(device)

        try:
            with torch.no_grad():
                if device.type == "cuda":
                    # Clear GPU memory cache
                    torch.cuda.empty_cache()
                    memory_before = torch.cuda.memory_allocated(device)
                    model.generate(
                        input_ids=encoded_input["input_ids"],
                        max_new_tokens=max_new_tokens,
                        pad_token_id=tokenizer.pad_token_id
                    )
                    memory_after = torch.cuda.memory_allocated(device)
                    memory_usage = (memory_after - memory_before) / (1024 ** 2)  # Convert to MB

                else:
                    # Measure memory on CPU
                    process = psutil.Process()
                    mem_before = process.memory_info().rss
                    model.generate(
                        input_ids=encoded_input["input_ids"],
                        max_new_tokens=max_new_tokens,
                        pad_token_id=tokenizer.pad_token_id
                    )
                    mem_after = process.memory_info().rss
                    memory_usage = (mem_after - mem_before) / (1024 ** 2)  # Convert to MB

                # Validate memory usage
                if memory_usage <= 0:
                    print(f"    Warning: Memory usage for sequence length {length} was negligible.")
                memory_results[length] = memory_usage
                print(f"    Memory usage for sequence length {length}: {memory_usage:.2f} MB")

        except RuntimeError as e:
            print(f"    Failed to evaluate for sequence length {length}: {e}")
            memory_results[length] = "OOM"  # Out of Memory error

    print("Memory evaluation completed.\n")
    return memory_results



# THE ONE BELOW WORKS BUT GIVES 0.0 MB for what's consumed in memory.
# def evaluate_memory_step(model, tokenizer, prompt, sequence_lengths, max_new_tokens=1):
#     """
#     Evaluate memory usage for varying sequence lengths, using `max_new_tokens` for token generation.

#     Args:
#         model: The model to evaluate.
#         tokenizer: The tokenizer to use.
#         prompt (str): The input prompt for the model.
#         sequence_lengths (List[int]): List of sequence lengths to evaluate.
#         max_new_tokens (int): The maximum number of tokens to generate.

#     Returns:
#         dict: Memory usage for each sequence length.
#     """
#     print("\nOptimized Memory Usage Evaluation:")
#     start_time = time.time()
#     memory_results = {}

#     for length in sequence_lengths:
#         print(f"  - Evaluating sequence length: {length}")
        
#         # Truncate or pad the prompt to match the sequence length
#         input_text = prompt[:length]
#         encoded_input = tokenizer(
#             input_text,
#             return_tensors="pt",
#             max_length=length,
#             truncation=True,
#             padding="max_length"
#         ).to(model.device)

#         try:
#             # Measure memory before and after generation
#             with torch.no_grad():
#                 memory_before = torch.cuda.memory_allocated(model.device)
#                 model.generate(
#                     input_ids=encoded_input["input_ids"],
#                     max_new_tokens=max_new_tokens,  # Fixed the issue here
#                     pad_token_id=tokenizer.pad_token_id
#                 )
#                 memory_after = torch.cuda.memory_allocated(model.device)
#                 memory_usage = (memory_after - memory_before) / (1024 ** 2)  # Convert to MB

#             # Store and print results
#             memory_results[length] = memory_usage
#             print(f"    Memory usage for sequence length {length}: {memory_usage:.2f} MB")

#         except RuntimeError as e:
#             print(f"    Failed to evaluate for sequence length {length}: {e}")
#             memory_results[length] = "OOM"  # Out of Memory error
#     end_time = time.time()
#     elapsed_time = end_time - start_time
#     print(f"Memory evaluation completed in {elapsed_time:.2f} seconds.\n")
#     return {"memory_results": memory_results, "elapsed_time": elapsed_time}












# @timing_decorator
# def evaluate_memory_step(model, tokenizer, prompt, sequence_lengths=[128], max_tokens=1):
#     """
#     Evaluate memory usage for varying sequence lengths, optimized for performance and clear output.

#     Args:
#         model: The model to evaluate.
#         tokenizer: The tokenizer to use.
#         prompt (str): The input prompt for the model.
#         sequence_lengths (List[int]): List of sequence lengths to evaluate.
#         max_tokens (int): The maximum number of tokens to generate (default: 1).

#     Returns:
#         dict: Memory usage for each sequence length.
#     """
#     print("\nOptimized Memory Usage Evaluation:")
#     memory_results = {}

#     for length in sequence_lengths:
#         print(f"  - Evaluating sequence length: {length}")
        
#         # Truncate or pad the prompt to match the sequence length
#         input_text = prompt[:length]
#         encoded_input = tokenizer(
#             input_text,
#             return_tensors="pt",
#             max_length=length,
#             truncation=True,
#             padding="max_length"
#         ).to(model.device)

#         try:
#             # Measure memory before and after generation
#             with torch.no_grad():
#                 memory_before = torch.cuda.memory_allocated(model.device)
#                 model.generate(input_ids=encoded_input["input_ids"], max_length=max_tokens)
#                 memory_after = torch.cuda.memory_allocated(model.device)
#                 memory_usage = (memory_after - memory_before) / (1024 ** 2)  # Convert to MB

#             # Store and print results
#             memory_results[length] = memory_usage
#             print(f"    Memory usage for sequence length {length}: {memory_usage:.2f} MB")

#         except RuntimeError as e:
#             print(f"    Failed to evaluate for sequence length {length}: {e}")
#             memory_results[length] = "OOM"  # Out of Memory error

#     print("Memory evaluation completed.\n")
#     return memory_results


# def evaluate_memory_step(model, tokenizer, prompt, max_tokens, sequence_lengths, verbose=True):
#     memory_results = {}
#     for length in sequence_lengths:
#         if verbose:
#             print(f"  - Evaluating sequence length: {length}")
#         memory_usage = memory_by_sequence_length(model, tokenizer, prompt, max_tokens, length)
#         memory_results[f"sequence_length_{length}"] = memory_usage
#         if verbose:
#             print(f"    Memory usage for {length} tokens: {memory_usage} MB")
#     return memory_results


In [3]:
def evaluate_brevity(model, tokenizer, reference, prompt, max_new_tokens=50, verbose=False):
    """
    Evaluate brevity score for the generated text.

    Args:
        model: The model to evaluate.
        tokenizer: The tokenizer to use.
        reference (str): The reference text.
        prompt (str): The input prompt for the model.
        max_new_tokens (int): The maximum number of tokens to generate.

    Returns:
        float: The brevity score (0 <= BP <= 1).
    """
    # Encode the prompt
    encoded_input = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(model.device)

    # Generate output
    outputs = model.generate(
        input_ids=encoded_input["input_ids"],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode the generated output
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Calculate lengths
    reference_length = len(reference.split())
    generated_length = len(generated_text.split())

    # Calculate brevity penalty
    if generated_length == 0:
        print("Warning: Generated text is empty. Returning BP = 0.")
        return 0.0
    elif generated_length >= reference_length:
        brevity_penalty = 1.0  # No penalty for long outputs
    else:
        brevity_penalty = math.exp(1 - (reference_length / generated_length))  # Short output penalty
    
    if verbose:
        return brevity_penalty, generated_text
    else:
        return brevity_penalty

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import get_device

device = "cpu" #get_device()
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '</s>' or '<|endoftext|>'})

In [17]:
prompt = "Write the continuation of this text in fluent English:\n\n" + "It was the best of times, it was the"

In [21]:
%%time
encoded_input = tokenizer(prompt,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding="max_length"
).to(model.device)

outputs = model.generate(
    input_ids=encoded_input["input_ids"],
    max_new_tokens=50,
    pad_token_id=tokenizer.pad_token_id
)
tokenizer.decode(outputs[0], skip_special_tokens=True)

CPU times: user 57.6 s, sys: 2.39 s, total: 60 s
Wall time: 56.5 s


'Write the continuation of this text in fluent English:\n\nIt was the best of times, it was the worst of times. It was the age of wisdom, it was the age of foolishness. It was the epoch of belief, it was the epoch of incredulity. It was the season of Light, it was the season of Darkness,'

In [None]:
reference_text = (
    "It was the best of times, it was the worst of times, it was the age of wisdom, "
    "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, "
    "it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair."
)

# Evaluate brevity score
brevity_score, gen_text = evaluate_brevity(model, tokenizer, reference_text, prompt, verbose=True)
print(f"Brevity Score: {brevity_score:.4f}")

In [None]:
(
    "It was the best of times, it was the worst of times, it was the age of wisdom, "
    "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, "
    "it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair."
)

In [10]:
tokenizer.pad_token_id

In [7]:
tokenizer.pad_token

'</s>'

In [22]:


# # Correct way to load the model
# # model_name = "mistralai/Mistral-7B-v0.1"
# model_name = "mistralai/Mistral-7B-v0.3"
# # model_name = "meta-llama/Llama-3.2-1B-Instruct"
# # model_name = "meta-llama/Llama-3.2-1B"
# model = AutoModelForCausalLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token or '<|endoftext|>'})

In [7]:
# Reference and prompt
reference_text = (
    "It was the best of times, it was the worst of times, it was the age of wisdom, "
    "it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, "
    "it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair."
)
prompt = "It was the best of times, it was the worst of"

# Evaluate brevity score
brevity_score, gen_text = evaluate_brevity(model, tokenizer, reference_text, prompt, verbose=True)
print(f"Brevity Score: {brevity_score:.4f}")

Brevity Score: 0.2231


In [28]:
prompt = "Write the continuation of this text in fluent English:\n\n" + "It was the best of times, it was the"

In [29]:
encoded_input = tokenizer(prompt,
    return_tensors="pt",
    max_length=128,
    truncation=True,
    padding="max_length"
).to(model.device)

In [30]:
%%time
outputs = model.generate(
    input_ids=encoded_input["input_ids"],
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,  # Nucleus sampling
    temperature=0.7,  # Adjust creativity
    pad_token_id=tokenizer.pad_token_id
)
tokenizer.decode(outputs[0], skip_special_tokens=True)

CPU times: user 13min 8s, sys: 1min 33s, total: 14min 42s
Wall time: 1min 53s


'Write the continuation of this text in fluent English:\n\nIt was the best of times, it was the most of the. |\n\n 1908 the 1880—1890 гг.\n\n1895 году, по указу стало 12000000.\n\n1900 году, по указу стало 12000000.\n\n1905 году, по указу стало 120000000.\n'

In [27]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Write the continuation of this text in fluent English:\n\nThe quick brown fox jumped over the fence, the first time.\n\n.\n\n.\n.\n\n.\n.\n.\n\n и о.\n.\n\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n.\n'

In [21]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'What did the quick brown fox jump over?\n\nчиследование нашей, 20107 |\n| 15 18 17 19 20 25 29 30 37 42 4'

In [18]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"Complete this quote, 'It was the best of times, it was the worst of'\n\nи в 1858 году, когда вскоре после начала правления на царство уже входил в план единственной епархии, — в 1837 году была"

In [14]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'It was the best of times, it was the worst of times, it was the age of wisdom,  and also,  when they are not, it is also possible to 1852, и в настоящем году на 21-ым. 1852 года на 1852 году.\n'

In [8]:
gen_text

'It was the best of times, it was the worst of       and 1980-х годах.\n\n 1981 году.\n\n## 1982 году.\n\n## 1983 году.\n\n## 1'

### Next ones to make work.

In [3]:
@timing_decorator
def evaluate_metrics_step(probabilities, precision, recall, true_positive, false_positive, false_negative, ranks, relevance_scores, prompt, outputs):
    return {
        "perplexity": calculate_perplexity(probabilities),
        "f1_score": calculate_f1_score(precision, recall),
        "precision_recall": calculate_precision_recall(true_positive, false_positive, false_negative),
        "mrr": calculate_mean_reciprocal_rank(ranks),
        "map": calculate_mean_average_precision(relevance_scores),
        "brevity_score": calculate_brevity_score(prompt, outputs),
        #"gpt_score": calculate_gpt_score(prompt, outputs),
#         "ragas_score": calculate_ragas_score(prompt, outputs),
#         "helm_score": calculate_helm_score(prompt, outputs),
#         "forgetting_rate": calculate_forgetting_rate(prompt, outputs),
    }

# Trying to estimate runtime
def get_estimated_runtime(selected_steps: List[str], average_times: Dict[str, float]) -> float:
    """
    Estimate the total runtime based on selected steps.

    Args:
        selected_steps (List[str]): List of evaluation steps to perform.
        average_times (Dict[str, float]): Average times for each step.

    Returns:
        float: Estimated runtime in seconds.
    """
    return sum(average_times.get(step, 0) for step in selected_steps)

def evaluate_model(
    model_name: str = "mistralai/Mistral-7B-Instruct-v0.3",
    evaluate_latency: bool = True,
    evaluate_power: bool = True,
    evaluate_precision: bool = True,
    evaluate_memory: bool = True,
    evaluate_metrics: bool = True,
    quantized: bool = False,
    prompt: str = "What is the impact of climate change?",
    batch_size: int = 4,
    max_tokens: int = 128,
    sequence_lengths: List[int] = [128],#, 512], #256, 512, 1024],
    relevance_scores: List[List[int]] = [[1, 0, 1, 1, 0]],
    probabilities: List[float] = [0.2, 0.3, 0.1, 0.4],
    ranks: List[int] = [1, 3, 2, 0],
    precision: float = 0.8,
    recall: float = 0.75,
    true_positive: int = 50,
    false_positive: int = 10,
    false_negative: int = 15,
) -> Dict[str, Any]:
    """
    Evaluate an LLM using specified metrics and utilities.
    """
    report = {"model_name": model_name, "evaluation_results": {}, "conditions": {}, "timing": {}}

    # Detect device
    device = get_device()
    print(f"Using device: {device}")

    # Load model and tokenizer
    print(f"Loading model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '</s>' or '<|endoftext|>'})
    model.resize_token_embeddings(len(tokenizer))

    # Hardware and conditions
    model_parameters = sum(p.numel() for p in model.parameters())
    report["conditions"] = {
        "prompt": prompt,
        "batch_size": batch_size,
        "max_tokens": max_tokens,
        "sequence_lengths": sequence_lengths,
        "device": device.type,
        "model_parameters": model_parameters,
        "quantized": quantized,
    }

    # Step execution
    total_start_time = time.time()

    if evaluate_latency:
        print("Evaluating latency...")
        outputs, latency, throughput, token_throughput = evaluate_latency_step(model, tokenizer, prompt, max_tokens, batch_size)
        report["evaluation_results"]["latency_throughput"] = {
            "latency": latency,
            "throughput": throughput,
            "token_throughput": token_throughput,
        }
        report["timing"]["latency"] = evaluate_latency_step.last_run_time

    if evaluate_power and device.type == "cuda":
        print("Evaluating power efficiency...")
        power_consumed, energy_per_token = evaluate_power_step(model, tokenizer, prompt, max_tokens, batch_size)
        report["evaluation_results"]["power_efficiency"] = {
            "power_consumed": power_consumed,
            "energy_per_token": energy_per_token,
        }
        report["timing"]["power_efficiency"] = evaluate_power_step.last_run_time

    if evaluate_precision:
        print("Comparing precision...")
        precision_match = evaluate_precision_step(model_name, prompt, max_tokens)
        report["evaluation_results"]["precision_comparison"] = {"precision_match": precision_match}
        report["timing"]["precision_comparison"] = evaluate_precision_step.last_run_time

    if evaluate_memory:
        print("Evaluating memory usage...")
        memory_eval = evaluate_memory_step(model, tokenizer, prompt, sequence_lengths, max_tokens)
        report["evaluation_results"]["memory_usage"] = memory_eval["memory_results"]
        report["timing"]["memory_evaluation"] = memory_eval["elapsed_time"]

    if evaluate_metrics:
        print("Evaluating metrics...")
        metrics_results = evaluate_metrics_step(
            probabilities, precision, recall, true_positive, false_positive, false_negative, ranks, relevance_scores, prompt, outputs
        )
        report["evaluation_results"]["metrics"] = metrics_results
        report["timing"]["metrics"] = evaluate_metrics_step.last_run_time

    total_end_time = time.time()
    report["timing"]["total_runtime"] = total_end_time - total_start_time

    print(f"Total runtime: {report['timing']['total_runtime']:.2f} seconds")
    return report

In [None]:
average_times = {
    "evaluate_latency": 2.0,
    "evaluate_power": 1.5,
    "evaluate_precision": 1.8,
    "evaluate_memory": 2.2,
    "evaluate_metrics": 2.5,
}

selected_steps = ["evaluate_latency", "evaluate_power", "evaluate_metrics"]
estimated_runtime = get_estimated_runtime(selected_steps, average_times)
print(f"Estimated runtime: {estimated_runtime:.2f} seconds")

##TODO: Setup pipelines for use with gpt2-xl, see https://huggingface.co/openai-community/gpt2-xl
evaluation_report = evaluate_model(
#     model_name="mistralai/Mistral-7B-v0.1",
#     model_name="mistralai/Mistral-7B-v0.2",
    model_name="mistralai/Mistral-7B-Instruct-v0.3",
#     model_name = "deepseek-ai/DeepSeek-V2.5",
#     model_name = "meta-llama/Llama-3.3-70B-Instruct",
    evaluate_latency=True,
    evaluate_power=True,
    evaluate_precision=True,
    evaluate_memory=True,
    evaluate_metrics=False,
    quantized=False,
)
print("\nEvaluation Report:")
print(evaluation_report)

Estimated runtime: 6.00 seconds
Using device: cpu
Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating latency...
Warming up...
Warm-up Time: 173.59s
Measuring latency and throughput...


Estimated runtime: 6.00 seconds
Using device: mps
Loading model: mistralai/Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating latency...
Warming up...
Warm-up Time: 9.59s
Measuring latency and throughput...
Latency: 7.73s | Throughput: 0.52 responses/sec | Token Throughput: 25.89 tokens/sec
evaluate_latency_step took 17.33 seconds
Comparing precision...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Full Precision (fp32) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
Mixed Precision (fp16) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
evaluate_precision_step took 103.05 seconds
Evaluating memory usage...
  - Evaluating sequence length: 128
    Memory usage for 128 tokens: {128: 0.0, 256: 0.0, 512: 0.0} MB
  - Evaluating sequence length: 512
    Memory usage for 512 tokens: {128: 0.0, 256: 0.0, 512: 0.0} MB
evaluate_memory_step took 164.52 seconds
Evaluating metrics...


TypeError: split() missing 1 required positional argument: 'split_size'

Estimated runtime: 6.00 seconds
Using device: mps
Loading model: mistralai/Mistral-7B-v0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating latency...
Warming up...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Warm-up Time: 9.61s
Measuring latency and throughput...
Latency: 7.70s | Throughput: 0.52 responses/sec | Token Throughput: 25.97 tokens/sec
evaluate_latency_step took 17.32 seconds
Comparing precision...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Full Precision (fp32) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
Mixed Precision (fp16) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
evaluate_precision_step took 115.20 seconds
Evaluating memory usage...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

evaluate_memory_step took 241.61 seconds
Evaluating metrics...


ModuleNotFoundError: No module named 'gpt_score'

In [2]:
def evaluate_model(
    model_name: str = "mistralai/Mistral-7B-v0.1",
    evaluate_latency: bool = True,
    evaluate_power: bool = True,
    evaluate_precision: bool = True,
    evaluate_memory: bool = True,
    evaluate_metrics: bool = True,
    quantized: bool = False,
    prompt: str = "What is the impact of climate change?",
    batch_size: int = 4,
    max_tokens: int = 50,
    sequence_lengths: List[int] = [128, 256, 512, 1024],
    relevance_scores: List[List[int]] = [[1, 0, 1, 1, 0]],
    probabilities: List[float] = [0.2, 0.3, 0.1, 0.4],
    ranks: List[int] = [1, 3, 2, 0],
    precision: float = 0.8,
    recall: float = 0.75,
    true_positive: int = 50,
    false_positive: int = 10,
    false_negative: int = 15,
) -> Dict[str, Any]:
    """
    Evaluate an LLM using specified metrics and utilities.

    Args:
        model_name (str): The Hugging Face model name or path.
        evaluate_latency (bool): Whether to evaluate latency and throughput.
        evaluate_power (bool): Whether to evaluate power efficiency.
        evaluate_precision (bool): Whether to compare precision between fp32 and fp16.
        evaluate_memory (bool): Whether to measure memory usage for varying sequence lengths.
        evaluate_metrics (bool): Whether to compute various metrics like perplexity, F1 score, etc.
        quantized (bool): Whether the model is quantized or not.
        prompt (str): The input prompt for the model.
        batch_size (int): Batch size for evaluation.
        max_tokens (int): Maximum tokens to generate.
        sequence_lengths (List[int]): List of sequence lengths for memory evaluation.
        relevance_scores (List[List[int]]): Relevance scores for MAP calculation.
        probabilities (List[float]): Probabilities for perplexity calculation.
        ranks (List[int]): Ranks for MRR calculation.
        precision (float): Precision score for F1 calculation.
        recall (float): Recall score for F1 calculation.
        true_positive (int): True positives for precision/recall calculation.
        false_positive (int): False positives for precision/recall calculation.
        false_negative (int): False negatives for precision/recall calculation.

    Returns:
        Dict[str, Any]: A detailed report of all evaluations performed.

    Example:
        report = evaluate_model("meta-llama/Llama-2-7b-hf", evaluate_latency=True, evaluate_metrics=True)
        print(report)
    """
    report = {"model_name": model_name, "evaluation_results": {}, "conditions": {}}

    # Detect device
    device = get_device()
    print(f"Using device: {device}")

    # Load model and tokenizer
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token or '<|endoftext|>'})
        print(f"Assigned {tokenizer.pad_token} as the `pad_token`.")
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True).to(device)

    # Hardware information
    hardware_info = get_hardware_info()
    model_parameters = sum(p.numel() for p in model.parameters())
    report["conditions"] = {
        "prompt": prompt,
        "batch_size": batch_size,
        "max_tokens": max_tokens,
        "sequence_lengths": sequence_lengths,
        "num_gpus": hardware_info["num_gpus"],
        "gpu_types": hardware_info["gpu_types"],
        "device": hardware_info["device"],
        "model_parameters": model_parameters,
        "quantized": quantized,
    }

    # Latency and throughput evaluation
    if evaluate_latency:
        print("Evaluating latency and throughput...")
        outputs, latency, throughput, token_throughput = evaluate_latency_throughput(
            model, tokenizer, prompt, max_tokens, batch_size
        )
        report["evaluation_results"]["latency_throughput"] = {
            "latency": latency,
            "throughput": throughput,
            "token_throughput": token_throughput,
        }

    # Power efficiency evaluation
    if evaluate_power and device.type == "cuda":
        print("Evaluating power efficiency...")
        power_consumed, energy_per_token = evaluate_power_efficiency(
            model, tokenizer, prompt, max_tokens, batch_size
        )
        report["evaluation_results"]["power_efficiency"] = {
            "power_consumed": power_consumed,
            "energy_per_token": energy_per_token,
        }
    elif evaluate_power:
        print("Power efficiency evaluation is only supported on CUDA devices.")

    # Precision comparison
    if evaluate_precision:
        print("Comparing precision...")
        precision_match = compare_precision_accuracy(model_name, prompt, max_tokens)
        report["evaluation_results"]["precision_comparison"] = {"precision_match": precision_match}

    # Memory evaluation
    if evaluate_memory:
        print("Evaluating memory usage...")
        memory_results = {}
        for length in sequence_lengths:
            memory_usage = memory_by_sequence_length(model, tokenizer, prompt, max_tokens, length)
            memory_results[f"sequence_length_{length}"] = memory_usage
        report["evaluation_results"]["memory_usage"] = memory_results

    # Metrics evaluation
    if evaluate_metrics:
        print("Evaluating metrics...")
        metrics_results = {
            "perplexity": calculate_perplexity(probabilities),
            "f1_score": calculate_f1_score(precision, recall),
            "precision_recall": calculate_precision_recall(true_positive, false_positive, false_negative),
            "mrr": calculate_mean_reciprocal_rank(ranks),
            "map": calculate_mean_average_precision(relevance_scores),
            "gpt_score": calculate_gpt_score(prompt, outputs),
            "ragas_score": calculate_ragas_score(prompt, outputs),
            "helm_score": calculate_helm_score(prompt, outputs),
            "forgetting_rate": calculate_forgetting_rate(prompt, outputs),
            "brevity_score": calculate_brevity_score(prompt, outputs),
        }
        report["evaluation_results"]["metrics"] = metrics_results

    return report

In [3]:
evaluation_report = evaluate_model(
    model_name="mistralai/Mistral-7B-v0.1",
#     model_name="mistralai/Mistral-7B-v0.2",
#     model_name="mistralai/Mistral-7B-v0.3",
    evaluate_latency=True,
    evaluate_power=True,
    evaluate_precision=True,
    evaluate_memory=True,
    evaluate_metrics=True,
    quantized=False,
)
print("\nEvaluation Report:")
print(evaluation_report)

Using device: mps
Loading model: mistralai/Mistral-7B-v0.1
Assigned </s> as the `pad_token`.




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Evaluating latency and throughput...
Warming up...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Warm-up Time: 12.97s
Measuring latency and throughput...
Latency: 7.74s | Throughput: 0.52 responses/sec | Token Throughput: 25.85 tokens/sec
Power efficiency evaluation is only supported on CUDA devices.
Comparing precision...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Full Precision (fp32) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
Mixed Precision (fp16) Output:
What is the impact of climate change?

Climate change is a global phenomenon that is affecting the entire planet. It is caused by the increase in greenhouse gases in the atmosphere, which trap heat and cause the Earth’s temperature to rise. This has led to a number
Evaluating memory usage...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Evaluating metrics...

Evaluation Report:
{'model_name': 'mistralai/Mistral-7B-v0.1', 'evaluation_results': {'latency_throughput': {'latency': 7.737421035766602, 'throughput': 0.5169681191588007, 'token_throughput': 25.848405957940038}, 'precision_comparison': {'precision_match': True}, 'memory_usage': {'sequence_length_128': {128: 0.0, 256: 0.0, 512: 0.0}, 'sequence_length_256': {128: 0.0, 256: 0.0, 512: 0.0}, 'sequence_length_512': {128: 0.0, 256: 0.0, 512: 0.0}, 'sequence_length_1024': {128: 0.0, 256: 0.0, 512: 0.0, 1024: 0.0}}, 'metrics': {'perplexity': 4.518010018049225, 'f1_score': 0.7741935483870969, 'precision_recall': {'precision': 0.8333333333333334, 'recall': 0.7692307692307693}, 'mrr': 0.4583333333333333, 'map': 0.8055555555555555}}, 'conditions': {'prompt': 'What is the impact of climate change?', 'batch_size': 4, 'max_tokens': 50, 'sequence_lengths': [128, 256, 512, 1024], 'num_gpus': 1, 'gpu_types': ['Apple M1/M2/M3 (Metal Performance Shaders)'], 'device': 'mps', 'model_

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM  # Ensure this import exists

def compare_precision_accuracy(
    model_name: str, prompt: str, max_tokens: int = 50
) -> bool:
    """
    Compare outputs for fp32 and fp16 precision to test output integrity.

    Args:
        model_name (str): The Hugging Face model name or path.
        prompt (str): The input prompt for the model.
        max_tokens (int): The maximum number of tokens to generate.

    Returns:
        bool: Whether the outputs for fp32 and fp16 are identical.
    """
    device = get_device()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Full precision (fp32)
    model_fp32 = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_fp32 = model_fp32.generate(**inputs, max_new_tokens=max_tokens)
    text_fp32 = tokenizer.decode(output_fp32[0], skip_special_tokens=True)

    # Mixed precision (fp16)
    model_fp16 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)
    output_fp16 = model_fp16.generate(**inputs, max_new_tokens=max_tokens)
    text_fp16 = tokenizer.decode(output_fp16[0], skip_special_tokens=True)

    print(f"Full Precision (fp32) Output:\n{text_fp32}")
    print(f"Mixed Precision (fp16) Output:\n{text_fp16}")

    return text_fp32 == text_fp16
Checklist of Fixes
Add the Missing Import: Ensure that the following is present at the top of the script:

python
Copy code
from transformers import AutoTokenizer, AutoModelForCausalLM
Verify get_device Function: Confirm that the get_device() function is defined to return the appropriate device (cuda, mps, or cpu).

Test the Function: Use the following test code to verify that the compare_precision_accuracy function works as intended:

python
Copy code
if __name__ == "__main__":
    model_name = "mistralai/Mistral-7B-v0.1"
    prompt = "What is the impact of climate change?"
    precision_match = compare_precision_accuracy(model_name, prompt)
    print(f"Precision Match: {precision_match}")
Expected Output
For a properly implemented compare_precision_accuracy function, you should see something like:

plaintext
Copy code
Using device: cuda
Full Precision (fp32) Output:
The impact of climate change includes rising temperatures, sea-level rise...
Mixed Precision (fp16) Output:
The impact of climate change includes rising temperatures, sea-level rise...
Precision Match: True
If the outputs for fp32 and fp16 differ, the function will return False.

Let me know if you encounter any additional issues!













ChatGPT can make mistakes. Check important info.
?

Metrics Utils




Run


133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248


In [None]:
"""
utils.py

This module contains utilities for evaluating the performance of large language models (LLMs) using
metrics such as latency, throughput, power consumption, memory usage, and precision comparison.
It is designed for use with Hugging Face's open-source LLMs, including models like Llama 2 7B.

Requirements:
- transformers
- torch
- pynvml

Install dependencies:
    pip install transformers torch pynvml

"""

import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage, nvmlDeviceGetMemoryInfo
from typing import Tuple, List

# Initialize NVIDIA Management Library
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

# Helper functions
def track_power() -> float:
    """Track GPU power consumption in watts."""
    return nvmlDeviceGetPowerUsage(gpu_handle) / 1000

def track_memory() -> float:
    """Track GPU memory usage in GB."""
    mem_info = nvmlDeviceGetMemoryInfo(gpu_handle)
    return mem_info.used / (1024 ** 3)

def count_model_parameters(model: torch.nn.Module) -> int:
    """Count the total number of parameters in the model."""
    return sum(p.numel() for p in model.parameters())

class LLMPerformanceTester:
    """
    A utility class for evaluating the performance of Hugging Face LLMs.

    Attributes:
        model_name (str): The Hugging Face model name to load.
        tokenizer: The tokenizer associated with the model.
        model: The LLM model loaded from Hugging Face.
        device (str): Device to run the model on ('cuda' or 'cpu').
        parameter_count (int): Number of parameters in the model.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.parameter_count = count_model_parameters(self.model)
        print(f"Model Size: {self.parameter_count:,} parameters")

    def evaluate_latency_throughput(self, prompt: str, max_tokens: int = 50, batch_size: int = 1) -> Tuple[torch.Tensor, float, float, float]:
        """
        Evaluate latency, token throughput, and token processing rate.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.
            batch_size (int): Number of prompts in a batch.

        Returns:
            Tuple[torch.Tensor, float, float, float]: Generated outputs, latency, throughput, and token throughput.
        """
        inputs = [prompt] * batch_size
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to(self.device)

        # Warm-up
        print("Warming up...")
        start_warmup = time.time()
        self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)
        end_warmup = time.time()
        warmup_time = end_warmup - start_warmup
        print(f"Warm-up Time: {warmup_time:.2f}s")

        # Measure latency and throughput
        print("Measuring latency and throughput...")
        start_time = time.time()
        outputs = self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)
        end_time = time.time()

        latency = end_time - start_time
        throughput = batch_size / latency
        total_tokens = max_tokens * batch_size
        token_throughput = total_tokens / latency
        print(f"Latency: {latency:.2f}s | Throughput: {throughput:.2f} responses/sec | Token Throughput: {token_throughput:.2f} tokens/sec")
        return outputs, latency, throughput, token_throughput

    def evaluate_power_efficiency(self, prompt: str, max_tokens: int = 50, batch_size: int = 1) -> Tuple[float, float]:
        """
        Evaluate power consumption and efficiency per token.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.
            batch_size (int): Number of prompts in a batch.

        Returns:
            Tuple[float, float]: Total power consumed and energy per token.
        """
        inputs = [prompt] * batch_size
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to(self.device)

        # Warm-up
        print("Warming up...")
        self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)

        # Measure power and efficiency
        print("Measuring power efficiency...")
        power_start = track_power()
        start_time = time.time()

        self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)

        end_time = time.time()
        power_end = track_power()

        latency = end_time - start_time
        throughput = batch_size / latency
        power_consumed = (power_end - power_start) * latency
        total_tokens = max_tokens * batch_size
        energy_per_token = power_consumed / total_tokens if total_tokens > 0 else float('inf')

        print(f"Power Consumption: {power_consumed:.2f} W | Energy per Token: {energy_per_token:.4f} W/token")
        return power_consumed, energy_per_token

    def compare_precision_accuracy(self, prompt: str, max_tokens: int = 50) -> bool:
        """
        Compare outputs for fp32 and fp16 precision to test integrity.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.

        Returns:
            bool: Whether the outputs match between fp32 and fp16 precisions.
        """
        # Full precision (fp32)
        model_fp32 = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        output_fp32 = model_fp32.generate(**inputs, max_new_tokens=max_tokens)
        text_fp32 = self.tokenizer.decode(output_fp32[0], skip_special_tokens=True)

        # Mixed precision (fp16)
        model_fp16 = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16).to(self.device)
        output_fp16 = model_fp16.generate(**inputs, max_new_tokens=max_tokens)
        text_fp16 = self.tokenizer.decode(output_fp16[0], skip_special_tokens=True)

        print(f"Full Precision (fp32) Output:\n{text_fp32}")
        print(f"Mixed Precision (fp16) Output:\n{text_fp16}")

        similarity = text_fp32 == text_fp16
        print(f"Outputs Match: {similarity}")
        return similarity

    def memory_by_sequence_length(self, base_prompt: str, max_tokens: int = 50, max_length: int = 1024) -> None:
        """
        Evaluate memory usage as sequence length increases.

        Args:
            base_prompt (str): Base string to repeat for increasing sequence length.
            max_tokens (int): Maximum number of tokens to generate.
            max_length (int): Maximum sequence length to test.
        """
        print("Memory Usage by Sequence Length:")
        for seq_length in [128, 256, 512, max_length]:
            prompt = base_prompt * (seq_length // len(base_prompt))
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True).to(self.device)

            torch.cuda.reset_peak_memory_stats()
            self.model.generate(**inputs, max_new_tokens=max_tokens)
            peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)  # Convert to MB

            print(f"Sequence Length: {seq_length} | Peak Memory Usage: {peak_memory:.2f} MB")

"""
Usage example:

if __name__ == "__main__":
    model_name = "meta-llama/Llama-2-7b-hf"  # Example model
    prompt = "Explain the impact of climate change on global agriculture."
    base_prompt = "Climate change affects agriculture in multiple ways. "

    tester = LLMPerformanceTester(model_name)

    # Latency and token throughput
    tester.evaluate_latency_throughput(prompt, max_tokens=50, batch_size=4)

    # Power efficiency and energy per token
    tester.evaluate_power_efficiency(prompt, max_tokens=50, batch_size=4)

    # Compare fp32 and fp16 outputs
    tester.compare_precision_accuracy(prompt, max_tokens=50)

    # Memory by sequence length
    tester.memory_by_sequence_length(base_prompt, max_tokens=50, max_length=1024)
"""


In [None]:
"""
metrics.py

This module contains implementations and utilities to evaluate language models (LLMs) based on various
metrics, including BLEU, ROUGE, METEOR, BERTScore, and others.

The metrics are demonstrated with the Hugging Face model "Mistral" as an example.

Requirements:
- transformers
- torch
- datasets
- bert_score
- nltk

Install dependencies:
    pip install transformers torch datasets bert-score nltk

"""

from typing import List, Dict, Tuple
from datasets import load_metric
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from bert_score import score
import numpy as np
import math

# Example Hugging Face model for evaluation
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

def calculate_bleu(predictions: List[str], references: List[List[str]]) -> Dict:
    """
    Calculate BLEU score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[List[str]]): A list of reference texts (ground truth).

    Returns:
        Dict: BLEU score and additional metrics.

    Resource:
        https://github.com/huggingface/evaluate
    """
    bleu_metric = load_metric("bleu")
    bleu_metric.add_batch(predictions=predictions, references=references)
    result = bleu_metric.compute()
    return result

def calculate_rouge(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate ROUGE score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ROUGE scores.

    Resource:
        https://github.com/huggingface/evaluate
    """
    rouge_metric = load_metric("rouge")
    rouge_metric.add_batch(predictions=predictions, references=references)
    result = rouge_metric.compute()
    return result

def calculate_meteor(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate METEOR score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: METEOR score.

    Resource:
        https://github.com/huggingface/evaluate
    """
    meteor_metric = load_metric("meteor")
    meteor_metric.add_batch(predictions=predictions, references=references)
    result = meteor_metric.compute()
    return result

def calculate_bert_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate BERTScore for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Precision, Recall, and F1 scores.

    Resource:
        https://github.com/Tiiiger/bert_score
    """
    P, R, F1 = score(predictions, references, lang="en", verbose=True)
    return {"precision": P.mean().item(), "recall": R.mean().item(), "f1": F1.mean().item()}

def calculate_ragas_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate RAGAS (Retrieval-Augmented Generation Answer Score).

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: RAGAS score.

    Resource:
        https://github.com/explodinggradients/ragas
    """
    from ragas import evaluate
    ragas_result = evaluate(predictions, references)
    return {"ragas_score": ragas_result["score"]}

def calculate_helm_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate HELM (Holistic Evaluation of Language Models) metrics.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: HELM score.

    Resource:
        https://crfm.stanford.edu/helm/latest/
    """
    # Placeholder for actual HELM evaluation framework integration
    helm_score = np.mean([len(pred) / max(len(ref), 1) for pred, ref in zip(predictions, references)])
    return {"helm_score": helm_score}

def calculate_gpt_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate GPT-Score for text similarity.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: GPT-Score values.

    Resource:
        https://github.com/IntelLabs/gpt-score
    """
    from gpt_score import GPTScorer
    scorer = GPTScorer()
    scores = scorer.score(predictions, references)
    return {"gpt_score": np.mean(scores)}

def calculate_forgetting_rate(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompts: List[str]) -> float:
    """
    Calculate Forgetting Rate of the model over repeated evaluations.

    Args:
        model (AutoModelForCausalLM): The language model to test.
        tokenizer (AutoTokenizer): Tokenizer associated with the model.
        prompts (List[str]): List of input prompts.

    Returns:
        float: Forgetting rate as a percentage.

    Resource:
        https://arxiv.org/abs/2205.12647
    """
    baseline_results = []
    repeated_results = []

    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        baseline_output = model.generate(**inputs)
        repeated_output = model.generate(**inputs)

        baseline_results.append(tokenizer.decode(baseline_output[0], skip_special_tokens=True))
        repeated_results.append(tokenizer.decode(repeated_output[0], skip_special_tokens=True))

    differences = [1 if b != r else 0 for b, r in zip(baseline_results, repeated_results)]
    forgetting_rate = sum(differences) / len(prompts) * 100
    return forgetting_rate

def calculate_brevity_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate Brevity Score to evaluate concise text generation.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Brevity score.

    Resource:
        https://arxiv.org/pdf/1904.09675.pdf
    """
    brevity_ratios = [len(pred.split()) / max(len(ref.split()), 1) for pred, ref in zip(predictions, references)]
    brevity_score = np.mean([min(1.0, ratio) for ratio in brevity_ratios])
    return {"brevity_score": brevity_score}

"""
Usage Example with Hugging Face LLM Mistral:

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")

    # Sample inputs
    predictions = ["Climate change is a global challenge that requires..."]
    references = [["Climate change is a pressing issue affecting..."]]

    # BLEU
    print("BLEU Score:", calculate_bleu(predictions, references))

    # ROUGE
    print("ROUGE Score:", calculate_rouge(predictions, [r[0] for r in references]))

    # METEOR
    print("METEOR Score:", calculate_meteor(predictions, [r[0] for r in references]))

    # BERTScore
    print("BERTScore:", calculate_bert_score(predictions, [r[0] for r in references]))

    # RAGAS
    print("RAGAS Score:", calculate_ragas_score(predictions, [r[0] for r in references]))

    # HELM
    print("HELM Score:", calculate_helm_score(predictions, [r[0] for r in references]))

    # GPT-Score
    print("GPT-Score:", calculate_gpt_score(predictions, [r[0] for r in references]))

    # Forgetting Rate
    prompts = ["What is climate change?", "Explain photosynthesis."]
    print("Forgetting Rate:", calculate_forgetting_rate(model, tokenizer, prompts))

    # Brevity Score
    print("Brevity Score:", calculate_brevity_score(predictions, [r[0] for r in references]))
"""


In [None]:
import numpy as np
from typing import List, Dict, Any


def calculate_perplexity(probabilities: List[float]) -> float:
    """
    Calculate the perplexity of a model's output.

    Args:
        probabilities (List[float]): A list of probabilities for each token in the sequence.

    Returns:
        float: The perplexity score.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/perplexity
    """
    cross_entropy = -np.mean(np.log(probabilities))
    perplexity = np.exp(cross_entropy)
    return perplexity


def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Calculate the F1 score given precision and recall.

    Args:
        precision (float): Precision of the predictions.
        recall (float): Recall of the predictions.

    Returns:
        float: The F1 score.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/f1
    """
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def calculate_precision_recall(true_positive: int, false_positive: int, false_negative: int) -> Dict[str, float]:
    """
    Calculate precision and recall.

    Args:
        true_positive (int): Number of true positive cases.
        false_positive (int): Number of false positive cases.
        false_negative (int): Number of false negative cases.

    Returns:
        Dict[str, float]: A dictionary containing precision and recall scores.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/precision
    """
    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0.0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0.0
    return {"precision": precision, "recall": recall}


def calculate_mean_reciprocal_rank(ranks: List[int]) -> float:
    """
    Calculate the Mean Reciprocal Rank (MRR).

    Args:
        ranks (List[int]): A list of ranks for the first relevant result in each query.

    Returns:
        float: The MRR score.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/mrr
    """
    reciprocal_ranks = [1 / rank if rank > 0 else 0 for rank in ranks]
    return np.mean(reciprocal_ranks)


def calculate_mean_average_precision(relevance_scores: List[List[int]]) -> float:
    """
    Calculate Mean Average Precision (MAP).

    Args:
        relevance_scores (List[List[int]]): A list of binary relevance scores for each query's retrieved documents.

    Returns:
        float: The MAP score.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/map
    """
    average_precisions = []
    for scores in relevance_scores:
        precision_at_k = [
            sum(scores[:k + 1]) / (k + 1) for k in range(len(scores)) if scores[k] == 1
        ]
        if precision_at_k:
            average_precisions.append(np.mean(precision_at_k))
    return np.mean(average_precisions) if average_precisions else 0.0


## Appendix:

In [None]:
"""
metrics.py

This module contains implementations and utilities to evaluate language models (LLMs) based on various
metrics, including BLEU, ROUGE, METEOR, BERTScore, and others.

The metrics are demonstrated with the Hugging Face model "Mistral" as an example.

Requirements:
- transformers
- torch
- datasets
- bert_score
- nltk

Install dependencies:
    pip install transformers torch datasets bert-score nltk

"""

from typing import List, Dict, Tuple
from datasets import load_metric
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from bert_score import score

# Example Hugging Face model for evaluation
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

def calculate_bleu(predictions: List[str], references: List[List[str]]) -> Dict:
    """
    Calculate BLEU score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[List[str]]): A list of reference texts (ground truth).

    Returns:
        Dict: BLEU score and additional metrics.
    """
    bleu_metric = load_metric("bleu")
    bleu_metric.add_batch(predictions=predictions, references=references)
    result = bleu_metric.compute()
    return result

def calculate_rouge(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate ROUGE score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ROUGE scores.
    """
    rouge_metric = load_metric("rouge")
    rouge_metric.add_batch(predictions=predictions, references=references)
    result = rouge_metric.compute()
    return result

def calculate_meteor(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate METEOR score for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: METEOR score.
    """
    meteor_metric = load_metric("meteor")
    meteor_metric.add_batch(predictions=predictions, references=references)
    result = meteor_metric.compute()
    return result

def calculate_bert_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate BERTScore for the model's predictions.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Precision, Recall, and F1 scores.
    """
    P, R, F1 = score(predictions, references, lang="en", verbose=True)
    return {"precision": P.mean().item(), "recall": R.mean().item(), "f1": F1.mean().item()}

def calculate_ragas_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder function to calculate RAGAS (Retrieval-Augmented Generation Answer Score).

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Placeholder RAGAS scores.
    """
    # Implementation will depend on specific retrieval-augmented scoring methods
    return {"ragas_score": 0.85}  # Placeholder value

def calculate_helm_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder function to calculate HELM (Holistic Evaluation of Language Models) metrics.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Placeholder HELM scores.
    """
    # Implementation requires HELM evaluation framework
    return {"helm_score": 0.9}  # Placeholder value

def calculate_gpt_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder function to calculate GPT-Score.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Placeholder GPT-Score.
    """
    return {"gpt_score": 0.88}  # Placeholder value

def scenario_fidelity_tests(predictions: List[str], scenarios: List[str]) -> Dict:
    """
    Placeholder function to evaluate Scenario Fidelity Tests.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        scenarios (List[str]): A list of test scenarios.

    Returns:
        Dict: Placeholder scenario fidelity results.
    """
    return {"scenario_fidelity": 0.92}  # Placeholder value

def calculate_forgetting_rate(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompts: List[str]) -> float:
    """
    Placeholder function to calculate Forgetting Rate of the model over time.

    Args:
        model (AutoModelForCausalLM): The language model to test.
        tokenizer (AutoTokenizer): Tokenizer associated with the model.
        prompts (List[str]): List of input prompts.

    Returns:
        float: Placeholder forgetting rate.
    """
    return 0.05  # Placeholder value

def rimu_evaluation(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder function for RIMU (Relevance, Integration, Memory, Usefulness) evaluation.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: Placeholder RIMU evaluation results.
    """
    return {"rimu_score": 0.87}  # Placeholder value

def problem_solving_effectiveness(predictions: List[str], problems: List[str]) -> float:
    """
    Placeholder function to calculate problem-solving effectiveness of the model.

    Args:
        predictions (List[str]): A list of predicted solutions from the model.
        problems (List[str]): A list of problems to solve.

    Returns:
        float: Placeholder effectiveness score.
    """
    return 0.91  # Placeholder value

"""
Usage Example with Hugging Face LLM Mistral:

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")

    # Sample inputs
    predictions = ["Climate change is a global challenge that requires..."]
    references = [["Climate change is a pressing issue affecting..."]]

    # BLEU
    print("BLEU Score:", calculate_bleu(predictions, references))

    # ROUGE
    print("ROUGE Score:", calculate_rouge(predictions, [r[0] for r in references]))

    # METEOR
    print("METEOR Score:", calculate_meteor(predictions, [r[0] for r in references]))

    # BERTScore
    print("BERTScore:", calculate_bert_score(predictions, [r[0] for r in references]))

    # RAGAS
    print("RAGAS Score:", calculate_ragas_score(predictions, [r[0] for r in references]))

    # HELM
    print("HELM Score:", calculate_helm_score(predictions, [r[0] for r in references]))

    # GPT-Score
    print("GPT-Score:", calculate_gpt_score(predictions, [r[0] for r in references]))

    # Scenario Fidelity
    print("Scenario Fidelity:", scenario_fidelity_tests(predictions, ["scenario example"]))
"""
