## Sandbox for LLM Metrics Scripts

## report.py with graph outputs

In [None]:
import time
from typing import Dict, Any, List

import matplotlib.pyplot as plt
import torch
from pynvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetName
from transformers import AutoTokenizer, AutoModelForCausalLM

from metrics import (
    calculate_perplexity,
    calculate_f1_score,
    calculate_precision_recall,
    calculate_mean_reciprocal_rank,
    calculate_mean_average_precision,
)
from utils import (
    evaluate_latency_throughput,
    evaluate_power_efficiency,
    compare_precision_accuracy,
    memory_by_sequence_length,
)


def get_hardware_info() -> dict:
    """
    Gather hardware details about the system used for evaluation.

    Initializes NVML, enumerates the visible GPUs and reads each device name.

    Returns:
        dict: A dictionary with the keys ``num_gpus`` (int), ``gpu_types``
        (list of device-name strings) and ``cuda_available`` (bool, as reported
        by ``torch.cuda.is_available()``).
    """
    nvmlInit()
    gpu_count = nvmlDeviceGetCount()

    gpu_info = []
    for index in range(gpu_count):
        name = nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(index))
        # Depending on the installed pynvml release the name arrives as bytes or
        # as str; calling .decode() unconditionally fails on the str variant.
        gpu_info.append(name.decode() if isinstance(name, bytes) else name)

    return {
        "num_gpus": gpu_count,
        "gpu_types": gpu_info,
        "cuda_available": torch.cuda.is_available(),
    }


def generate_graphs(report: Dict[str, Any]) -> None:
    """
    Generate graphs for the evaluation results.

    One PNG is written (and shown) per section present under
    ``report["evaluation_results"]``: ``latency_throughput``,
    ``power_efficiency``, ``memory_usage`` and ``metrics``. Sections that are
    absent are skipped, and a report without ``evaluation_results`` produces no
    output at all.

    Args:
        report (Dict[str, Any]): The evaluation report containing metrics and results.
    """
    results = report.get("evaluation_results") or {}

    def _finish(fig, filename: str) -> None:
        """Save, display and then release the figure so figures do not pile up."""
        plt.savefig(filename)
        plt.show()
        plt.close(fig)

    # Latency and throughput
    if "latency_throughput" in results:
        latency_results = results["latency_throughput"]
        fig = plt.figure(figsize=(8, 6))
        plt.bar(
            ["Latency", "Throughput", "Token Throughput"],
            [
                latency_results["latency"],
                latency_results["throughput"],
                latency_results["token_throughput"],
            ],
        )
        plt.title("Latency and Throughput")
        plt.ylabel("Time/Throughput")
        _finish(fig, "latency_throughput.png")

    # Power efficiency
    if "power_efficiency" in results:
        power_results = results["power_efficiency"]
        fig = plt.figure(figsize=(8, 6))
        plt.bar(
            ["Power Consumed", "Energy per Token"],
            [power_results["power_consumed"], power_results["energy_per_token"]],
        )
        plt.title("Power Efficiency")
        plt.ylabel("Watts/Energy")
        _finish(fig, "power_efficiency.png")

    # Memory usage by sequence length
    if "memory_usage" in results:
        memory_results = results["memory_usage"]
        sequence_lengths = list(memory_results.keys())
        memory_usages = list(memory_results.values())

        fig = plt.figure(figsize=(10, 6))
        plt.plot(sequence_lengths, memory_usages, marker="o", linestyle="-", color="b")
        plt.title("Memory Usage by Sequence Length")
        plt.xlabel("Sequence Length")
        plt.ylabel("Memory Usage (MB)")
        plt.grid()
        _finish(fig, "memory_usage.png")

    # Metrics summary
    if "metrics" in results:
        names = []
        values = []
        for name, value in results["metrics"].items():
            if isinstance(value, dict):
                # Nested results (e.g. a precision/recall pair) get one bar per
                # numeric entry instead of a single misleading zero-height bar.
                for sub_name, sub_value in value.items():
                    if isinstance(sub_value, (int, float)):
                        names.append(f"{name}.{sub_name}")
                        values.append(sub_value)
            else:
                names.append(name)
                values.append(value if isinstance(value, (int, float)) else 0)

        fig = plt.figure(figsize=(10, 6))
        plt.bar(names, values, color="g")
        plt.title("Metrics Summary")
        plt.ylabel("Scores")
        plt.xticks(rotation=45, ha="right")
        _finish(fig, "metrics_summary.png")


def evaluate_model(
    model_name: str,
    evaluate_latency: bool = True,
    evaluate_power: bool = True,
    evaluate_precision: bool = True,
    evaluate_memory: bool = True,
    evaluate_metrics: bool = True,
    quantized: bool = False,
    prompt: str = "What is the impact of climate change?",
    batch_size: int = 4,
    max_tokens: int = 50,
    sequence_lengths: List[int] = [128, 256, 512, 1024],
    relevance_scores: List[List[int]] = [[1, 0, 1, 1, 0]],
    probabilities: List[float] = [0.2, 0.3, 0.1, 0.4],
    ranks: List[int] = [1, 3, 2, 0],
    precision: float = 0.8,
    recall: float = 0.75,
    true_positive: int = 50,
    false_positive: int = 10,
    false_negative: int = 15,
) -> Dict[str, Any]:
    """
    Evaluate an LLM using specified metrics and utilities.

    Loads the model and tokenizer, records the run conditions, runs each enabled
    evaluation stage, renders the graphs via ``generate_graphs`` and returns the
    assembled report.

    Args:
        model_name (str): The Hugging Face model name or path.
        evaluate_latency (bool): Whether to evaluate latency and throughput.
        evaluate_power (bool): Whether to evaluate power efficiency.
        evaluate_precision (bool): Whether to compare precision between fp32 and fp16.
        evaluate_memory (bool): Whether to measure memory usage for varying sequence lengths.
        evaluate_metrics (bool): Whether to compute various metrics like perplexity, F1 score, etc.
        quantized (bool): Whether the model is quantized or not (recorded in the report only).
        prompt (str): The input prompt for the model.
        batch_size (int): Batch size for evaluation.
        max_tokens (int): Maximum tokens to generate.
        sequence_lengths (List[int]): List of sequence lengths for memory evaluation.
        relevance_scores (List[List[int]]): Relevance scores for MAP calculation.
        probabilities (List[float]): Probabilities for perplexity calculation.
        ranks (List[int]): Ranks for MRR calculation.
        precision (float): Precision score for F1 calculation.
        recall (float): Recall score for F1 calculation.
        true_positive (int): True positives for precision/recall calculation.
        false_positive (int): False positives for precision/recall calculation.
        false_negative (int): False negatives for precision/recall calculation.

    Returns:
        Dict[str, Any]: A detailed report with the keys ``model_name``,
        ``evaluation_results`` and ``conditions``.
    """
    # The list parameters default to shared mutable objects and one of them is
    # stored in the returned report. Work on private copies so that a caller
    # mutating the report cannot change the defaults seen by later calls.
    sequence_lengths = list(sequence_lengths)
    relevance_scores = [list(scores) for scores in relevance_scores]
    probabilities = list(probabilities)
    ranks = list(ranks)

    report = {"model_name": model_name, "evaluation_results": {}, "conditions": {}}

    # Load model and tokenizer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

    # Hardware information
    hardware_info = get_hardware_info()
    model_parameters = sum(p.numel() for p in model.parameters())
    report["conditions"] = {
        "prompt": prompt,
        "batch_size": batch_size,
        "max_tokens": max_tokens,
        "sequence_lengths": sequence_lengths,
        "num_gpus": hardware_info["num_gpus"],
        "gpu_types": hardware_info["gpu_types"],
        "cuda_available": hardware_info["cuda_available"],
        "model_parameters": model_parameters,
        "quantized": quantized,
    }

    evaluation_results = report["evaluation_results"]

    # Perform evaluations
    # NOTE(review): the helpers below are called as module-level functions that
    # take (model, tokenizer, ...); confirm the imported `utils` module really
    # exposes them with these signatures.
    if evaluate_latency:
        outputs, latency, throughput, token_throughput = evaluate_latency_throughput(
            model, tokenizer, prompt, max_tokens, batch_size
        )
        evaluation_results["latency_throughput"] = {
            "latency": latency,
            "throughput": throughput,
            "token_throughput": token_throughput,
        }

    if evaluate_power:
        power_consumed, energy_per_token = evaluate_power_efficiency(
            model, tokenizer, prompt, max_tokens, batch_size
        )
        evaluation_results["power_efficiency"] = {
            "power_consumed": power_consumed,
            "energy_per_token": energy_per_token,
        }

    if evaluate_precision:
        precision_match = compare_precision_accuracy(model_name, prompt, max_tokens)
        evaluation_results["precision_comparison"] = {"precision_match": precision_match}

    if evaluate_memory:
        memory_results = {}
        for length in sequence_lengths:
            memory_usage = memory_by_sequence_length(model, tokenizer, prompt, max_tokens, length)
            memory_results[f"sequence_length_{length}"] = memory_usage
        evaluation_results["memory_usage"] = memory_results

    if evaluate_metrics:
        evaluation_results["metrics"] = {
            "perplexity": calculate_perplexity(probabilities),
            "f1_score": calculate_f1_score(precision, recall),
            "precision_recall": calculate_precision_recall(true_positive, false_positive, false_negative),
            "mrr": calculate_mean_reciprocal_rank(ranks),
            "map": calculate_mean_average_precision(relevance_scores),
        }

    # Generate graphs for the report
    generate_graphs(report)

    return report


# Example usage
if __name__ == "__main__":
    # Run the full evaluation suite against Llama 2 7B with every stage enabled.
    model_name = "meta-llama/Llama-2-7b-hf"
    evaluation_report = evaluate_model(
        model_name,
        quantized=False,
        evaluate_metrics=True,
        evaluate_memory=True,
        evaluate_precision=True,
        evaluate_power=True,
        evaluate_latency=True,
    )
    # A single print with a newline separator emits the heading and the report
    # on consecutive lines.
    print("\nEvaluation Report:", evaluation_report, sep="\n")


In [None]:
"""
utils.py

This module contains utilities for evaluating the performance of large language models (LLMs) using
metrics such as latency, throughput, power consumption, memory usage, and precision comparison.
It is designed for use with Hugging Face's open-source LLMs, including models like Llama 2 7B.

Requirements:
- transformers
- torch
- pynvml

Install dependencies:
    pip install transformers torch pynvml

"""

import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage, nvmlDeviceGetMemoryInfo
from typing import Tuple, List

# Initialize the NVIDIA Management Library when this module is imported and keep
# a handle to GPU index 0; track_power() and track_memory() read from it.
# NOTE(review): because this runs at import time, importing the module
# presumably fails on machines without an NVIDIA driver — confirm.
nvmlInit()
gpu_handle = nvmlDeviceGetHandleByIndex(0)

# Helper functions
def track_power() -> float:
    """Return the current power draw of the tracked GPU, scaled to watts."""
    raw_reading = nvmlDeviceGetPowerUsage(gpu_handle)
    # The raw NVML reading is divided by 1000 to express it in watts.
    return raw_reading / 1000

def track_memory() -> float:
    """Return the memory currently in use on the tracked GPU, in GB."""
    bytes_per_gb = 1024 ** 3
    used_bytes = nvmlDeviceGetMemoryInfo(gpu_handle).used
    return used_bytes / bytes_per_gb

def count_model_parameters(model: torch.nn.Module) -> int:
    """Return the number of scalar elements across all parameters of ``model``."""
    total = 0
    for parameter in model.parameters():
        total += parameter.numel()
    return total

class LLMPerformanceTester:
    """
    A utility class for evaluating the performance of Hugging Face LLMs.

    Attributes:
        model_name (str): The Hugging Face model name to load.
        tokenizer: The tokenizer associated with the model.
        model: The LLM model loaded from Hugging Face.
        device (str): Device to run the model on ('cuda' or 'cpu').
        parameter_count (int): Number of parameters in the model.
    """

    def __init__(self, model_name: str):
        """
        Load the tokenizer and model for ``model_name`` and move the model to the
        selected device.

        Args:
            model_name (str): The Hugging Face model name or path.
        """
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Batched calls below use padding=True, which raises when the tokenizer
        # defines no pad token (as some causal-LM tokenizers do). Reuse EOS then.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Decoder-only models continue from the last position, so pad on the left.
        self.tokenizer.padding_side = "left"
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(self.device)
        self.parameter_count = count_model_parameters(self.model)
        print(f"Model Size: {self.parameter_count:,} parameters")

    def _encode_batch(self, prompt: str, batch_size: int):
        """Tokenize ``batch_size`` copies of ``prompt`` and move them to the device."""
        return self.tokenizer([prompt] * batch_size, return_tensors="pt", padding=True).to(self.device)

    def _synchronize(self) -> None:
        """Wait for queued CUDA work so wall-clock timings cover the whole call."""
        if self.device == "cuda":
            torch.cuda.synchronize()

    def _timed_generate(self, tokenized_inputs, max_tokens: int):
        """Run one generation and return ``(outputs, elapsed_seconds)``."""
        self._synchronize()
        start = time.perf_counter()
        outputs = self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)
        self._synchronize()
        return outputs, time.perf_counter() - start

    @staticmethod
    def _generated_tokens(tokenized_inputs, outputs, batch_size: int) -> int:
        """Count tokens appended beyond the prompt, over the whole batch.

        Assumes the generated sequences include the prompt tokens; rows that
        stopped early are padded, so this is an upper bound per row.
        """
        new_per_sequence = outputs.shape[-1] - tokenized_inputs["input_ids"].shape[-1]
        return max(int(new_per_sequence), 0) * batch_size

    def evaluate_latency_throughput(self, prompt: str, max_tokens: int = 50, batch_size: int = 1) -> Tuple[torch.Tensor, float, float, float]:
        """
        Evaluate latency, token throughput, and token processing rate.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.
            batch_size (int): Number of prompts in a batch.

        Returns:
            Tuple[torch.Tensor, float, float, float]: Generated outputs, latency in
            seconds, throughput in responses/sec, and token throughput in tokens/sec.
        """
        tokenized_inputs = self._encode_batch(prompt, batch_size)

        # Warm-up
        print("Warming up...")
        _, warmup_time = self._timed_generate(tokenized_inputs, max_tokens)
        print(f"Warm-up Time: {warmup_time:.2f}s")

        # Measure latency and throughput
        print("Measuring latency and throughput...")
        outputs, latency = self._timed_generate(tokenized_inputs, max_tokens)

        throughput = batch_size / latency
        # Count what was actually produced: generation can stop before max_tokens.
        total_tokens = self._generated_tokens(tokenized_inputs, outputs, batch_size)
        token_throughput = total_tokens / latency
        print(f"Latency: {latency:.2f}s | Throughput: {throughput:.2f} responses/sec | Token Throughput: {token_throughput:.2f} tokens/sec")
        return outputs, latency, throughput, token_throughput

    def evaluate_power_efficiency(self, prompt: str, max_tokens: int = 50, batch_size: int = 1) -> Tuple[float, float]:
        """
        Evaluate energy consumption and efficiency per token.

        Energy is approximated as the mean of the power readings taken just before
        and just after generation, multiplied by the elapsed time. Two samples are
        a coarse estimate of the draw during the run.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.
            batch_size (int): Number of prompts in a batch.

        Returns:
            Tuple[float, float]: Estimated energy consumed (joules) and energy per
            generated token (joules/token; ``inf`` if no token was generated).
        """
        tokenized_inputs = self._encode_batch(prompt, batch_size)

        # Warm-up
        print("Warming up...")
        self.model.generate(**tokenized_inputs, max_new_tokens=max_tokens)

        # Measure power and efficiency
        print("Measuring power efficiency...")
        power_start = track_power()
        outputs, latency = self._timed_generate(tokenized_inputs, max_tokens)
        power_end = track_power()

        # The difference between two instantaneous readings says nothing about
        # energy (and can be negative); average power times time does.
        average_power = (power_start + power_end) / 2
        power_consumed = average_power * latency
        total_tokens = self._generated_tokens(tokenized_inputs, outputs, batch_size)
        energy_per_token = power_consumed / total_tokens if total_tokens > 0 else float('inf')

        print(f"Energy Consumed: {power_consumed:.2f} J | Energy per Token: {energy_per_token:.4f} J/token")
        return power_consumed, energy_per_token

    def compare_precision_accuracy(self, prompt: str, max_tokens: int = 50) -> bool:
        """
        Compare outputs for fp32 and fp16 precision to test integrity.

        Both generations use greedy decoding so a mismatch reflects numerical
        precision rather than sampling noise.

        Args:
            prompt (str): Input prompt for the model.
            max_tokens (int): Maximum number of tokens to generate.

        Returns:
            bool: Whether the outputs match between fp32 and fp16 precisions.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        generation_kwargs = {"max_new_tokens": max_tokens, "do_sample": False}

        # Full precision (fp32): reuse the resident model when it is already
        # fp32 instead of loading another full copy of the weights.
        if self.model.dtype == torch.float32:
            model_fp32 = self.model
        else:
            model_fp32 = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
        output_fp32 = model_fp32.generate(**inputs, **generation_kwargs)
        text_fp32 = self.tokenizer.decode(output_fp32[0], skip_special_tokens=True)

        # Mixed precision (fp16)
        # NOTE(review): fp16 generation on a CPU device may be unsupported for
        # some operators — confirm before relying on this path without CUDA.
        model_fp16 = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype=torch.float16).to(self.device)
        output_fp16 = model_fp16.generate(**inputs, **generation_kwargs)
        text_fp16 = self.tokenizer.decode(output_fp16[0], skip_special_tokens=True)

        # Drop the temporary weights before returning.
        del model_fp16, model_fp32
        if self.device == "cuda":
            torch.cuda.empty_cache()

        print(f"Full Precision (fp32) Output:\n{text_fp32}")
        print(f"Mixed Precision (fp16) Output:\n{text_fp16}")

        similarity = text_fp32 == text_fp16
        print(f"Outputs Match: {similarity}")
        return similarity

    def memory_by_sequence_length(self, base_prompt: str, max_tokens: int = 50, max_length: int = 1024) -> dict:
        """
        Evaluate memory usage as sequence length increases.

        The lengths 128, 256 and 512 are tested when they do not exceed
        ``max_length``, followed by ``max_length`` itself. For each length the base
        prompt is repeated and then truncated to that many tokens.

        Args:
            base_prompt (str): Base string to repeat for increasing sequence length.
            max_tokens (int): Maximum number of tokens to generate.
            max_length (int): Maximum sequence length to test.

        Returns:
            dict: Maps each tested sequence length to the peak CUDA memory allocated
            during generation, in MB. The value is NaN when not running on CUDA.

        Raises:
            ValueError: If ``base_prompt`` is empty or ``max_length`` is not positive.
        """
        if not base_prompt:
            raise ValueError("base_prompt must be a non-empty string")
        if max_length <= 0:
            raise ValueError("max_length must be a positive integer")

        # Size the repeated prompt in tokens rather than characters.
        base_tokens = len(self.tokenizer(base_prompt, add_special_tokens=False)["input_ids"])
        seq_lengths = sorted({length for length in (128, 256, 512, max_length) if length <= max_length})
        on_cuda = self.device == "cuda"
        peak_by_length = {}

        print("Memory Usage by Sequence Length:")
        for seq_length in seq_lengths:
            # One extra repetition gives headroom; truncation trims to seq_length.
            repeats = seq_length // max(base_tokens, 1) + 1
            prompt = base_prompt * repeats
            inputs = self.tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=seq_length
            ).to(self.device)

            if on_cuda:
                torch.cuda.reset_peak_memory_stats()
            self.model.generate(**inputs, max_new_tokens=max_tokens)
            if on_cuda:
                torch.cuda.synchronize()
                peak_memory = torch.cuda.max_memory_allocated() / (1024 ** 2)  # Convert to MB
            else:
                # The CUDA allocator statistics do not exist on CPU.
                peak_memory = float("nan")

            peak_by_length[seq_length] = peak_memory
            print(f"Sequence Length: {seq_length} | Peak Memory Usage: {peak_memory:.2f} MB")

        return peak_by_length

"""
Usage example:

if __name__ == "__main__":
    model_name = "meta-llama/Llama-2-7b-hf"  # Example model
    prompt = "Explain the impact of climate change on global agriculture."
    base_prompt = "Climate change affects agriculture in multiple ways. "

    tester = LLMPerformanceTester(model_name)

    # Latency and token throughput
    tester.evaluate_latency_throughput(prompt, max_tokens=50, batch_size=4)

    # Power efficiency and energy per token
    tester.evaluate_power_efficiency(prompt, max_tokens=50, batch_size=4)

    # Compare fp32 and fp16 outputs
    tester.compare_precision_accuracy(prompt, max_tokens=50)

    # Memory by sequence length
    tester.memory_by_sequence_length(base_prompt, max_tokens=50, max_length=1024)
"""


In [None]:
"""
metrics.py

This module contains implementations and utilities to evaluate language models (LLMs) based on various
metrics, including BLEU, ROUGE, METEOR, BERTScore, and others.

The metrics are demonstrated with the Hugging Face model "Mistral" as an example.

Requirements:
- transformers
- torch
- datasets
- bert_score
- nltk

Install dependencies:
    pip install transformers torch datasets bert-score nltk

"""

from typing import List, Dict, Tuple
from datasets import load_metric
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from bert_score import score
import numpy as np
import math

# Example Hugging Face model for evaluation; only the commented usage example
# at the end of this module refers to it.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

def calculate_bleu(predictions: List[str], references: List[List[str]]) -> Dict:
    """
    Calculate BLEU score for the model's predictions.

    The ``bleu`` metric loaded here scores token sequences, so plain strings are
    split on whitespace before scoring. Inputs that are already token lists are
    passed through unchanged.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[List[str]]): For each prediction, a list of reference
            texts (ground truth).

    Returns:
        Dict: BLEU score and additional metrics.

    Resource:
        https://github.com/huggingface/evaluate
    """

    def _tokens(text):
        # Whitespace tokenization for raw strings; pre-tokenized input is kept.
        return text.split() if isinstance(text, str) else list(text)

    tokenized_predictions = [_tokens(prediction) for prediction in predictions]
    tokenized_references = [[_tokens(reference) for reference in group] for group in references]

    # NOTE(review): `datasets.load_metric` is deprecated upstream in favour of
    # the `evaluate` package; confirm the installed version still provides it.
    bleu_metric = load_metric("bleu")
    bleu_metric.add_batch(predictions=tokenized_predictions, references=tokenized_references)
    return bleu_metric.compute()

def calculate_rouge(predictions: List[str], references: List[str]) -> Dict:
    """
    Score predictions against references with the ROUGE metric.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: ROUGE scores as returned by the metric's ``compute()``.

    Resource:
        https://github.com/huggingface/evaluate
    """
    scorer = load_metric("rouge")
    # Queue the whole batch, then compute over everything queued.
    scorer.add_batch(references=references, predictions=predictions)
    return scorer.compute()

def calculate_meteor(predictions: List[str], references: List[str]) -> Dict:
    """
    Score predictions against references with the METEOR metric.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: METEOR score as returned by the metric's ``compute()``.

    Resource:
        https://github.com/huggingface/evaluate
    """
    scorer = load_metric("meteor")
    # Queue the whole batch, then compute over everything queued.
    scorer.add_batch(references=references, predictions=predictions)
    return scorer.compute()

def calculate_bert_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Compute BERTScore for the predictions and average it over the batch.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: Mean ``precision``, ``recall`` and ``f1`` as Python floats.

    Resource:
        https://github.com/Tiiiger/bert_score
    """
    per_sample = score(predictions, references, lang="en", verbose=True)
    summary = {}
    # `score` yields (precision, recall, F1) values; reduce each to its mean.
    for label, values in zip(("precision", "recall", "f1"), per_sample):
        summary[label] = values.mean().item()
    return summary

def calculate_ragas_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate RAGAS (Retrieval-Augmented Generation Answer Score).

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: The value stored under ``"score"`` in the RAGAS result, keyed as
        ``ragas_score``.

    Resource:
        https://github.com/explodinggradients/ragas
    """
    # Imported lazily so the module can be loaded without ragas installed.
    from ragas import evaluate as ragas_evaluate

    # NOTE(review): confirm that ragas' evaluate() accepts two plain lists and
    # that its result exposes a "score" entry; this call shape is unverified.
    outcome = ragas_evaluate(predictions, references)
    ragas_value = outcome["score"]
    return {"ragas_score": ragas_value}

def calculate_helm_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate a stand-in for HELM (Holistic Evaluation of Language Models) metrics.

    This is a placeholder: it returns the mean ratio of prediction length to
    reference length (in characters), not a real HELM evaluation.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"helm_score": float}``; ``0.0`` when there are no pairs to score.

    Resource:
        https://crfm.stanford.edu/helm/latest/
    """
    # Placeholder for actual HELM evaluation framework integration.
    # max(..., 1) keeps an empty reference from dividing by zero.
    length_ratios = [len(pred) / max(len(ref), 1) for pred, ref in zip(predictions, references)]
    if not length_ratios:
        # Averaging nothing would produce NaN plus a RuntimeWarning.
        return {"helm_score": 0.0}
    return {"helm_score": float(np.mean(length_ratios))}

def calculate_gpt_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate GPT-Score for text similarity.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: The mean of the scorer's outputs, keyed as ``gpt_score``.

    Resource:
        https://github.com/IntelLabs/gpt-score
    """
    # Imported lazily so the module can be loaded without gpt_score installed.
    from gpt_score import GPTScorer

    per_pair_scores = GPTScorer().score(predictions, references)
    average_score = np.mean(per_pair_scores)
    return {"gpt_score": average_score}

def calculate_forgetting_rate(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompts: List[str]) -> float:
    """
    Calculate Forgetting Rate of the model over repeated evaluations.

    Each prompt is generated twice with the same inputs; the rate is the share of
    prompts whose two decoded outputs differ.

    Args:
        model (AutoModelForCausalLM): The language model to test.
        tokenizer (AutoTokenizer): Tokenizer associated with the model.
        prompts (List[str]): List of input prompts.

    Returns:
        float: Forgetting rate as a percentage; ``0.0`` when ``prompts`` is empty.

    Resource:
        https://arxiv.org/abs/2205.12647
    """
    if not prompts:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0

    mismatches = 0
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        baseline_output = model.generate(**inputs)
        repeated_output = model.generate(**inputs)

        baseline_text = tokenizer.decode(baseline_output[0], skip_special_tokens=True)
        repeated_text = tokenizer.decode(repeated_output[0], skip_special_tokens=True)
        if baseline_text != repeated_text:
            mismatches += 1

    return mismatches / len(prompts) * 100

def calculate_brevity_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Calculate Brevity Score to evaluate concise text generation.

    For each pair the ratio of prediction word count to reference word count is
    capped at 1.0; the score is the mean of the capped ratios.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"brevity_score": float}``; ``0.0`` when there are no pairs to score.

    Resource:
        https://arxiv.org/pdf/1904.09675.pdf
    """
    capped_ratios = [
        # max(..., 1) keeps an empty reference from dividing by zero.
        min(1.0, len(pred.split()) / max(len(ref.split()), 1))
        for pred, ref in zip(predictions, references)
    ]
    if not capped_ratios:
        # Averaging nothing would produce NaN plus a RuntimeWarning.
        return {"brevity_score": 0.0}
    return {"brevity_score": float(np.mean(capped_ratios))}

"""
Usage Example with Hugging Face LLM Mistral:

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")

    # Sample inputs
    predictions = ["Climate change is a global challenge that requires..."]
    references = [["Climate change is a pressing issue affecting..."]]

    # BLEU
    print("BLEU Score:", calculate_bleu(predictions, references))

    # ROUGE
    print("ROUGE Score:", calculate_rouge(predictions, [r[0] for r in references]))

    # METEOR
    print("METEOR Score:", calculate_meteor(predictions, [r[0] for r in references]))

    # BERTScore
    print("BERTScore:", calculate_bert_score(predictions, [r[0] for r in references]))

    # RAGAS
    print("RAGAS Score:", calculate_ragas_score(predictions, [r[0] for r in references]))

    # HELM
    print("HELM Score:", calculate_helm_score(predictions, [r[0] for r in references]))

    # GPT-Score
    print("GPT-Score:", calculate_gpt_score(predictions, [r[0] for r in references]))

    # Forgetting Rate
    prompts = ["What is climate change?", "Explain photosynthesis."]
    print("Forgetting Rate:", calculate_forgetting_rate(model, tokenizer, prompts))

    # Brevity Score
    print("Brevity Score:", calculate_brevity_score(predictions, [r[0] for r in references]))
"""


In [None]:
import numpy as np
from typing import List, Dict, Any


def calculate_perplexity(probabilities: List[float]) -> float:
    """
    Calculate the perplexity of a model's output.

    Perplexity is the exponential of the mean negative log-probability.

    Args:
        probabilities (List[float]): A list of probabilities for each token in the sequence.

    Returns:
        float: The perplexity score; ``inf`` if any probability is exactly zero.

    Raises:
        ValueError: If ``probabilities`` is empty or contains a negative or NaN value.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/perplexity
    """
    probs = np.asarray(probabilities, dtype=float)
    if probs.size == 0:
        raise ValueError("probabilities must contain at least one value")
    if np.any(np.isnan(probs)) or np.any(probs < 0):
        raise ValueError("probabilities must be non-negative numbers")
    if np.any(probs == 0):
        # log(0) is -inf, so the perplexity is infinite; return it directly
        # rather than triggering a divide-by-zero warning.
        return float("inf")

    cross_entropy = -np.mean(np.log(probs))
    return float(np.exp(cross_entropy))


def calculate_f1_score(precision: float, recall: float) -> float:
    """
    Combine precision and recall into an F1 score (their harmonic mean).

    Args:
        precision (float): Precision of the predictions.
        recall (float): Recall of the predictions.

    Returns:
        float: The F1 score, or ``0.0`` when precision and recall sum to zero.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/f1
    """
    denominator = precision + recall
    if denominator != 0:
        return 2 * (precision * recall) / denominator
    # A zero sum would divide by zero; define the score as 0.0 there.
    return 0.0


def calculate_precision_recall(true_positive: int, false_positive: int, false_negative: int) -> Dict[str, float]:
    """
    Derive precision and recall from confusion counts.

    Args:
        true_positive (int): Number of true positive cases.
        false_positive (int): Number of false positive cases.
        false_negative (int): Number of false negative cases.

    Returns:
        Dict[str, float]: ``precision`` and ``recall``; each is ``0.0`` when its
        denominator is not positive.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/precision
    """
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative

    if predicted_positive > 0:
        precision = true_positive / predicted_positive
    else:
        precision = 0.0

    if actual_positive > 0:
        recall = true_positive / actual_positive
    else:
        recall = 0.0

    return {"precision": precision, "recall": recall}


def calculate_mean_reciprocal_rank(ranks: List[int]) -> float:
    """
    Calculate the Mean Reciprocal Rank (MRR).

    Args:
        ranks (List[int]): A list of ranks for the first relevant result in each
            query. A rank that is not positive contributes 0.

    Returns:
        float: The MRR score; ``0.0`` when ``ranks`` is empty.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/mrr
    """
    if len(ranks) == 0:
        # Averaging nothing would produce NaN plus a RuntimeWarning.
        return 0.0
    reciprocal_ranks = [1 / rank if rank > 0 else 0 for rank in ranks]
    return float(np.mean(reciprocal_ranks))


def calculate_mean_average_precision(relevance_scores: List[List[int]]) -> float:
    """
    Calculate Mean Average Precision (MAP).

    For each query, precision is taken at every position whose score equals 1 and
    averaged; MAP is the mean of those per-query averages. Queries without any
    score equal to 1 are left out of the mean.

    Args:
        relevance_scores (List[List[int]]): A list of binary relevance scores for each query's retrieved documents.

    Returns:
        float: The MAP score; ``0.0`` when no query contributes.

    Reference:
        - https://huggingface.co/docs/evaluate/metrics/map
    """
    average_precisions = []
    for scores in relevance_scores:
        cumulative = 0
        precisions = []
        # A running total replaces re-summing the prefix at every position.
        for position, relevance in enumerate(scores, start=1):
            cumulative += relevance
            if relevance == 1:
                precisions.append(cumulative / position)
        if precisions:
            average_precisions.append(np.mean(precisions))
    return float(np.mean(average_precisions)) if average_precisions else 0.0


## Appendix:

In [None]:
"""
metrics.py

This module contains implementations and utilities to evaluate language models (LLMs) based on various
metrics, including BLEU, ROUGE, METEOR, BERTScore, and others.

The metrics are demonstrated with the Hugging Face model "Mistral" as an example.

Requirements:
- transformers
- torch
- datasets
- bert_score
- nltk

Install dependencies:
    pip install transformers torch datasets bert-score nltk

"""

from typing import List, Dict, Tuple
from datasets import load_metric
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from bert_score import score

# Example Hugging Face model for evaluation; only the commented usage example
# at the end of this module refers to it.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

def calculate_bleu(predictions: List[str], references: List[List[str]]) -> Dict:
    """
    Calculate BLEU score for the model's predictions.

    The ``bleu`` metric loaded here scores token sequences, so plain strings are
    split on whitespace before scoring. Inputs that are already token lists are
    passed through unchanged.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[List[str]]): For each prediction, a list of reference
            texts (ground truth).

    Returns:
        Dict: BLEU score and additional metrics.
    """

    def _tokens(text):
        # Whitespace tokenization for raw strings; pre-tokenized input is kept.
        return text.split() if isinstance(text, str) else list(text)

    tokenized_predictions = [_tokens(prediction) for prediction in predictions]
    tokenized_references = [[_tokens(reference) for reference in group] for group in references]

    # NOTE(review): `datasets.load_metric` is deprecated upstream in favour of
    # the `evaluate` package; confirm the installed version still provides it.
    bleu_metric = load_metric("bleu")
    bleu_metric.add_batch(predictions=tokenized_predictions, references=tokenized_references)
    return bleu_metric.compute()

def calculate_rouge(predictions: List[str], references: List[str]) -> Dict:
    """
    Score predictions against references with the ROUGE metric.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: ROUGE scores as returned by the metric's ``compute()``.
    """
    scorer = load_metric("rouge")
    # Queue the whole batch, then compute over everything queued.
    scorer.add_batch(references=references, predictions=predictions)
    return scorer.compute()

def calculate_meteor(predictions: List[str], references: List[str]) -> Dict:
    """
    Score predictions against references with the METEOR metric.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: METEOR score as returned by the metric's ``compute()``.
    """
    scorer = load_metric("meteor")
    # Queue the whole batch, then compute over everything queued.
    scorer.add_batch(references=references, predictions=predictions)
    return scorer.compute()

def calculate_bert_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Compute BERTScore for the predictions and average it over the batch.

    Args:
        predictions (List[str]): Texts produced by the model.
        references (List[str]): Ground-truth texts, aligned with ``predictions``.

    Returns:
        Dict: Mean ``precision``, ``recall`` and ``f1`` as Python floats.
    """
    per_sample = score(predictions, references, lang="en", verbose=True)
    summary = {}
    # `score` yields (precision, recall, F1) values; reduce each to its mean.
    for label, values in zip(("precision", "recall", "f1"), per_sample):
        summary[label] = values.mean().item()
    return summary

def calculate_ragas_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder for RAGAS (Retrieval-Augmented Generation Answer Score).

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"ragas_score": 0.85}`` regardless of input.
    """
    # Implementation will depend on specific retrieval-augmented scoring methods
    placeholder_value = 0.85
    return {"ragas_score": placeholder_value}

def calculate_helm_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder for HELM (Holistic Evaluation of Language Models) metrics.

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"helm_score": 0.9}`` regardless of input.
    """
    # Implementation requires HELM evaluation framework
    placeholder_value = 0.9
    return {"helm_score": placeholder_value}

def calculate_gpt_score(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder for GPT-Score.

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"gpt_score": 0.88}`` regardless of input.
    """
    placeholder_value = 0.88
    return {"gpt_score": placeholder_value}

def scenario_fidelity_tests(predictions: List[str], scenarios: List[str]) -> Dict:
    """
    Placeholder for Scenario Fidelity Tests.

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        scenarios (List[str]): A list of test scenarios.

    Returns:
        Dict: ``{"scenario_fidelity": 0.92}`` regardless of input.
    """
    placeholder_value = 0.92
    return {"scenario_fidelity": placeholder_value}

def calculate_forgetting_rate(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompts: List[str]) -> float:
    """
    Placeholder for the Forgetting Rate of the model over time.

    The inputs are ignored; a fixed value is returned.

    Args:
        model (AutoModelForCausalLM): The language model to test.
        tokenizer (AutoTokenizer): Tokenizer associated with the model.
        prompts (List[str]): List of input prompts.

    Returns:
        float: ``0.05`` regardless of input.
    """
    placeholder_rate = 0.05
    return placeholder_rate

def rimu_evaluation(predictions: List[str], references: List[str]) -> Dict:
    """
    Placeholder for RIMU (Relevance, Integration, Memory, Usefulness) evaluation.

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted texts from the model.
        references (List[str]): A list of reference texts (ground truth).

    Returns:
        Dict: ``{"rimu_score": 0.87}`` regardless of input.
    """
    placeholder_value = 0.87
    return {"rimu_score": placeholder_value}

def problem_solving_effectiveness(predictions: List[str], problems: List[str]) -> float:
    """
    Placeholder for the problem-solving effectiveness of the model.

    The inputs are ignored; a fixed value is returned.

    Args:
        predictions (List[str]): A list of predicted solutions from the model.
        problems (List[str]): A list of problems to solve.

    Returns:
        float: ``0.91`` regardless of input.
    """
    placeholder_score = 0.91
    return placeholder_score

"""
Usage Example with Hugging Face LLM Mistral:

if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")

    # Sample inputs
    predictions = ["Climate change is a global challenge that requires..."]
    references = [["Climate change is a pressing issue affecting..."]]

    # BLEU
    print("BLEU Score:", calculate_bleu(predictions, references))

    # ROUGE
    print("ROUGE Score:", calculate_rouge(predictions, [r[0] for r in references]))

    # METEOR
    print("METEOR Score:", calculate_meteor(predictions, [r[0] for r in references]))

    # BERTScore
    print("BERTScore:", calculate_bert_score(predictions, [r[0] for r in references]))

    # RAGAS
    print("RAGAS Score:", calculate_ragas_score(predictions, [r[0] for r in references]))

    # HELM
    print("HELM Score:", calculate_helm_score(predictions, [r[0] for r in references]))

    # GPT-Score
    print("GPT-Score:", calculate_gpt_score(predictions, [r[0] for r in references]))

    # Scenario Fidelity
    print("Scenario Fidelity:", scenario_fidelity_tests(predictions, ["scenario example"]))
"""
