## DeepEval

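Custom evaluator class that wraps a locally loaded Llama model (currently the Llama-2 7B checkpoint, despite the class name) so DeepEval can use it as a judge. It follows the DeepEval guide on custom LLMs: https://docs.confident-ai.com/docs/guides-using-custom-llms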

In [1]:
import transformers
import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from deepeval.models import DeepEvalBaseLLM

class CustomLlama3_8B(DeepEvalBaseLLM):
    """DeepEval evaluator backed by a locally loaded Hugging Face causal LM.

    NOTE: despite the class name, the checkpoint loaded is the one named by
    ``MODEL_ID`` (Llama-2 7B); ``get_model_name`` reports it accordingly.
    """

    # Single source of truth for the checkpoint, used for both the weights and
    # the tokenizer so they cannot drift apart. To try Llama 3.1, change this
    # to "meta-llama/Meta-Llama-3.1-8B-Instruct" and update get_model_name().
    MODEL_ID = "meta-llama/Llama-2-7b-hf"

    def __init__(self):
        """Load the model weights and the tokenizer once, at construction."""
        model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_ID
        )

        self.model = model
        self.tokenizer = tokenizer
        # Built lazily on the first generate() call and then reused.
        self._pipeline = None

    def load_model(self):
        """Return the already-loaded model instance."""
        return self.model

    def _get_pipeline(self):
        """Return the text-generation pipeline, creating it on first use."""
        # getattr keeps this working for instances created without __init__.
        pipe = getattr(self, "_pipeline", None)
        if pipe is None:
            # No device_map here: the model object was already placed on its
            # devices when it was loaded in __init__.
            pipe = transformers.pipeline(
                "text-generation",
                model=self.load_model(),
                tokenizer=self.tokenizer,
                use_cache=True,
                max_length=2500,
                do_sample=True,
                top_k=5,
                num_return_sequences=1,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
                # Return only the completion, not the prompt followed by it.
                return_full_text=False,
            )
            self._pipeline = pipe
        return pipe

    def generate(self, prompt: str) -> str:
        """Generate a completion for ``prompt`` and return it as plain text.

        Args:
            prompt: The text to send to the model.

        Returns:
            The generated text of the single returned sequence.
        """
        outputs = self._get_pipeline()(prompt)
        # For one string prompt the pipeline yields a list with one dict per
        # returned sequence; num_return_sequences=1, so take the first.
        return outputs[0]["generated_text"]

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the human-readable name of the underlying model."""
        return "Llama-2 7B"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Smoke test: build the evaluator (this loads the checkpoint) and show one generation.
custom_llm = CustomLlama3_8B()
joke_output = custom_llm.generate("Write me a joke")
print(joke_output)

Downloading shards: 100%|██████████| 2/2 [13:15<00:00, 397.78s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:20<00:00, 10.20s/it]
Some parameters are on the meta device because they were offloaded to the disk.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: 

# Semantic Similarity Evaluation

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

# Load the Excel file. The sheet must provide the columns
# 'Question', 'Expected Answer' and 'Answer'.
data = pd.read_excel('data/QA_Pairs.xlsx')  # Replace with your Excel file path

# Initialize models
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
rouge = Rouge()

# Normalise the text columns up front: empty cells are read as NaN and purely
# numeric cells as numbers, neither of which supports .split() or encoding.
expected_texts = data['Expected Answer'].fillna('').astype(str).tolist()
given_texts = data['Answer'].fillna('').astype(str).tolist()

# Encode every answer in one batched call per column instead of two
# single-item calls per row.
expected_embeddings = model.encode(expected_texts)
given_embeddings = model.encode(given_texts)

# ROUGE scores used when the library cannot score a pair (see below).
EMPTY_ROUGE = {
    key: {'r': 0.0, 'p': 0.0, 'f': 0.0}
    for key in ('rouge-1', 'rouge-2', 'rouge-l')
}

# Results list
results = []

# Score each question/answer pair
for question, expected, given, expected_embedding, given_embedding in zip(
    data['Question'], expected_texts, given_texts,
    expected_embeddings, given_embeddings,
):
    # Compute semantic similarity (cosine of the two sentence embeddings)
    semantic_similarity = cosine_similarity([expected_embedding], [given_embedding])[0][0]

    # Compute BLEU score on whitespace tokens
    bleu_score = sentence_bleu([expected.split()], given.split())

    # Compute ROUGE score. The library raises ValueError when the hypothesis
    # or the reference is empty; record zeros for that row rather than
    # aborting the whole evaluation.
    try:
        rouge_score = rouge.get_scores(given, expected, avg=True)
    except ValueError:
        rouge_score = EMPTY_ROUGE

    # Append results
    results.append({
        'Question': question,
        'Expected Answer': expected,
        'Given Answer': given,
        'Semantic Similarity': semantic_similarity,
        'BLEU Score': bleu_score,
        'ROUGE-1': rouge_score['rouge-1']['f'],
        'ROUGE-2': rouge_score['rouge-2']['f'],
        'ROUGE-L': rouge_score['rouge-l']['f']
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save results to a new Excel file
results_df.to_excel('evaluation_results.xlsx', index=False)

print("Evaluation completed. Results saved to 'evaluation_results.xlsx'.")

Evaluation completed. Results saved to 'evaluation_results.xlsx'.
