**Installing dependencies**

In [1]:
pip install anthropic pandas transformers torch nltk rouge scikit-learn numpy

Collecting anthropic
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cu

**Setting up the environment & model for evaluation**

In [2]:
import anthropic
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os

# Download necessary NLTK resources
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Set up Claude API key
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Load an open-source LLM model (e.g., distilgpt2)
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
open_source_llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu


**Function to get response from Claude 3.5 Haiku**

In [3]:
def get_claude_response(prompt):
    response = client.messages.create(
        model="claude-3-haiku-20240307",  # Updated model identifier
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )
    # Claude API response structure has changed
    return response.content[0].text if isinstance(response.content, list) else response.content

**Function to get response from the open-source model**

In [4]:
def get_open_source_llm_response(prompt):
    response = open_source_llm_pipeline(prompt, max_length=100, num_return_sequences=1)
    return response[0]['generated_text']

**Function to calculate BLEU score**

In [5]:
def calculate_bleu(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference.lower())
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    return sentence_bleu([reference_tokens], candidate_tokens)

**Function to calculate METEOR score**

In [6]:
def calculate_meteor(reference, candidate):
    reference_tokens = nltk.word_tokenize(reference.lower())
    candidate_tokens = nltk.word_tokenize(candidate.lower())
    return meteor_score([reference_tokens], candidate_tokens)

**Function to calculate ROUGE scores**

In [7]:
def calculate_rouge(reference, candidate):
    rouge = Rouge()
    try:
        scores = rouge.get_scores(candidate, reference)
        return {
            'rouge-1': scores[0]['rouge-1']['f'],
            'rouge-2': scores[0]['rouge-2']['f'],
            'rouge-l': scores[0]['rouge-l']['f']
        }
    except:
        # Sometimes ROUGE fails with very short texts
        return {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}

**Function to calculate cosine similarity using TF-IDF**

In [8]:
def calculate_cosine_similarity(reference, candidate):
    tfidf_vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([reference, candidate])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    except:
        return 0

**Create a DataFrame to store responses and metrics**

In [9]:
df = pd.DataFrame(columns=[
    "Prompt",
    "Claude Response",
    "Open-Source LLM Response",
    "BLEU Score",
    "METEOR Score",
    "ROUGE-1",
    "ROUGE-2",
    "ROUGE-L",
    "Cosine Similarity"
])

**Example prompts**

In [10]:
prompts = [
    "What is artificial intelligence?",
    "Explain the difference between machine learning and deep learning.",
    "How does natural language processing work?",
    "What are the ethical concerns with AI development?",
    "Describe the concept of neural networks.",
    "What is transfer learning in AI?",
    "Explain how recommendation systems work."
]

**Generate responses and calculate metrics**

In [11]:
for prompt in prompts:
    claude_resp = get_claude_response(prompt)
    llm_resp = get_open_source_llm_response(prompt)

    # Calculate evaluation metrics
    bleu = calculate_bleu(claude_resp, llm_resp)
    meteor = calculate_meteor(claude_resp, llm_resp)
    rouge_scores = calculate_rouge(claude_resp, llm_resp)
    cosine_sim = calculate_cosine_similarity(claude_resp, llm_resp)

    # Append results to the DataFrame
    df = pd.concat([df, pd.DataFrame([{
        "Prompt": prompt,
        "Claude Response": claude_resp,
        "Open-Source LLM Response": llm_resp,
        "BLEU Score": bleu,
        "METEOR Score": meteor,
        "ROUGE-1": rouge_scores['rouge-1'],
        "ROUGE-2": rouge_scores['rouge-2'],
        "ROUGE-L": rouge_scores['rouge-l'],
        "Cosine Similarity": cosine_sim
    }])], ignore_index=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  df = pd.concat([df, pd.DataFrame([{
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generat

**Calculate average scores**

In [12]:
avg_metrics = {
    "BLEU Score": df["BLEU Score"].mean(),
    "METEOR Score": df["METEOR Score"].mean(),
    "ROUGE-1": df["ROUGE-1"].mean(),
    "ROUGE-2": df["ROUGE-2"].mean(),
    "ROUGE-L": df["ROUGE-L"].mean(),
    "Cosine Similarity": df["Cosine Similarity"].mean()
}

**Add a summary row**

In [14]:
summary_row = pd.DataFrame([{
    "Prompt": "AVERAGE",
    "Claude Response": "",
    "Open-Source LLM Response": "",
    "BLEU Score": avg_metrics["BLEU Score"],
    "METEOR Score": avg_metrics["METEOR Score"],
    "ROUGE-1": avg_metrics["ROUGE-1"],
    "ROUGE-2": avg_metrics["ROUGE-2"],
    "ROUGE-L": avg_metrics["ROUGE-L"],
    "Cosine Similarity": avg_metrics["Cosine Similarity"]
}])

df = pd.concat([df, summary_row], ignore_index=True)

**Save detailed results to CSV**

In [15]:
df.to_csv("llm_comparison_with_metrics.csv", index=False)

**Create a metrics-only summary DataFrame**

In [16]:
metrics_df = pd.DataFrame([avg_metrics])
metrics_df.to_csv("llm_metrics_summary.csv", index=False)

print("Responses and metrics saved to CSV!")
print("\nMetrics Summary:")
for metric, value in avg_metrics.items():
    print(f"{metric}: {value:.4f}")

Responses and metrics saved to CSV!

Metrics Summary:
BLEU Score: 0.0005
METEOR Score: 0.0677
ROUGE-1: 0.1350
ROUGE-2: 0.0129
ROUGE-L: 0.1252
Cosine Similarity: 0.3114
