<a href="https://colab.research.google.com/github/calmrocks/master-machine-learning-engineer/blob/main/modules/generative-ai/materials/notebooks/llm_benchmark_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLM Foundation Models Comparison

This notebook compares different LLM foundation models across various aspects including:
- Response quality
- Performance metrics
- Resource usage
- Practical considerations

We'll test models from different providers:
- OpenAI (GPT-3.5, GPT-4)
- Anthropic (Claude-2)
- Meta (Llama-2)
- Mistral AI (Mistral-7B)

### Setup Required Libraries
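Install and import the packages needed for model interaction, data processing, and visualization. The OpenAI and Anthropic clients read their API keys from the `OPENAI_API_KEY` and `ANTHROPIC_API_KEY` environment variables, so set both before running the comparison.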


In [3]:
!pip install anthropic


Collecting anthropic
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Downloading anthropic-0.49.0-py3-none-any.whl (243 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/243.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/243.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.49.0


In [4]:
import openai
import anthropic
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import time
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Dict
import json


### Model Configuration
Define the models to compare and their configurations

In [5]:
class ModelConfig:
    """Pairs a model name with an optional API key.

    Attributes:
        name: Identifier of the model, stored exactly as passed in.
        api_key: Credential for the model's provider, or ``None`` when omitted.
    """

    def __init__(self, name, api_key=None):
        # Values are stored verbatim; no validation is performed here.
        self.name, self.api_key = name, api_key

# One entry per benchmarked model: how it is accessed ("api" or "local")
# and which provider it comes from.
models_to_compare = {
    model: {"type": access, "provider": vendor}
    for model, access, vendor in (
        ("gpt-3.5-turbo", "api", "openai"),
        ("gpt-4", "api", "openai"),
        ("claude-2", "api", "anthropic"),
        ("llama-2-7b", "local", "meta"),
        ("mistral-7b", "local", "mistral"),
    )
}

### Test Cases Definition
Define various test cases across different categories to evaluate model performance


In [6]:
# Each test case groups the prompts that belong to one evaluation category.
test_cases = [
    {"category": category, "prompts": list(prompts)}
    for category, prompts in (
        (
            "General Knowledge",
            (
                "What is the capital of France?",
                "Explain quantum computing in simple terms",
            ),
        ),
        (
            "Code Generation",
            (
                "Write a Python function to check if a string is palindrome",
                "Create a simple REST API using Flask",
            ),
        ),
        (
            "Creative Writing",
            (
                "Write a short story about a time traveler",
                "Create a poem about artificial intelligence",
            ),
        ),
    )
]

### Evaluation Metrics
Define functions to evaluate model responses based on different criteria


In [7]:
def evaluate_response(response: str, criteria: Dict) -> Dict:
    """Compute the metrics recorded for a single model response.

    Args:
        response: Text returned by the model.
        criteria: Measurements gathered by the caller; only the
            ``"response_time"`` key is read, defaulting to 0 when absent.

    Returns:
        A dict with ``"length"`` (``len(response)``) and ``"response_time"``.
    """
    char_count = len(response)
    elapsed = criteria.get("response_time", 0)
    # Extend the returned dict here when further metrics are needed.
    return {"length": char_count, "response_time": elapsed}

### Model Wrapper Class
Create a wrapper class to handle different types of models consistently


In [8]:
class LLMWrapper:
    """Uniform ``generate(prompt)`` interface over API-hosted and local models.

    ``model_config`` must contain ``"type"`` (``"api"``; any other value is
    treated as a local Hugging Face model) and, for API models, ``"provider"``
    (``"openai"`` or ``"anthropic"``). Two optional keys are honoured:

    * ``"model_id"``   - Hugging Face repo ID or local path to load for a
      local model, overriding the built-in defaults below.
    * ``"max_tokens"`` - cap on generated tokens for Anthropic and local models.
    """

    # The short benchmark names are not loadable Hugging Face identifiers, so
    # map them to the public base-model repositories.
    # NOTE(review): the Llama-2 repo is gated and needs an authenticated
    # Hugging Face account with the licence accepted - confirm before running.
    DEFAULT_HF_REPOS = {
        "llama-2-7b": "meta-llama/Llama-2-7b-hf",
        "mistral-7b": "mistralai/Mistral-7B-v0.1",
    }

    # Anthropic's Messages API requires an explicit limit, and the local
    # generation default is far too short for these prompts.
    DEFAULT_MAX_TOKENS = 512

    def __init__(self, model_name: str, model_config: Dict):
        """Store the configuration and immediately create the client / load the model."""
        self.model_name = model_name
        self.model_config = model_config
        self.setup_model()

    def setup_model(self):
        """Create the API client, or load the tokenizer and weights for a local model."""
        if self.model_config["type"] == "api":
            provider = self.model_config["provider"]
            if provider == "openai":
                self.client = openai.OpenAI()
            elif provider == "anthropic":
                self.client = anthropic.Anthropic()
            else:
                # Keep construction best-effort; generate() reports the
                # unsupported provider as an error result instead.
                self.client = None
        else:
            # Load local models
            repo_id = self.model_config.get("model_id") or self.DEFAULT_HF_REPOS.get(
                self.model_name, self.model_name
            )
            self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
            self.model = AutoModelForCausalLM.from_pretrained(repo_id)

    def generate(self, prompt: str) -> Dict:
        """Run ``prompt`` through the model and time the call.

        Returns:
            A dict with ``"response"`` (the generated text, or the exception
            message on failure), ``"response_time"`` in seconds, and
            ``"status"`` (``"success"`` or ``"error"``). Exceptions are never
            propagated so one failing model does not abort a comparison run.
        """
        # perf_counter is monotonic, unlike time.time, so the measured
        # duration cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        max_tokens = self.model_config.get("max_tokens", self.DEFAULT_MAX_TOKENS)

        try:
            if self.model_config["type"] == "api":
                provider = self.model_config["provider"]
                if provider == "openai":
                    response = self.client.chat.completions.create(
                        model=self.model_name,
                        messages=[{"role": "user", "content": prompt}]
                    )
                    # content may be None (e.g. refusals); normalise to str so
                    # downstream length metrics do not crash.
                    result = response.choices[0].message.content or ""

                elif provider == "anthropic":
                    response = self.client.messages.create(
                        model=self.model_name,
                        max_tokens=max_tokens,
                        messages=[{"role": "user", "content": prompt}]
                    )
                    # response.content is a list of content blocks, not a
                    # string; concatenate the text blocks.
                    result = "".join(
                        block.text
                        for block in response.content
                        if getattr(block, "type", None) == "text"
                    )
                else:
                    raise ValueError(f"Unsupported API provider: {provider!r}")
            else:
                # Local model inference
                inputs = self.tokenizer(prompt, return_tensors="pt")
                outputs = self.model.generate(**inputs, max_new_tokens=max_tokens)
                # Causal LMs echo the prompt at the start of the output; slice
                # it off so only the completion is returned and measured.
                prompt_length = inputs["input_ids"].shape[1]
                result = self.tokenizer.decode(
                    outputs[0][prompt_length:], skip_special_tokens=True
                )

            return {
                "response": result,
                "response_time": time.perf_counter() - start_time,
                "status": "success"
            }

        except Exception as e:
            return {
                "response": str(e),
                "response_time": time.perf_counter() - start_time,
                "status": "error"
            }

### Model Comparison Class
Define class to run comparisons across all models and generate reports


In [11]:
class ModelComparison:
    """Runs every configured model over a set of test cases and reports the results."""

    def __init__(self, models_config: Dict):
        """Build one ``LLMWrapper`` per entry of ``models_config``.

        Args:
            models_config: Mapping of model name to that model's config dict.
        """
        self.models = {
            name: LLMWrapper(name, config)
            for name, config in models_config.items()
        }
        self.results = []

    def run_comparison(self, test_cases: List[Dict]):
        """Send every prompt of every test case to every model.

        One record per (prompt, model) pair is appended to ``self.results``.

        Args:
            test_cases: Dicts with a ``"category"`` label and a ``"prompts"`` list.
        """
        for case in test_cases:
            for prompt in case["prompts"]:
                for model_name, model in self.models.items():
                    result = model.generate(prompt)
                    evaluation = evaluate_response(
                        result["response"],
                        {"response_time": result["response_time"]}
                    )

                    self.results.append({
                        "model": model_name,
                        "category": case["category"],
                        "prompt": prompt,
                        "response": result["response"],
                        # Recorded so failed calls (whose "response" is an
                        # error message) can be filtered out of the metrics.
                        "status": result.get("status"),
                        "metrics": evaluation
                    })

    def generate_report(self):
        """Return the collected results as a flat DataFrame.

        The nested ``metrics`` dict is expanded into ``metrics.<name>``
        columns (e.g. ``metrics.response_time``) so they can be grouped and
        plotted directly. An empty DataFrame is returned when nothing has
        been collected yet.
        """
        return pd.json_normalize(self.results)

    def plot_comparisons(self):
        """Draw a box plot of response time per model."""
        df = self.generate_report()
        if df.empty:
            print("No results to plot; call run_comparison() first.")
            return

        # Response time comparison. Passing ax keeps pandas from opening a
        # second figure and leaving the sized one blank.
        fig, ax = plt.subplots(figsize=(12, 6))
        df.boxplot(column="metrics.response_time", by="model", ax=ax)
        ax.set_title("Response Time Comparison")
        ax.set_xlabel("Model")
        ax.set_ylabel("Response time (s)")
        # pandas adds an automatic "Boxplot grouped by model" super-title.
        fig.suptitle("")
        plt.show()

### Execute Comparison
Run the comparison across all models and test cases


In [None]:
# Build a wrapper for every configured model, then send each test prompt to each model.
comparison = ModelComparison(models_config=models_to_compare)
comparison.run_comparison(test_cases=test_cases)


### Generate and Display Results
Create reports and visualizations from the comparison results


In [13]:
# Generate report
# json_normalize expands a nested "metrics" dict into "metrics.<name>" columns
# (columns that are already flat pass through unchanged); the summary below
# needs the "metrics.response_time" column to exist.
results_df = pd.json_normalize(comparison.generate_report().to_dict("records"))

# Display results
print("Summary Statistics:")
print(results_df.groupby('model')['metrics.response_time'].describe())

# Plot comparisons
comparison.plot_comparisons()

# Export results
results_df.to_csv("llm_comparison_results.csv")

NameError: name 'comparison' is not defined