1. Using LLM for now.
2. Replace with vLLM when you have compute and GPUs
3. Use Ollama in local

llm

In [None]:
# # CPU
# %%timeit -n 3 -r 3
# import llm

# model = llm.get_model("mistral-7b-instruct-v0")
# response = model.prompt("3 names for a pet cow",stream=False)
# print(response.text())

vLLM:- [vLLM](https://github.com/vllm-project/vllm/pull/1901)

Ollama

In [None]:
llm = Ollama(model="mistral:instruct")
response = llm("3 names for a pet cow")
print(response)

In [None]:
llm = Ollama(model="llama2")
response = llm("3 names for a pet cow")
print(response)

In [None]:
llm = Ollama(model="orca-mini")
response = llm("3 names for a pet cow")
print(response)

# Step 2: Conceptualizing

In [1]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import Ollama
import re
import random

class LLM:
    def __init__(self, name, model_name):
        self.name = name
        self.model_name = model_name
        self.skill_level = random.uniform(50, 100)  # Random initial skill level
        self.evaluations = []

    def perform_task(self, task):
        # The evaluatee performs the task
        llm = Ollama(model=self.model_name)
        response = llm(task)
        print(f"{response=}")
        return response

    def evaluate(self, original_task, task_response):
        # The evaluator scores the task response
        scoring_prompt = f"Task: '{original_task}'. Response: '{task_response}'. Please rate the response numerically based on its relevance and quality. Give a score between 0 and 10 exclusive of 0 and 10 itself."
        llm = Ollama(model=self.model_name)
        score_response = llm(scoring_prompt)
        print(f"{score_response}")
        return self.extract_numerical_score(score_response)

    def extract_numerical_score(self, response_text):
        # Extract numbers between 1 and 10 and calculate average
        matches = re.findall(r'\b(?:[1-9](?:\.\d+)?|10)\b', response_text)
        print(f"{matches=}")
        if matches:
            scores = [int(match) for match in matches]
            average_score = sum(scores) / len(scores)
            return average_score
        else:
            return 0  # Default to 0 if no matches found

    def update_skill_level(self):
        # Update skill level based on evaluations
        if self.evaluations:
            new_skill_level = sum(self.evaluations) / len(self.evaluations)
            change_in_skill_level = abs(new_skill_level - self.skill_level)
            self.skill_level = new_skill_level
            self.evaluations = []  # Reset for next round
            return change_in_skill_level
        return 0

def normalize_skill_levels(llms):
    # Normalize skill levels to a range of 0 to 100
    skill_levels = [llm.skill_level for llm in llms]
    min_skill = min(skill_levels)
    max_skill = max(skill_levels)
    for llm in llms:
        llm.skill_level = 100 * (llm.skill_level - min_skill) / (max_skill - min_skill) if max_skill != min_skill else 50

# Model names
model_names = ["mistral:instruct", "orca-mini", "llama2"]

# Create LLM instances
llms = [LLM(f"LLM{i}", model_name) for i, model_name in enumerate(model_names, start=1)]

# Evaluation task
task = "3 names for a pet cow"

# Convergence threshold
threshold = 0.5
converged = False

while not converged:
    max_change = 0
    for evaluator in llms:
        for evaluatee in llms:
            if evaluator != evaluatee:
                print(f"{evaluator.model_name=},{evaluatee.model_name=}")
                task_response = evaluatee.perform_task(task)
                score = evaluator.evaluate(task, task_response)
                weighted_score = score * evaluator.skill_level
                evaluatee.evaluations.append(weighted_score)

    # Normalize skill levels
    normalize_skill_levels(llms)

    # Check for convergence
    changes = [llm.update_skill_level() for llm in llms]
    max_change = max(changes)
    converged = max_change < threshold
    converged = True # for debugging purpose

# Print final skill levels
for llm in llms:
    print(f"{llm.model_name} final skill level: {llm.skill_level}")


evaluator.model_name='mistral:instruct',evaluatee.model_name='orca-mini'
response=' 1. Bessie\n2. Daisy\n3. Rosie'

I would rate this response a 7 out of 10. The names suggested are typical and common for pets, but they are relevant to the task of naming a pet cow. However, the response could have been more creative or unique, which would have made it more memorable and interesting.
matches=['7', '10']
evaluator.model_name='mistral:instruct',evaluatee.model_name='llama2'
response='\nSure, here are three names for a pet cow:\n\n1. Bessie - a classic and timeless name for a pet cow.\n2. Daisy - a sweet and playful name that evokes images of green pastures and sunny days.\n3. Buttercup - a charming and affectionate name that is perfect for a friendly and loving pet cow.'

I would give the response a score of 7 out of 10. While the names suggested are certainly appropriate for pet cows, the response is brief and doesn't provide much additional context or information. Additionally, there co

# Production Quality Refactor

In [None]:
import re
import random
from langchain.llms import Ollama
import logging

# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class LLM:
    def __init__(self, name, model_name):
        self.name = name
        self.model_name = model_name
        self.skill_level = random.uniform(50, 100)
        self.evaluations = []

    def perform_task(self, task):
        try:
            llm = Ollama(model=self.model_name)
            response = llm(task)
            logging.info(f"Response from {self.name}: {response}")
            return response
        except Exception as e:
            logging.error(f"Error in perform_task for {self.name}: {e}")
            return None

    def evaluate(self, original_task, task_response):
        try:
            scoring_prompt = f"Task: '{original_task}'. Response: '{task_response}'. Rate the response numerically between 1 and 9."
            llm = Ollama(model=self.model_name)
            score_response = llm(scoring_prompt)
            logging.info(f"Evaluation by {self.name}: {score_response}")
            return self.extract_numerical_score(score_response)
        except Exception as e:
            logging.error(f"Error in evaluate for {self.name}: {e}")
            return 0

    def extract_numerical_score(self, response_text):
        try:
            matches = re.findall(r'\b(?:[1-9](?:\.\d+)?|10)\b', response_text)
            if matches:
                scores = [float(match) for match in matches]
                average_score = sum(scores) / len(scores)
                return average_score
            else:
                return 0
        except Exception as e:
            logging.error(f"Error in extract_numerical_score: {e}")
            return 0

    def update_skill_level(self):
        try:
            if self.evaluations:
                new_skill_level = sum(self.evaluations) / len(self.evaluations)
                change_in_skill_level = abs(new_skill_level - self.skill_level)
                self.skill_level = new_skill_level
                self.evaluations = []
                return change_in_skill_level
            return 0
        except Exception as e:
            logging.error(f"Error in update_skill_level for {self.name}: {e}")
            return 0

def normalize_skill_levels(llms):
    try:
        skill_levels = [llm.skill_level for llm in llms]
        min_skill = min(skill_levels)
        max_skill = max(skill_levels)
        for llm in llms:
            llm.skill_level = 100 * (llm.skill_level - min_skill) / (max_skill - min_skill) if max_skill != min_skill else 50
    except Exception as e:
        logging.error(f"Error in normalize_skill_levels: {e}")

def rank_llms(llms):
    sorted_llms = sorted(llms, key=lambda x: x.skill_level, reverse=True)
    logging.info("Final Ranking of LLMs:")
    for rank, llm in enumerate(sorted_llms, start=1):
        logging.info(f"Rank {rank}: {llm.name} with skill level {llm.skill_level}")

# Model names
model_names = ["mistral:instruct", "orca-mini", "llama2"]

# Create LLM instances
llms = [LLM(f"LLM{i}", model_name) for i, model_name in enumerate(model_names, start=1)]

# Evaluation task
task = "3 names for a pet cow"

# Convergence threshold
threshold = 0.5
converged = False

while not converged:
    max_change = 0
    for evaluator in llms:
        for evaluatee in llms:
            if evaluator != evaluatee:
                task_response = evaluatee.perform_task(task)
                if task_response:
                    score = evaluator.evaluate(task, task_response)
                    weighted_score = score * evaluator.skill_level
                    evaluatee.evaluations.append(weighted_score)

    # Normalize skill levels
    normalize_skill_levels(llms)

    # Check for convergence
    changes = [llm.update_skill_level() for llm in llms]
    max_change = max(changes)
    converged = max_change < threshold

# Rank the LLMs based on their final skill levels
rank_llms(llms)
