<a href="https://colab.research.google.com/github/barbaroja2000/rubric/blob/main/Rubric_Mixtral_8x22bn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluating Mixtral 8x22b with GPT-4

Uses:

*   LangSmith for custom rubric datasets & evaluation framework
*   Fireworks API
*   GPT-4 to evaluate LLM output

---

Test:

*  Provides a quality assessment (0-5) and a rationale for the score
* The interactive table can be used to check the model output against the GPT-4 assessment
* A possible improvement here is to be able to manually update the score returned from the model

---

Model Mixtral 8x22b:

---

Refs:
* https://fireworks.ai/models/fireworks/mixtral-8x22b
* https://python.langchain.com/docs/langsmith/
* https://www.reddit.com/r/LocalLLaMA/comments/1c0tdsb/mixtral_8x22b_benchmarks_awesome_performance/

---

Requires:
*   Test dataset in LangSmith. This notebook shows how to set it up: https://github.com/barbaroja2000/rubric/blob/main/Langsmith_Rubric_Dataset_Creator.ipynb
*   Fireworks AI API key
*   LangChain (LangSmith) API key
*   OpenAI API key


In [None]:
!pip install -q langchain langsmith openai tiktoken

In [None]:
from google.colab import userdata
import os
import requests

# Point the LangChain/LangSmith tooling at the hosted LangSmith API.
os.environ['LANGCHAIN_ENDPOINT']= "https://api.smith.langchain.com"
# API keys are read from Colab's secret store (`userdata`) rather than hard-coded in the notebook.
os.environ['LANGCHAIN_API_KEY']= userdata.get('langchain_api_key')
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

# Kept as a plain variable (not an environment variable); used for the Fireworks Authorization header.
fireworks_api_key = userdata.get("fireworks_api_key")


In [None]:
# Run configuration: everything a reader might want to tune lives in this cell.
# Base name of the LangSmith project; a random suffix is appended per run.
project_name  = "Mixtral AI -8x22b"
# Model under test and its sampling temperature (recorded as project metadata).
rubric_model = "mixtral-8x22b"
rubric_temperature=0
# Model (and temperature) used to grade the answers.
evaluator_model="gpt-4-1106-preview"
evaluator_temperature=0
dataset_name = "General Rubric: Extended" #Use quick to test the runner

In [None]:
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, ChatPromptTemplate

# System-role prompt text describing the quality expected of an answer.
# NOTE(review): nothing else in the visible notebook references `system_template`
# (the Fireworks request below sends only a user message) — confirm whether it
# was meant to be sent as a system message.
system_template = """
Please provide a detailed, accurate, and contextually relevant response to the following query.
Your answer should demonstrate a clear understanding of the subject matter, considering any logical,
ethical, historical, or cultural aspects involved. Ensure the response is tailored to the specific
requirements of the question, whether it involves analysis, explanation, creativity, or problem-solving.
"""

In [None]:
from langsmith import Client
# LangSmith client shared by the evaluation run and the dataset query further down.
# Presumably configured from the LANGCHAIN_* environment variables set earlier — TODO confirm.
client = Client()

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableMap, RunnablePassthrough
import json

def create_runnable():
  """Build the runnable that is handed to `run_on_dataset` as the chain factory.

  The returned chain wraps whatever it receives under the key ``"input"`` and
  pipes that mapping into `get_fireworks`, which sends the example's
  ``"question"`` to the Fireworks chat-completions endpoint.

  Returns:
    The composed runnable ``RunnableMap({"input": RunnablePassthrough()}) | get_fireworks``.
  """

  def get_fireworks(inputs):
    """Ask the Fireworks-hosted Mixtral model one question and return its answer text.

    Args:
      inputs: Mapping produced by the RunnableMap above; the question is read
        from ``inputs["input"]["question"]``.

    Returns:
      The ``content`` of the first choice's message in the API response.

    Raises:
      requests.HTTPError: If the API answers with a 4xx/5xx status.
      requests.Timeout: If the API does not answer within the timeout.
    """
    url = "https://api.fireworks.ai/inference/v1/chat/completions"
    # Generous limit so a long completion can finish, while a hung
    # connection cannot stall the whole evaluation run indefinitely.
    request_timeout_s = 300

    payload = {
      "model": "accounts/fireworks/models/mixtral-8x22b-instruct-preview",
      "max_tokens": 4096,
      # Use the notebook-level setting so the value recorded in the project
      # metadata is the one actually sent to the model.
      "temperature": rubric_temperature,
      "messages": [
        {
          "role": "user",
          "content": inputs["input"]["question"]
        }
      ]
    }
    headers = {
      "Accept": "application/json",
      "Content-Type": "application/json",
      "Authorization": f"Bearer {fireworks_api_key}"
    }
    response = requests.post(url, headers=headers, json=payload, timeout=request_timeout_s)

    # Surface API failures (bad key, rate limit, ...) as an HTTP error instead
    # of an opaque KeyError on "choices" further down.
    response.raise_for_status()

    response_json = response.json()
    return response_json["choices"][0]["message"]["content"]


  return RunnableMap({"input": RunnablePassthrough()}) | get_fireworks

In [None]:
import re
from typing import Any, Optional

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import StringEvaluator

class QualityEvaluator(StringEvaluator):
    """LLM-based evaluator that grades an answer's quality from 0 to 5.

    The evaluator model is prompted with the question and the answer, asked to
    reason about the score, and to repeat the score alone on the final line.
    That final line is parsed into the numeric score; the full response is
    returned as the reasoning.
    """

    def __init__(self):
        """Create the grading chain from the notebook-level evaluator settings."""
        llm = ChatOpenAI(model=evaluator_model, temperature=evaluator_temperature)

        template = """ Evaluate the answer provided from 0 to 5, with 5 being the best quality answer, and 0 where the answer is false or not provided
        --------
        QUESTION: {input}
        --------
        ANSWER: {prediction}
        --------
        Consisely reason step by step about why the given score is appropriate, then print the score at the end. At the end, repeat that score alone on a new line."""

        self.eval_chain = LLMChain.from_string(llm=llm, template=template)

    @property
    def requires_input(self) -> bool:
        """The question is needed to judge the answer."""
        return True

    @property
    def requires_reference(self) -> bool:
        """No reference answer is used."""
        return False

    @property
    def evaluation_name(self) -> str:
        """Feedback key under which the score is reported."""
        return "quality"

    @staticmethod
    def _parse_score(text: str) -> Optional[float]:
        """Extract the score from the last line of the evaluator's response.

        A bare number (the format the prompt asks for) is parsed directly.
        If the model decorates the final line (e.g. ``Score: 4``, ``4/5`` or
        ``**4**``), the first number on that line is used instead.

        Returns:
            The score as a float, or None when the last line holds no number.
        """
        last_line = text.strip().split("\n")[-1].strip()
        try:
            return float(last_line)
        except ValueError:
            # Take the first number so "4/5" reads as 4, not 5.
            match = re.search(r"\d+(?:\.\d+)?", last_line)
            return float(match.group()) if match else None

    def _evaluate_strings(
        self,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        **kwargs: Any
    ) -> dict:
        """Grade `prediction` as an answer to `input`.

        Args:
            prediction: The answer to grade.
            input: The question the answer responds to.
            reference: Unused; this evaluator needs no reference answer.
            **kwargs: Forwarded to the evaluation chain call.

        Returns:
            ``{"score": float | None, "reasoning": str}``, where the score is
            None if no number could be read from the evaluator's final line.
        """
        evaluator_result = self.eval_chain(
            dict(input=input, prediction=prediction), **kwargs
        )
        reasoning = evaluator_result["text"].strip()
        return {"score": self._parse_score(reasoning), "reasoning": reasoning}

In [None]:
import random
import string

def rando():
  """Return a random string of 4 ASCII letters (appended to the project name per run)."""
  picked_letters = random.choices(string.ascii_letters, k=4)
  return ''.join(picked_letters)

In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset

# Grade every example with the custom GPT-4-backed quality evaluator;
# "question" is the dataset input field used as the evaluator's input.
evaluation_config = RunEvalConfig(
    input_key="question",
    custom_evaluators = [QualityEvaluator()]
)

# Metadata recorded on the test project. Keys are fixed string labels: using
# the variables themselves as keys would store their values as keys, and the
# two temperature entries (both 0) would collapse into a single item.
project_metadata = {
  "rubric_model": rubric_model,
  "rubric_temperature": rubric_temperature,
  "evaluator_model": evaluator_model,
  "evaluator_temperature": evaluator_temperature,
}

# A random suffix is appended to the project name on every run.
run = run_on_dataset(
    client=client,
    project_metadata = project_metadata,
    project_name = f"{project_name}-{rando()}",
    dataset_name=dataset_name,
    llm_or_chain_factory=create_runnable,
    evaluation=evaluation_config,
    concurrency_level=1, #This will bork if not provided or set to anything other than 1
)

  warn_deprecated(


View the evaluation results for project 'Mistal AI -8x22b-szoo' at:
https://smith.langchain.com/o/8bebdbff-9433-40f6-b7e1-f013521a8100/datasets/dfedd53c-204a-4b69-9f64-51a61568a7fd/compare?selectedSessions=4e356e49-9575-4f66-a3f8-642f7d726835

View all tests for Dataset General Rubric: Extended at:
https://smith.langchain.com/o/8bebdbff-9433-40f6-b7e1-f013521a8100/datasets/dfedd53c-204a-4b69-9f64-51a61568a7fd
[>                                                 ] 0/123

  warn_deprecated(


[------------------------------------------------->] 123/123

In [None]:
# Flatten the run results into one record per 'quality' feedback item.
extracted_data = []
for value in run['results'].values():  # the dictionary keys are not used
    # .get yields None when a result carries no 'output' entry.
    output = value.get("output")
    question = value['input']['question']
    # A result without a 'feedback' entry is treated as having no feedback
    # rather than aborting the whole extraction with a KeyError.
    for feedback_item in value.get('feedback', []):
        if feedback_item.key == 'quality':
            extracted_data.append({
                'question': question,
                'output': output,
                'quality': feedback_item.score,
                'feedback': feedback_item.comment,
            })

In [None]:
import pandas as pd
from google.colab import data_table
# One row per 'quality' feedback record collected in `extracted_data`.
df = pd.DataFrame(extracted_data)

# Last expression of the cell: show the frame as a Colab interactive table,
# 10 rows per page, without the index column.
data_table.DataTable(df, include_index=False, num_rows_per_page=10)

Unnamed: 0,question,output,quality,feedback
0,"I WAS born in the year 1632, in the city of Yo...","I was born in York in 1632, and my father, a f...",3.0,The provided answer is a detailed summary of t...
1,"I WAS born in the year 1632, in the city of Yo...","- Born in 1632 in York, England\n- Father was ...",5.0,The provided answer is a high-quality summary ...
2,"Y curiosity, in a sense, was stronger than my ...",The main themes in this text are:\n\n1. Curios...,4.0,The provided answer does a good job of summari...
3,Explain the role of a central bank in an econo...,A central bank is like the head of a country's...,4.0,The provided answer gives a simplified and und...
4,Describe the basics of how the stock market wo...,The stock market is a place where people buy a...,5.0,The answer provided is a clear and concise exp...
...,...,...,...,...
118,Analyze the emotional content of this diary en...,The emotional content of this diary entry appe...,5.0,The answer provided is assessing the emotional...
119,What is the emotional tone of this speech exce...,The emotional tone of this speech excerpt is d...,5.0,Evaluation:\n\n1. Acknowledgment of Challenges...
120,Determine the emotional tone of this message: ...,The emotional tone of this message is negative...,5.0,Evaluation of the Answer:\n\n1. The message cl...
121,Analyze the emotion in this customer review: '...,The emotion in this customer review is overwhe...,5.0,The answer provided is an accurate analysis of...


In [None]:
# Overall mean of the 'quality' column, coerced to numeric first.
pd.to_numeric(df.quality).mean()

4.441463414634146

In [None]:
# Fetch the examples of the evaluated dataset via the LangSmith client, materialised as a list.
examples = list(client.list_examples(dataset_name=dataset_name))

In [None]:
# Question -> category lookup table built from each example's inputs
# (every example is expected to carry both a 'question' and a 'category' input).
list_df = pd.DataFrame([{'question': item.inputs['question'], 'category': item.inputs['category']} for item in examples])

In [None]:
# Attach each result's category by joining on the question text.
# NOTE(review): if the same question text appears in more than one example,
# this join multiplies rows and skews the per-category means — confirm the
# questions are unique.
merged_df = pd.merge(df, list_df, on='question')

In [None]:
# Mean 'quality' score per category (a Series indexed by category).
avg_scores_per_category = merged_df.groupby('category')['quality'].mean()

print(avg_scores_per_category)

category
Categorization                           4.818182
Coding                                   4.346154
Creative Writing                         4.441667
Cultural and Contextual Understanding    4.800000
Emotion Analysis                         4.800000
Ethics Analysis                          4.111111
Fact Analysis                            4.833333
Logical Reasoning                        3.454545
Reading Comprehension                    4.555556
Reframing                                4.125000
Safety and Security                      5.000000
Summarization                            4.000000
Name: quality, dtype: float64
