In [1]:
!pip -qqq install pip --progress-bar off
!pip -qqq install groq==0.9.0 --progress-bar off
!pip -qqq install pydantic==2.7.4 --progress-bar off
!pip -qqq install langchain-core==0.2.7 --progress-bar off

In [9]:
import json
import os
from enum import Enum
from typing import Generic, Type, TypeVar

import groq
from google.colab import userdata
from groq import Groq
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel
from IPython.display import Markdown

os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")
MODEL = "llama3-70b-8192"

In [3]:
tweet = """
How do you choose which LLM to use?

A vibe check ain't going to cut it.

I'm trying DeepEval (@jeffr_yyy)

- Wide range of metrics: Relevancy, Hallucination & more
- Bulk & real-time evaluation
- CI/CD integration
- Benchmarking on popular datasets
"""

In [4]:
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))


class ResponseFormat(Enum):
    JSON = "json"
    TEXT = "text"


def predict(
    prompt: str,
    system_prompt: str = None,
    response_format: ResponseFormat = ResponseFormat.TEXT,
    model: str = MODEL,
    client: Groq = client,
):
    messages = [
        {
            "role": "user",
            "content": prompt,
        }
    ]
    if system_prompt:
        messages.insert(
            0,
            {
                "role": "system",
                "content": system_prompt,
            },
        )
    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model=model,
            temperature=0.00001,
            response_format={
                "type": "json_object"
                if response_format == ResponseFormat.JSON
                else "text"
            },
        )
        return chat_completion.choices[0].message.content
    except groq.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)
    except groq.RateLimitError as e:
        print("A 429 status code was received; we should back off a bit.")
    except groq.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.message)
        print(e.response)

## JSON Output Support

In [5]:
system_prompt = """
You're evaluating writing style in text.
Your evalutions must always be in JSON format. Here is an example JSON response:

```
{
  'readability': 4,
  'conciseness': 2
}
```
"""

print(system_prompt)


You're evaluating writing style in text.
Your evalutions must always be in JSON format. Here is an example JSON response:

```
{
  'readability': 4,
  'conciseness': 2
}
```



In [6]:
Markdown(system_prompt)


You're evaluating writing style in text.
Your evalutions must always be in JSON format. Here is an example JSON response:

```
{
  'readability': 4,
  'conciseness': 2
}
```


In [7]:
prompt = f"""
Evaluate the text:

```
{tweet}
```

You're evaluating the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)
"""
print(prompt)


Evaluate the text:

```

How do you choose which LLM to use?

A vibe check ain't going to cut it.

I'm trying DeepEval (@jeffr_yyy)

- Wide range of metrics: Relevancy, Hallucination & more
- Bulk & real-time evaluation
- CI/CD integration
- Benchmarking on popular datasets

```

You're evaluating the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)



In [8]:
Markdown(prompt)


Evaluate the text:

```

How do you choose which LLM to use?

A vibe check ain't going to cut it.

I'm trying DeepEval (@jeffr_yyy)

- Wide range of metrics: Relevancy, Hallucination & more
- Bulk & real-time evaluation
- CI/CD integration
- Benchmarking on popular datasets

```

You're evaluating the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)


In [10]:
response = predict(prompt, system_prompt, response_format=ResponseFormat.JSON)

In [11]:
print(response)

{
"readability": 8,
"conciseness": 6
}


In [12]:
Markdown(response)

{
"readability": 8,
"conciseness": 6
}

## No JSON Output Support

In [13]:
class WritingScore(BaseModel):
    readability: int
    conciseness: int

In [14]:
schema = {k: v for k, v in WritingScore.schema().items()}
schema = {"properties": schema["properties"], "required": schema["required"]}
print(json.dumps(schema, indent=2))

{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conciseness",
      "type": "integer"
    }
  },
  "required": [
    "readability",
    "conciseness"
  ]
}


In [15]:
Markdown(json.dumps(schema, indent=2))

{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conciseness",
      "type": "integer"
    }
  },
  "required": [
    "readability",
    "conciseness"
  ]
}

In [16]:
OUTPUT_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {{"properties": {{"foo": {{"title": "Foo", "description": "a list of strings", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}}
the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of the schema. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted.

Here is the output schema:

```
{schema}
```

Do not return any preamble or explanations, return only a pure JSON string surrounded by triple backticks (```)."""

In [17]:
json_instruction = OUTPUT_FORMAT_INSTRUCTIONS.format(
    schema=json.dumps(schema, indent=2)
)
print(json_instruction)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:

```
{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conciseness",
      "type": "integer"
    }
  },
  "required": [
    "readability",
    "conciseness"
  ]
}
```

Do not return any preamble or explanations, return only a pure JSON string surrounded by triple backticks (```).


In [18]:
Markdown(json_instruction)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:

```
{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conciseness",
      "type": "integer"
    }
  },
  "required": [
    "readability",
    "conciseness"
  ]
}
```

Do not return any preamble or explanations, return only a pure JSON string surrounded by triple backticks (```).

In [19]:
prompt = f"""
Evaluate the writing style from the text:

```
{tweet}
```

Evaluate the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)

{json_instruction}
"""
print(prompt)


Evaluate the writing style from the text:

```

How do you choose which LLM to use?

A vibe check ain't going to cut it.

I'm trying DeepEval (@jeffr_yyy)

- Wide range of metrics: Relevancy, Hallucination & more
- Bulk & real-time evaluation
- CI/CD integration
- Benchmarking on popular datasets

```

Evaluate the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:

```
{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conc

In [20]:
Markdown(prompt)


Evaluate the writing style from the text:

```

How do you choose which LLM to use?

A vibe check ain't going to cut it.

I'm trying DeepEval (@jeffr_yyy)

- Wide range of metrics: Relevancy, Hallucination & more
- Bulk & real-time evaluation
- CI/CD integration
- Benchmarking on popular datasets

```

Evaluate the readability and conciseness with values from 0 (extremely bad) to 10 (extremely good)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:

```
{
  "properties": {
    "readability": {
      "title": "Readability",
      "type": "integer"
    },
    "conciseness": {
      "title": "Conciseness",
      "type": "integer"
    }
  },
  "required": [
    "readability",
    "conciseness"
  ]
}
```

Do not return any preamble or explanations, return only a pure JSON string surrounded by triple backticks (```).


In [21]:
response = predict(prompt, MODEL)
print(response)

```
{
  "readability": 8,
  "conciseness": 9
}
```


In [22]:
Markdown(response)

```
{
  "readability": 8,
  "conciseness": 9
}
```

## From Scratch

In [23]:
response_json = json.loads(response.strip("```"))
WritingScore.parse_obj(response_json)

WritingScore(readability=8, conciseness=9)

In [24]:
TBaseModel = TypeVar("TBaseModel", bound=BaseModel)


class JsonOutputParser(Generic[TBaseModel]):
    def __init__(self, pydantic_object: Type[TBaseModel]):
        self.pydantic_object = pydantic_object

    def parse(self, response: str):
        response_json = json.loads(response.strip("```"))
        return self.pydantic_object.parse_obj(response_json)

In [25]:
JsonOutputParser(pydantic_object=WritingScore).parse(response)

WritingScore(readability=8, conciseness=9)

## Using LangChain Parser

In [26]:
parser = PydanticOutputParser(pydantic_object=WritingScore)

In [27]:
parser.parse(response)

WritingScore(readability=8, conciseness=9)