In [1]:
from datasets import Dataset
from distilabel.llm import OpenAILLM
from distilabel.pipeline import Pipeline

from dotenv import load_dotenv

# Load environment variables from ../.env (path is relative to the notebook's working directory).
# NOTE(review): presumably this provides the OpenAI credentials used by the LLM cells — confirm.
load_dotenv("../.env")


  from .autonotebook import tqdm as notebook_tqdm


True

# Self Instruct

In [136]:
# Seed prompts for the pipeline. Every sentence follows the same pattern, so the
# list is rendered from (entity, subject, target format) triples.
_USE_CASE_TEMPLATE = "{entity} representation: {subject} is represented as a {target}."
_USE_CASE_SPECS = (
    ("User", "A user profile", "JSON schema"),
    ("User", "A user profile", "Pydantic class"),
    ("Company", "A company profile", "Pydantic class"),
    ("Message", "A message", "Pydantic class"),
    ("Booking", "A booking", "Pydantic class"),
    ("Novel", "A novel", "Pydantic class"),
    ("Author", "An author", "Pydantic class"),
    ("Software", "A software application", "Pydantic class"),
)
use_cases = [
    _USE_CASE_TEMPLATE.format(entity=entity, subject=subject, target=target)
    for entity, subject, target in _USE_CASE_SPECS
]

# Single-column dataset ("input") that is handed to pipeline.generate().
use_case_dataset = Dataset.from_dict(dict(input=use_cases))

# JSON Generation task

In [32]:
import json

from typing import Dict

from distilabel.tasks import TextGenerationTask

In [12]:
# One-prompt variant of the seed data.
# NOTE: this rebinds `use_cases` / `use_case_dataset`, which another cell also defines;
# whichever cell ran last decides what the pipeline receives.
use_cases = ["User representation: A user profile is represented as a JSON schema."]
use_case_dataset = Dataset.from_dict(dict(input=use_cases))

In [98]:
class JsonTask(TextGenerationTask):
    """Text-generation task whose model output must be parseable as JSON.

    The task exposes one output field, ``generations``, containing the model's
    text unchanged once ``json.loads`` has accepted it.
    """

    # Class-level prompt. The cells that instantiate this task pass their own
    # ``system_prompt`` argument.
    # NOTE(review): if TextGenerationTask is a dataclass, an un-annotated class
    # attribute may not replace the parent's default — confirm before relying on it.
    system_prompt = "You are an AI assistant that responds only using JSON structures, no explanation or anything else that isn't a JSON."

    @property
    def output_args_names(self):
        """Names of the fields produced by :meth:`parse_output`."""
        return ["generations"]

    def parse_output(self, output: str) -> Dict[str, str]:
        """Check that ``output`` is valid JSON and return it unparsed.

        Args:
            output: raw text produced by the model.

        Returns:
            A dict with the single key ``"generations"`` mapped to ``output``.

        Raises:
            json.JSONDecodeError: propagated from ``json.loads`` when ``output``
                is not valid JSON.
        """
        # Decode only to validate; the decoded object is intentionally discarded.
        json.loads(output)
        return dict(generations=output)
    
# Generator: a JsonTask with a schema-focused system prompt, served by gpt-3.5-turbo.
# Adjacent string literals are concatenated verbatim, so every fragment but the last
# ends with a space; without it the sentences run together ("...schemas.You are...").
generator_task = JsonTask(
    system_prompt=(
        "You are an expert JSON schema developer, specialising in robust JSON schemas. "
        "You are given a use case for a specific software application entity. "
        "You represent the software application entity as JSON schemas. "
        "You write only JSON schemas and do not introduce the code with prose."
    ),
)
instruction_generator = OpenAILLM(
    task=generator_task,
    num_threads=1,
    max_new_tokens=1024,
    model="gpt-3.5-turbo",
)

In [99]:
# Generator-only run (no labeller): one generation per input, one input per batch.
num_generations = batch_size = 1
pipeline = Pipeline(generator=instruction_generator)
distiset = pipeline.generate(
    dataset=use_case_dataset,
    num_generations=num_generations,
    batch_size=batch_size,
)

Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 637.04 examples/s]


Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 296.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 321.13 examples/s]


In [100]:
# Show the dataset summary (feature names and row count) produced by the generator-only run.
distiset

Dataset({
    features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'generations'],
    num_rows: 1
})

In [101]:
from rich import print
# Get a sample and log it clearly.
# Index the first row directly instead of building a list over every row (and
# loading the whole "generations" column) only to read element 0.
first_row = distiset[0]
prompt_samples = first_row["generation_prompt"][0]
# NOTE: despite its name, this holds the first entry of the "generations" column,
# not of "raw_generation_responses"; the name is kept because a later cell reads it.
raw_generation_responses = first_row["generations"][0]

for p in prompt_samples:
    print(p)
print(raw_generation_responses)

In [102]:
import json

# Decode the sampled generation to confirm it is valid JSON and display the resulting object.
json.loads(raw_generation_responses)

{'$schema': 'http://json-schema.org/draft-07/schema#',
 'title': 'User Profile',
 'type': 'object',
 'properties': {'id': {'type': 'string', 'format': 'uuid'},
  'username': {'type': 'string'},
  'email': {'type': 'string', 'format': 'email'},
  'firstName': {'type': 'string'},
  'lastName': {'type': 'string'},
  'dateOfBirth': {'type': 'string', 'format': 'date'},
  'address': {'type': 'object',
   'properties': {'street': {'type': 'string'},
    'city': {'type': 'string'},
    'state': {'type': 'string'},
    'country': {'type': 'string'},
    'postalCode': {'type': 'string'}},
   'required': ['street', 'city', 'country', 'postalCode']},
  'phoneNumbers': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'type': {'type': 'string',
      'enum': ['home', 'work', 'mobile']},
     'number': {'type': 'string'}},
    'required': ['type', 'number']}},
  'isActive': {'type': 'boolean'},
  'createdAt': {'type': 'string', 'format': 'date-time'},
  'updatedAt': {'type': 'stri

# Use a labeller to judge the quality of the generated JSONs

In [103]:
from textwrap import dedent

from distilabel.tasks.preference.ultrafeedback import Rating, UltraFeedbackTask

# Rubric given to the labelling model; dedent() removes the literal's common indentation.
task_description = dedent(
    """
    # JSON Schema Validity Assessment
    Evaluate the model's json schema based on various criteria:
    1. **Correctness**: Does the output provide accurate and relevant examples within the JSON fields?
    2. **Instruction Following**: Does the JSON align with given instructions and the user's intent?
    3. **Completeness**: Does the JSON schema represent the instruction fully?

    **Scoring**: Rate outputs 1 to 3 based on the overall quality, considering all aspects:
    """
)

# (score, meaning) pairs for the 1-3 scale announced in the rubric.
_rating_scale = (
    (1, "The JSON schema is incomplete and does not represent the instruction."),
    (2, "The JSON schema is complete but field, descriptions, and examples should be improved."),
    (3, "The JSON schema is complete and represents the instruction fully."),
)
ratings = [Rating(value=score, description=meaning) for score, meaning in _rating_scale]

# Preference task that combines the rubric with the rating scale.
ultrafeedback_task = UltraFeedbackTask(
    ratings=ratings,
    task_description=task_description,
    system_prompt="Your role is to evaluate text quality based on given criteria",
)

In [104]:
# Judge model: gpt-4 runs the UltraFeedback task to rate the generated schemas.
labeller = OpenAILLM(
    model="gpt-4",
    task=ultrafeedback_task,
    max_new_tokens=1024,
    num_threads=1,
)

In [105]:
# Rebuild the generator and pair it with the labeller in a single pipeline.
# Adjacent string literals are concatenated verbatim, so every fragment but the last
# ends with a space; without it the sentences run together ("...schemas.You are...").
generator_task = JsonTask(
    system_prompt=(
        "You are an expert JSON schema developer, specialising in robust JSON schemas. "
        "You are given a use case for a specific software application entity. "
        "You represent the software application entity as JSON schemas. "
        "You write only JSON schemas and do not introduce the code with prose."
    ),
)
instruction_generator = OpenAILLM(
    task=generator_task,
    num_threads=1,
    max_new_tokens=1024,
    model="gpt-3.5-turbo",
)

# Generation followed by labelling: the labeller rates what the generator produces.
pipeline = Pipeline(generator=instruction_generator, labeller=labeller)

In [137]:
# Labelled run: ten generations per input, ten inputs per batch.
num_generations = batch_size = 10
distiset = pipeline.generate(
    dataset=use_case_dataset,
    num_generations=num_generations,
    batch_size=batch_size,
)

  return self._generate(


Flattening the indices: 100%|██████████| 1/1 [00:00<00:00, 271.55 examples/s]


In [107]:
# Show the dataset summary; the labelled run adds labelling, rating and rationale columns.
distiset

Dataset({
    features: ['input', 'generation_model', 'generation_prompt', 'raw_generation_responses', 'generations', 'labelling_model', 'labelling_prompt', 'raw_labelling_response', 'rating', 'rationale'],
    num_rows: 1
})

In [110]:
# Print the prompt, generations, raw labeller response and rating columns,
# separated by newlines.
columns_to_show = ("generation_prompt", "generations", "raw_labelling_response", "rating")
print(*(distiset[column] for column in columns_to_show), sep="\n")

# Use agent tools to repair invalid JSON

In [115]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain_experimental.tools import PythonREPLTool

In [116]:
# The agent's only tool is a Python REPL.
# NOTE(review): a REPL tool runs model-written code on this machine — use only in a
# trusted or sandboxed environment; confirm against the LangChain documentation.
tools = [PythonREPLTool()]

In [117]:
from langchain.agents import create_openai_functions_agent
from langchain_openai import ChatOpenAI

In [119]:
# Behavioural instructions for the agent; they fill the `instructions` variable of the
# prompt template below. The text is sent to the model verbatim, whitespace included.
instructions = """You are an agent designed to write and execute python code to answer questions.
You have access to a python REPL, which you can use to execute python code.
If you get an error, debug your code and try again.
Only use the output of your code to answer the question. 
You might know the answer without running any code, but you should still run the code to get the answer.
If it does not seem like you can write code to answer the question, just return "I don't know" as the answer.
"""

# Fetch the prompt template from the LangChain hub, then pre-fill its `instructions` slot.
# NOTE(review): hub.pull presumably needs network access — confirm for offline runs.
base_prompt = hub.pull("langchain-ai/openai-functions-template")
prompt = base_prompt.partial(instructions=instructions)

In [120]:
# Function-calling agent: a chat model at temperature 0, the REPL tool list and the hub prompt.
agent = create_openai_functions_agent(
    llm=ChatOpenAI(temperature=0),
    tools=tools,
    prompt=prompt,
)

In [121]:
# Executor that drives the agent; verbose=True prints the chain trace when it is invoked.
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
)

In [135]:
# Deliberately malformed schema: the second property has lost its name and opening
# brace, which leaves one closing brace too many at the end of the document.
bad_schema = """{
    "type": "object",
    "properties": {
        "name": {
            "type": "string"
        },
            "type": "number"
        }
    }
}
"""

# Show the parser's complaint. Only the decoding error is caught, so unrelated
# failures (e.g. `json` not imported in a fresh kernel) surface instead of being
# printed as if they were JSON problems.
try:
    json.loads(bad_schema)
except json.JSONDecodeError as decode_error:
    print(decode_error)

# Ask the agent to repair the malformed schema. The extra "output" and "code" keys are
# sent along with the request; "code" comes back unchanged in the result.
agent_executor.invoke(
    {
        "input": f"Repair this json object: {bad_schema}",
        "output": None,
        "code": None,
    }
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe JSON object you provided is not valid. The "type" property is duplicated within the "properties" object. To fix it, you can modify the JSON object as follows:

```json
{
    "type": "object",
    "properties": {
        "name": {
            "type": "string"
        },
        "age": {
            "type": "number"
        }
    }
}
```

In this fixed version, I assumed that you wanted to add an "age" property of type "number" to the "properties" object.[0m

[1m> Finished chain.[0m


{'input': 'Repair this json object: {\n    "type": "object",\n    "properties": {\n        "name": {\n            "type": "string"\n        },\n            "type": "number"\n        }\n    }\n}\n',
 'output': 'The JSON object you provided is not valid. The "type" property is duplicated within the "properties" object. To fix it, you can modify the JSON object as follows:\n\n```json\n{\n    "type": "object",\n    "properties": {\n        "name": {\n            "type": "string"\n        },\n        "age": {\n            "type": "number"\n        }\n    }\n}\n```\n\nIn this fixed version, I assumed that you wanted to add an "age" property of type "number" to the "properties" object.',
 'code': None}

In [134]:
# Sanity check: the repaired schema proposed by the agent decodes as valid JSON.
json.loads("""{\n    "type": "object",\n    "properties": {\n        "name": {\n            "type": "string"\n        },\n        "age": {\n            "type": "number"\n        }\n    }\n}""")

{'type': 'object',
 'properties': {'name': {'type': 'string'}, 'age': {'type': 'number'}}}