In [35]:
import argilla as rg

client = rg.Argilla(api_key="argilla.apikey", api_url="http://localhost:6900")

In [36]:
from datetime import datetime

dataset = rg.Dataset(
    name=f"triggers_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    settings=rg.Settings(
        fields=[
            rg.TextField("persona"),
            rg.TextField("instruction"),
            rg.TextField("response1"),
            rg.TextField("response2"),
        ],
        questions=[
            rg.LabelQuestion(name="respond", labels=["yes", "no"], required=True),
            rg.TextQuestion(name="improved_instruction", required=False),
            rg.TextQuestion(name="response1_rationale", required=False),
            rg.TextQuestion(name="response2_rationale", required=False),
            rg.RatingQuestion(
                name="response1_rating", values=[1, 2, 3, 4, 5], required=False
            ),
            rg.RatingQuestion(
                name="response2_rating", values=[1, 2, 3, 4, 5], required=False
            ),
        ],
    ),
)

In [37]:
dataset.create()

Dataset(id=UUID('7d528756-d877-4b35-9c29-69acf82a657e') inserted_at=datetime.datetime(2024, 8, 14, 8, 8, 17, 956135) updated_at=datetime.datetime(2024, 8, 14, 8, 8, 18, 178416) name='triggers_20240814100816' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 14, 8, 8, 18, 178416))

In [38]:
from datasets import load_dataset

ds = load_dataset("proj-persona/PersonaHub", "instruction")
MAX_RECORDS = 1000
records_to_upload = []
for sample in ds["train"].to_iterable_dataset():
    record=rg.Record(
                fields={
                    "persona": sample["input persona"],
                    "instruction": sample["synthesized text"],
                    "response1": "",
                    "response2": "",
                },
                id=str(hash(sample["synthesized text"])),
            )
    records_to_upload.append(record)
    if len(records_to_upload) == MAX_RECORDS:
        break
dataset.records.log(records=records_to_upload)

Sending records...: 4batch [00:01,  3.31batch/s]                    


DatasetRecords(Dataset(id=UUID('7d528756-d877-4b35-9c29-69acf82a657e') inserted_at=datetime.datetime(2024, 8, 14, 8, 8, 17, 956135) updated_at=datetime.datetime(2024, 8, 14, 8, 8, 18, 178416) name='triggers_20240814100816' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 14, 8, 8, 18, 178416)))

In [39]:
list(
    dataset.records(
        query=rg.Query(filter=rg.Filter(conditions=[("respond.response", "==", "yes")]))
    )
)

[]

In [31]:
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import TextGeneration

import requests


def update_record_fields(record_id, updated_fields):
    url = f"http://localhost:6900/api/v1/records/{record_id}"
    headers = {
        "accept": "application/json",
        "X-Argilla-Api-Key": "argilla.apikey",
        "Content-Type": "application/json",
    }
    data = {"fields": updated_fields}
    response = requests.patch(url, headers=headers, json=data)
    return response.json()


def delete_response(response_id):
    url = f"http://localhost:6900/api/v1/responses/{response_id}"
    headers = {
        "accept": "application/json",
        "X-Argilla-Api-Key": "argilla.apikey",
        "Content-Type": "application/json",
    }
    response = requests.delete(url, headers=headers)
    return response.json()


def respond_to_record(record: rg.Record):
    llama31 = TextGeneration(
        name="text-generation",
        llm=InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
            tokenizer_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
        ),
    )
    llama31.load()

    gemma_tiny = TextGeneration(
        name="text-generation",
        llm=InferenceEndpointsLLM(
            model_id="google/gemma-1.1-7b-it",
            tokenizer_id="google/gemma-1.1-7b-it",
        ),
    )
    gemma_tiny.load()

    responses = []
    for task in [llama31, gemma_tiny]:
        print(task.name)
        output = list(task.process([{"instruction": record.fields["instruction"]}]))[
            0
        ][0]
        generation = output["generation"]
        responses.append(generation)
    return responses

In [32]:
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import UltraFeedback


def add_feedback_suggestions(record, response_1, response_2) -> None:
    ultrafeedback = UltraFeedback(
        aspect="overall-rating",
        llm=InferenceEndpointsLLM(
            model_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
            tokenizer_id="meta-llama/Meta-Llama-3.1-70B-Instruct",
        ),
    )
    ultrafeedback.load()
    response = ultrafeedback.process(
        [
            {
                "instruction": "trivia questions",
                "generations": [
                    response_1,
                    response_2,
                ],
            }
        ],
    )
    response = list(response)[0][0]
    ratings = response["ratings"]
    rationales = response["rationales"]

    for n, (rating, rationale) in enumerate(zip(ratings, rationales)):
        record.suggestions.add(
            suggestion=rg.Suggestion(
                question_name=f"response{n+1}_rating",
                value=rating,
            )
        )
        record.suggestions.add(
            suggestion=rg.Suggestion(
                question_name=f"response{n+1}_rationale",
                value=rationale,
            )
        )

    for response in record.responses["respond"]:
        response.status = "draft"
    return record

In [41]:
def respond_to_good_instructions() -> None:
    updated_records = []
    for record in dataset.records(
        query=rg.Query(filter=rg.Filter(conditions=[("respond.response", "==", "yes")]))
    ):
        response_1, response_2 = respond_to_record(record=record)
        updated_fields = dict(record.fields)
        updated_fields["response1"] = response_1
        updated_fields["response2"] = response_2
        update_record_fields(
            record_id=record._server_id,
            updated_fields=updated_fields,
        )
        updated_record = add_feedback_suggestions(
            record=record, response_1=response_1, response_2=response_2
        )

        updated_records.append(updated_record)
    dataset.records.log(updated_records)


respond_to_good_instructions()

Step 'text-generation' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.
Step 'text-generation' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, fo

text-generation
text-generation


Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-70B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.


Sending records...: 100%|██████████| 1/1 [00:00<00:00, 14.82batch/s]


In [34]:
def get_dataset_progress(dataset_id):
    url = f"http://localhost:6900/api/v1/datasets/{dataset_id}/progress"
    headers = {
        "accept": "application/json",
        "X-Argilla-Api-Key": "argilla.apikey",
        "Content-Type": "application/json",
    }
    response = requests.get(url, headers=headers)
    return response.json()


_completed = 0

while True:
    dataset_progress = get_dataset_progress(dataset_id=dataset.id)
    completed = dataset_progress["completed"]
    if completed > _completed:
        print(f"Completed {completed} records")
        _completed = completed
        respond_to_good_instructions()
    

Step 'text-generation' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.


Completed 1 records


Step 'text-generation' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/google/gemma-1.1-7b-it` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.


text-generation
text-generation


Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-70B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.
Step 'text-generation' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-8B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those arg

text-generation
text-generation


Step 'None' hasn't received a pipeline, and it hasn't been created within a `Pipeline` context. Please, use `with Pipeline() as pipeline:` and create the step within the context.
Since the `base_url=https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3.1-70B-Instruct` is available and either one of `model_id` or `endpoint_name` is also provided, the `base_url` will either be ignored or overwritten with the one generated from either of those args, for serverless or dedicated inference endpoints, respectively.


Sending records...: 100%|██████████| 1/1 [00:00<00:00, 11.51batch/s]
